From: Jeremy Faller Date: Tue, 17 Mar 2020 14:24:40 +0000 (-0400) Subject: [dev.link] cmd/link: parallelize asmb on amd64 X-Git-Tag: go1.15beta1~679^2~17 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=c46632a2e0c61786900c2c324989aa90e8821aea;p=gostls13.git [dev.link] cmd/link: parallelize asmb on amd64 Introduces a parallel OutBuf implementation, and POC on amd64. Due to some of the weird behaviors I saw on MacOS (SIGBUS while calling msync), I will wait for feedback to port to other architectures. On my mac, sped up Asmb by ~78% for cmd/compile (below). Will likely have an appreciable speedup on kubelet benchmark. Asmb 39.1ms ±11% 8.5ms ±10% -78.17% (p=0.000 n=10+9) TotalTime 596ms ± 2% 577ms ± 8% -3.07% (p=0.034 n=8+10) Change-Id: Id2a2577c3f4da155d8dccc862897f43b941877ac Reviewed-on: https://go-review.googlesource.com/c/go/+/223742 Run-TryBot: Jeremy Faller Reviewed-by: Cherry Zhang Reviewed-by: Than McIntosh --- diff --git a/src/cmd/link/internal/amd64/asm.go b/src/cmd/link/internal/amd64/asm.go index 7b925856cc..40ecd2572e 100644 --- a/src/cmd/link/internal/amd64/asm.go +++ b/src/cmd/link/internal/amd64/asm.go @@ -38,6 +38,7 @@ import ( "cmd/link/internal/sym" "debug/elf" "log" + "sync" ) func PADDR(x uint32) uint32 { @@ -693,29 +694,32 @@ func asmb(ctxt *ld.Link) { ld.Asmbelfsetup() } + var wg sync.WaitGroup sect := ld.Segtext.Sections[0] - ctxt.Out.SeekSet(int64(sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff)) - // 0xCC is INT $3 - breakpoint instruction - ld.CodeblkPad(ctxt, int64(sect.Vaddr), int64(sect.Length), []byte{0xCC}) - for _, sect = range ld.Segtext.Sections[1:] { - ctxt.Out.SeekSet(int64(sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff)) - ld.Datblk(ctxt, int64(sect.Vaddr), int64(sect.Length)) + offset := sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff + f := func(ctxt *ld.Link, out *ld.OutBuf, start, length int64) { + // 0xCC is INT $3 - breakpoint instruction + ld.CodeblkPad(ctxt, out, start, length, []byte{0xCC}) + } + ld.WriteParallel(&wg, f, ctxt, offset, sect.Vaddr, sect.Length) + + for _, sect := range ld.Segtext.Sections[1:] { + offset := sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff + ld.WriteParallel(&wg, ld.Datblk2, ctxt, offset, sect.Vaddr, sect.Length) } if ld.Segrodata.Filelen > 0 { - ctxt.Out.SeekSet(int64(ld.Segrodata.Fileoff)) - ld.Datblk(ctxt, int64(ld.Segrodata.Vaddr), int64(ld.Segrodata.Filelen)) + ld.WriteParallel(&wg, ld.Datblk2, ctxt, ld.Segrodata.Fileoff, ld.Segrodata.Vaddr, ld.Segrodata.Filelen) } if ld.Segrelrodata.Filelen > 0 { - ctxt.Out.SeekSet(int64(ld.Segrelrodata.Fileoff)) - ld.Datblk(ctxt, int64(ld.Segrelrodata.Vaddr), int64(ld.Segrelrodata.Filelen)) + ld.WriteParallel(&wg, ld.Datblk2, ctxt, ld.Segrelrodata.Fileoff, ld.Segrelrodata.Vaddr, ld.Segrelrodata.Filelen) } - ctxt.Out.SeekSet(int64(ld.Segdata.Fileoff)) - ld.Datblk(ctxt, int64(ld.Segdata.Vaddr), int64(ld.Segdata.Filelen)) + ld.WriteParallel(&wg, ld.Datblk2, ctxt, ld.Segdata.Fileoff, ld.Segdata.Vaddr, ld.Segdata.Filelen) + + ld.WriteParallel(&wg, ld.Dwarfblk2, ctxt, ld.Segdwarf.Fileoff, ld.Segdwarf.Vaddr, ld.Segdwarf.Filelen) - ctxt.Out.SeekSet(int64(ld.Segdwarf.Fileoff)) - ld.Dwarfblk(ctxt, int64(ld.Segdwarf.Vaddr), int64(ld.Segdwarf.Filelen)) + wg.Wait() } func asmb2(ctxt *ld.Link) { diff --git a/src/cmd/link/internal/ld/data.go b/src/cmd/link/internal/ld/data.go index 4174a706cd..7bdeb1f68b 100644 --- a/src/cmd/link/internal/ld/data.go +++ b/src/cmd/link/internal/ld/data.go @@ -730,14 +730,15 @@ func dynreloc(ctxt *Link, data *[sym.SXREF][]*sym.Symbol) { } func Codeblk(ctxt *Link, addr int64, size int64) { - CodeblkPad(ctxt, addr, size, zeros[:]) + CodeblkPad(ctxt, ctxt.Out, addr, size, zeros[:]) } -func CodeblkPad(ctxt *Link, addr int64, size int64, pad []byte) { + +func CodeblkPad(ctxt *Link, out *OutBuf, addr int64, size int64, pad []byte) { if *flagA { ctxt.Logf("codeblk [%#x,%#x) at offset %#x\n", addr, addr+size, ctxt.Out.Offset()) } - blk(ctxt.Out, ctxt.Textp, addr, size, pad) + writeBlocks(out, ctxt.outSem, ctxt.Textp, addr, size, pad) /* again for printing */ if !*flagA { @@ -795,9 +796,75 @@ func CodeblkPad(ctxt *Link, addr int64, size int64, pad []byte) { } } -func blk(out *OutBuf, syms []*sym.Symbol, addr, size int64, pad []byte) { +const blockSize = 1 << 20 // 1MB chunks written at a time. + +// writeBlocks writes a specified chunk of symbols to the output buffer. It +// breaks the write up into ≥blockSize chunks to write them out, and schedules +// as many goroutines as necessary to accomplish this task. This call then +// blocks, waiting on the writes to complete. Note that we use the sem parameter +// to limit the number of concurrent writes taking place. +func writeBlocks(out *OutBuf, sem chan int, syms []*sym.Symbol, addr, size int64, pad []byte) { + for i, s := range syms { + if s.Value >= addr && !s.Attr.SubSymbol() { + syms = syms[i:] + break + } + } + + var wg sync.WaitGroup + max, lastAddr, written := int64(blockSize), addr+size, int64(0) + for addr < lastAddr { + // Find the last symbol we'd write. + idx := -1 + length := int64(0) + for i, s := range syms { + // If the next symbol's size would put us out of bounds on the total length, + // stop looking. + if s.Value+s.Size > lastAddr { + break + } + + // We're gonna write this symbol. + idx = i + length = s.Value + s.Size - addr + + // If we cross over the max size, we've got enough symbols. + if s.Value+s.Size > addr+max { + break + } + } + + // If we didn't find any symbols to write, we're done here. + if idx < 0 { + break + } + + // Start the block output operator. + if o, err := out.View(uint64(out.Offset() + written)); err == nil { + sem <- 1 + wg.Add(1) + go func(o *OutBuf, syms []*sym.Symbol, addr, size int64, pad []byte) { + writeBlock(o, syms, addr, size, pad) + wg.Done() + <-sem + }(o, syms, addr, length, pad) + } else { // output not mmaped, don't parallelize. + writeBlock(out, syms, addr, length, pad) + } + + // Prepare for the next loop. + if idx != -1 { + syms = syms[idx+1:] + } + written += length + addr += length + } + wg.Wait() +} + +func writeBlock(out *OutBuf, syms []*sym.Symbol, addr, size int64, pad []byte) { for i, s := range syms { - if !s.Attr.SubSymbol() && s.Value >= addr { + if s.Value >= addr && !s.Attr.SubSymbol() { syms = syms[i:] break } @@ -841,13 +908,32 @@ func blk(out *OutBuf, syms []*sym.Symbol, addr, size int64, pad []byte) { if addr < eaddr { out.WriteStringPad("", int(eaddr-addr), pad) } - out.Flush() +} + +type writeFn func(*Link, *OutBuf, int64, int64) + +// WriteParallel handles scheduling parallel execution of data write functions. +func WriteParallel(wg *sync.WaitGroup, fn writeFn, ctxt *Link, seek, vaddr, length uint64) { + if out, err := ctxt.Out.View(seek); err != nil { + ctxt.Out.SeekSet(int64(seek)) + fn(ctxt, ctxt.Out, int64(vaddr), int64(length)) + } else { + wg.Add(1) + go func() { + defer wg.Done() + fn(ctxt, out, int64(vaddr), int64(length)) + }() + } } func Datblk(ctxt *Link, addr int64, size int64) { writeDatblkToOutBuf(ctxt, ctxt.Out, addr, size) } +func Datblk2(ctxt *Link, out *OutBuf, addr, size int64) { + writeDatblkToOutBuf(ctxt, out, addr, size) +} + // Used only on Wasm for now. func DatblkBytes(ctxt *Link, addr int64, size int64) []byte { buf := bytes.NewBuffer(make([]byte, 0, size)) @@ -862,7 +948,7 @@ func writeDatblkToOutBuf(ctxt *Link, out *OutBuf, addr int64, size int64) { ctxt.Logf("datblk [%#x,%#x) at offset %#x\n", addr, addr+size, ctxt.Out.Offset()) } - blk(out, ctxt.datap, addr, size, zeros[:]) + writeBlocks(out, ctxt.outSem, ctxt.datap, addr, size, zeros[:]) /* again for printing */ if !*flagA { @@ -931,12 +1017,20 @@ func writeDatblkToOutBuf(ctxt *Link, out *OutBuf, addr int64, size int64) { ctxt.Logf("\t%.8x|\n", uint(eaddr)) } +func Dwarfblk2(ctxt *Link, out *OutBuf, addr int64, size int64) { + if *flagA { + ctxt.Logf("dwarfblk [%#x,%#x) at offset %#x\n", addr, addr+size, ctxt.Out.Offset()) + } + + writeBlocks(out, ctxt.outSem, dwarfp, addr, size, zeros[:]) +} + func Dwarfblk(ctxt *Link, addr int64, size int64) { if *flagA { ctxt.Logf("dwarfblk [%#x,%#x) at offset %#x\n", addr, addr+size, ctxt.Out.Offset()) } - blk(ctxt.Out, dwarfp, addr, size, zeros[:]) + writeBlock(ctxt.Out, dwarfp, addr, size, zeros[:]) } var zeros [512]byte diff --git a/src/cmd/link/internal/ld/lib.go b/src/cmd/link/internal/ld/lib.go index 692cf4fae1..0d6cdab9fe 100644 --- a/src/cmd/link/internal/ld/lib.go +++ b/src/cmd/link/internal/ld/lib.go @@ -31,7 +31,6 @@ package ld import ( - "bufio" "bytes" "cmd/internal/bio" "cmd/internal/obj" @@ -385,14 +384,11 @@ func libinit(ctxt *Link) { Lflag(ctxt, filepath.Join(objabi.GOROOT, "pkg", fmt.Sprintf("%s_%s%s%s", objabi.GOOS, objabi.GOARCH, suffixsep, suffix))) mayberemoveoutfile() - f, err := os.OpenFile(*flagOutfile, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0775) - if err != nil { + + if err := ctxt.Out.Open(*flagOutfile); err != nil { Exitf("cannot create %s: %v", *flagOutfile, err) } - ctxt.Out.w = bufio.NewWriter(f) - ctxt.Out.f = f - if *flagEntrySymbol == "" { switch ctxt.BuildMode { case BuildModeCShared, BuildModeCArchive: @@ -1203,25 +1199,21 @@ func hostlinksetup(ctxt *Link) { *flagTmpdir = dir ownTmpDir = true AtExit(func() { - ctxt.Out.f.Close() + ctxt.Out.Close() os.RemoveAll(*flagTmpdir) }) } // change our output to temporary object file - ctxt.Out.f.Close() + if err := ctxt.Out.Close(); err != nil { + Exitf("error closing output file") + } mayberemoveoutfile() p := filepath.Join(*flagTmpdir, "go.o") - var err error - f, err := os.OpenFile(p, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0775) - if err != nil { + if err := ctxt.Out.Open(p); err != nil { Exitf("cannot create %s: %v", p, err) } - - ctxt.Out.w = bufio.NewWriter(f) - ctxt.Out.f = f - ctxt.Out.off = 0 } // hostobjCopy creates a copy of the object files in hostobj in a @@ -1305,11 +1297,9 @@ func (ctxt *Link) archive() { // Force the buffer to flush here so that external // tools will see a complete file. - ctxt.Out.Flush() - if err := ctxt.Out.f.Close(); err != nil { - Exitf("close: %v", err) + if err := ctxt.Out.Close(); err != nil { + Exitf("error closing %v", *flagOutfile) } - ctxt.Out.f = nil argv := []string{*flagExtar, "-q", "-c", "-s"} if ctxt.HeadType == objabi.Haix { diff --git a/src/cmd/link/internal/ld/link.go b/src/cmd/link/internal/ld/link.go index b32b7c892d..24866d8e8c 100644 --- a/src/cmd/link/internal/ld/link.go +++ b/src/cmd/link/internal/ld/link.go @@ -53,7 +53,9 @@ type Link struct { Target ErrorReporter ArchSyms - Out *OutBuf + + outSem chan int // limits the number of output writers + Out *OutBuf Syms *sym.Symbols diff --git a/src/cmd/link/internal/ld/outbuf.go b/src/cmd/link/internal/ld/outbuf.go index f8e65ef8ae..8e083477ec 100644 --- a/src/cmd/link/internal/ld/outbuf.go +++ b/src/cmd/link/internal/ld/outbuf.go @@ -9,6 +9,7 @@ import ( "cmd/internal/sys" "cmd/link/internal/sym" "encoding/binary" + "errors" "log" "os" ) @@ -23,19 +24,100 @@ import ( // Second, it provides a very cheap offset counter that doesn't require // any system calls to read the value. // -// It also mmaps the output file (if available). The intended usage is: +// Third, it also mmaps the output file (if available). The intended usage is: // - Mmap the output file // - Write the content // - possibly apply any edits in the output buffer // - Munmap the output file // - possibly write more content to the file, which will not be edited later. +// +// And finally, it provides a mechanism by which you can multithread the +// writing of output files. This mechanism is accomplished by copying a OutBuf, +// and using it in the thread/goroutine. +// +// Parallel OutBuf is intended to be used like: +// +// func write(out *OutBuf) { +// var wg sync.WaitGroup +// for i := 0; i < 10; i++ { +// wg.Add(1) +// view, err := out.View(start[i]) +// if err != nil { +// // handle output +// continue +// } +// go func(out *OutBuf, i int) { +// // do output +// wg.Done() +// }(view, i) +// } +// wg.Wait() +// } type OutBuf struct { - arch *sys.Arch - off int64 - w *bufio.Writer - buf []byte // backing store of mmap'd output file - f *os.File - encbuf [8]byte // temp buffer used by WriteN methods + arch *sys.Arch + off int64 + w *bufio.Writer + buf []byte // backing store of mmap'd output file + name string + f *os.File + encbuf [8]byte // temp buffer used by WriteN methods + isView bool // true if created from View() + start, length uint64 // start and length mmaped data. +} + +func (out *OutBuf) Open(name string) error { + if out.f != nil { + return errors.New("cannont open more than one file") + } + f, err := os.OpenFile(name, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0775) + if err != nil { + return err + } + out.off = 0 + out.name = name + out.w = bufio.NewWriter(f) + out.f = f + return nil +} + +func NewOutBuf(arch *sys.Arch) *OutBuf { + return &OutBuf{ + arch: arch, + } +} + +var viewError = errors.New("output not mmapped") + +func (out *OutBuf) View(start uint64) (*OutBuf, error) { + if out.buf == nil { + return nil, viewError + } + return &OutBuf{ + arch: out.arch, + name: out.name, + buf: out.buf, + off: int64(start), + start: start, + length: out.length, + isView: true, + }, nil +} + +var viewCloseError = errors.New("cannot Close OutBuf from View") + +func (out *OutBuf) Close() error { + if out.isView { + return viewCloseError + } + out.Flush() + if out.f == nil { + return nil + } + if err := out.f.Close(); err != nil { + return err + } + out.f = nil + return nil } func (out *OutBuf) SeekSet(p int64) { @@ -45,7 +127,7 @@ func (out *OutBuf) SeekSet(p int64) { if out.buf == nil { out.Flush() if _, err := out.f.Seek(p, 0); err != nil { - Exitf("seeking to %d in %s: %v", p, out.f.Name(), err) + Exitf("seeking to %d in %s: %v", p, out.name, err) } } out.off = p @@ -154,13 +236,16 @@ func (out *OutBuf) WriteStringPad(s string, n int, pad []byte) { // edit to the symbol content. // If the output file is not Mmap'd, just writes the content. func (out *OutBuf) WriteSym(s *sym.Symbol) { + // NB: We inline the Write call for speediness. if out.buf != nil { start := out.off - out.Write(s.P) + n := copy(out.buf[out.off:], s.P) + out.off += int64(n) s.P = out.buf[start:out.off] s.Attr.Set(sym.AttrReadOnly, false) } else { - out.Write(s.P) + n, _ := out.w.Write(s.P) + out.off += int64(n) } } @@ -168,10 +253,11 @@ func (out *OutBuf) Flush() { var err error if out.buf != nil { err = out.Msync() - } else { + } + if out.w != nil { err = out.w.Flush() } if err != nil { - Exitf("flushing %s: %v", out.f.Name(), err) + Exitf("flushing %s: %v", out.name, err) } } diff --git a/src/cmd/link/internal/ld/outbuf_mmap.go b/src/cmd/link/internal/ld/outbuf_mmap.go index 4075141171..c064e9686a 100644 --- a/src/cmd/link/internal/ld/outbuf_mmap.go +++ b/src/cmd/link/internal/ld/outbuf_mmap.go @@ -16,11 +16,15 @@ func (out *OutBuf) Mmap(filesize uint64) error { if err != nil { Exitf("resize output file failed: %v", err) } + out.length = filesize out.buf, err = syscall.Mmap(int(out.f.Fd()), 0, int(filesize), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED|syscall.MAP_FILE) return err } func (out *OutBuf) Munmap() { + if out.buf == nil { + return + } err := out.Msync() if err != nil { Exitf("msync output file failed: %v", err) @@ -34,9 +38,12 @@ func (out *OutBuf) Munmap() { } func (out *OutBuf) Msync() error { + if out.buf == nil || out.length <= 0 { + return nil + } // TODO: netbsd supports mmap and msync, but the syscall package doesn't define MSYNC. // It is excluded from the build tag for now. - _, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(&out.buf[0])), uintptr(len(out.buf)), syscall.MS_SYNC) + _, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(&out.buf[0])), uintptr(out.length), syscall.MS_SYNC) if errno != 0 { return errno } diff --git a/src/cmd/link/internal/ld/outbuf_windows.go b/src/cmd/link/internal/ld/outbuf_windows.go index 1cb05c301f..e7cda75fc0 100644 --- a/src/cmd/link/internal/ld/outbuf_windows.go +++ b/src/cmd/link/internal/ld/outbuf_windows.go @@ -36,6 +36,7 @@ func (out *OutBuf) Munmap() { return } err := syscall.UnmapViewOfFile(uintptr(unsafe.Pointer(&out.buf[0]))) + out.buf = nil if err != nil { Exitf("UnmapViewOfFile failed: %v", err) } diff --git a/src/cmd/link/internal/ld/sym.go b/src/cmd/link/internal/ld/sym.go index 62e6af25b4..97966ed7e4 100644 --- a/src/cmd/link/internal/ld/sym.go +++ b/src/cmd/link/internal/ld/sym.go @@ -36,13 +36,15 @@ import ( "cmd/internal/sys" "cmd/link/internal/sym" "log" + "runtime" ) func linknew(arch *sys.Arch) *Link { ctxt := &Link{ Target: Target{Arch: arch}, Syms: sym.NewSymbols(), - Out: &OutBuf{arch: arch}, + outSem: make(chan int, 2*runtime.GOMAXPROCS(0)), + Out: NewOutBuf(arch), LibraryByPkg: make(map[string]*sym.Library), } @@ -51,8 +53,8 @@ func linknew(arch *sys.Arch) *Link { } AtExit(func() { - if nerrors > 0 && ctxt.Out.f != nil { - ctxt.Out.f.Close() + if nerrors > 0 { + ctxt.Out.Close() mayberemoveoutfile() } }) diff --git a/src/cmd/link/internal/x86/asm.go b/src/cmd/link/internal/x86/asm.go index 30ad08688c..0fdd7307ae 100644 --- a/src/cmd/link/internal/x86/asm.go +++ b/src/cmd/link/internal/x86/asm.go @@ -626,7 +626,7 @@ func asmb(ctxt *ld.Link) { sect := ld.Segtext.Sections[0] ctxt.Out.SeekSet(int64(sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff)) // 0xCC is INT $3 - breakpoint instruction - ld.CodeblkPad(ctxt, int64(sect.Vaddr), int64(sect.Length), []byte{0xCC}) + ld.CodeblkPad(ctxt, ctxt.Out, int64(sect.Vaddr), int64(sect.Length), []byte{0xCC}) for _, sect = range ld.Segtext.Sections[1:] { ctxt.Out.SeekSet(int64(sect.Vaddr - ld.Segtext.Vaddr + ld.Segtext.Fileoff)) ld.Datblk(ctxt, int64(sect.Vaddr), int64(sect.Length))