From: Josh Bleecher Snyder Date: Sun, 10 Apr 2016 16:08:00 +0000 (-0700) Subject: cmd/compile: avoid a spill in append fast path X-Git-Tag: go1.7beta1~791 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=6b33b0e98e9be77d98b026ae2adf10dd71be5a1b;p=gostls13.git cmd/compile: avoid a spill in append fast path Instead of spilling newlen, recalculate it. This removes a spill from the fast path, at the cost of a cheap recalculation on the (rare) growth path. This uses 8 bytes less of stack space. It generates two more bytes of code, but that is due to suboptimal register allocation; see far below. Runtime append microbenchmarks are all over the map, presumably due to incidental code movement. Sample code: func s(b []byte) []byte { b = append(b, 1, 2, 3) return b } Before: "".s t=1 size=160 args=0x30 locals=0x48 0x0000 00000 (append.go:8) TEXT "".s(SB), $72-48 0x0000 00000 (append.go:8) MOVQ (TLS), CX 0x0009 00009 (append.go:8) CMPQ SP, 16(CX) 0x000d 00013 (append.go:8) JLS 149 0x0013 00019 (append.go:8) SUBQ $72, SP 0x0017 00023 (append.go:8) FUNCDATA $0, gclocals·6432f8c6a0d23fa7bee6c5d96f21a92a(SB) 0x0017 00023 (append.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0017 00023 (append.go:9) MOVQ "".b+88(FP), CX 0x001c 00028 (append.go:9) LEAQ 3(CX), DX 0x0020 00032 (append.go:9) MOVQ DX, "".autotmp_0+64(SP) 0x0025 00037 (append.go:9) MOVQ "".b+96(FP), BX 0x002a 00042 (append.go:9) CMPQ DX, BX 0x002d 00045 (append.go:9) JGT $0, 86 0x002f 00047 (append.go:8) MOVQ "".b+80(FP), AX 0x0034 00052 (append.go:9) MOVB $1, (AX)(CX*1) 0x0038 00056 (append.go:9) MOVB $2, 1(AX)(CX*1) 0x003d 00061 (append.go:9) MOVB $3, 2(AX)(CX*1) 0x0042 00066 (append.go:10) MOVQ AX, "".~r1+104(FP) 0x0047 00071 (append.go:10) MOVQ DX, "".~r1+112(FP) 0x004c 00076 (append.go:10) MOVQ BX, "".~r1+120(FP) 0x0051 00081 (append.go:10) ADDQ $72, SP 0x0055 00085 (append.go:10) RET 0x0056 00086 (append.go:9) LEAQ type.[]uint8(SB), AX 0x005d 00093 (append.go:9) MOVQ AX, (SP) 0x0061 00097 (append.go:9) MOVQ "".b+80(FP), BP 0x0066 00102 (append.go:9) MOVQ BP, 8(SP) 0x006b 00107 (append.go:9) MOVQ CX, 16(SP) 0x0070 00112 (append.go:9) MOVQ BX, 24(SP) 0x0075 00117 (append.go:9) MOVQ DX, 32(SP) 0x007a 00122 (append.go:9) PCDATA $0, $0 0x007a 00122 (append.go:9) CALL runtime.growslice(SB) 0x007f 00127 (append.go:9) MOVQ 40(SP), AX 0x0084 00132 (append.go:9) MOVQ 56(SP), BX 0x0089 00137 (append.go:8) MOVQ "".b+88(FP), CX 0x008e 00142 (append.go:9) MOVQ "".autotmp_0+64(SP), DX 0x0093 00147 (append.go:9) JMP 52 0x0095 00149 (append.go:9) NOP 0x0095 00149 (append.go:8) CALL runtime.morestack_noctxt(SB) 0x009a 00154 (append.go:8) JMP 0 After: "".s t=1 size=176 args=0x30 locals=0x40 0x0000 00000 (append.go:8) TEXT "".s(SB), $64-48 0x0000 00000 (append.go:8) MOVQ (TLS), CX 0x0009 00009 (append.go:8) CMPQ SP, 16(CX) 0x000d 00013 (append.go:8) JLS 151 0x0013 00019 (append.go:8) SUBQ $64, SP 0x0017 00023 (append.go:8) FUNCDATA $0, gclocals·6432f8c6a0d23fa7bee6c5d96f21a92a(SB) 0x0017 00023 (append.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0017 00023 (append.go:9) MOVQ "".b+80(FP), CX 0x001c 00028 (append.go:9) LEAQ 3(CX), DX 0x0020 00032 (append.go:9) MOVQ "".b+88(FP), BX 0x0025 00037 (append.go:9) CMPQ DX, BX 0x0028 00040 (append.go:9) JGT $0, 81 0x002a 00042 (append.go:8) MOVQ "".b+72(FP), AX 0x002f 00047 (append.go:9) MOVB $1, (AX)(CX*1) 0x0033 00051 (append.go:9) MOVB $2, 1(AX)(CX*1) 0x0038 00056 (append.go:9) MOVB $3, 2(AX)(CX*1) 0x003d 00061 (append.go:10) MOVQ AX, "".~r1+96(FP) 0x0042 00066 (append.go:10) MOVQ DX, "".~r1+104(FP) 0x0047 00071 (append.go:10) MOVQ BX, "".~r1+112(FP) 0x004c 00076 (append.go:10) ADDQ $64, SP 0x0050 00080 (append.go:10) RET 0x0051 00081 (append.go:9) LEAQ type.[]uint8(SB), AX 0x0058 00088 (append.go:9) MOVQ AX, (SP) 0x005c 00092 (append.go:9) MOVQ "".b+72(FP), BP 0x0061 00097 (append.go:9) MOVQ BP, 8(SP) 0x0066 00102 (append.go:9) MOVQ CX, 16(SP) 0x006b 00107 (append.go:9) MOVQ BX, 24(SP) 0x0070 00112 (append.go:9) MOVQ DX, 32(SP) 0x0075 00117 (append.go:9) PCDATA $0, $0 0x0075 00117 (append.go:9) CALL runtime.growslice(SB) 0x007a 00122 (append.go:9) MOVQ 40(SP), AX 0x007f 00127 (append.go:9) MOVQ 48(SP), CX 0x0084 00132 (append.go:9) MOVQ 56(SP), BX 0x0089 00137 (append.go:9) ADDQ $3, CX 0x008d 00141 (append.go:9) MOVQ CX, DX 0x0090 00144 (append.go:8) MOVQ "".b+80(FP), CX 0x0095 00149 (append.go:9) JMP 47 0x0097 00151 (append.go:9) NOP 0x0097 00151 (append.go:8) CALL runtime.morestack_noctxt(SB) 0x009c 00156 (append.go:8) JMP 0 Observe that in the following sequence, we should use DX directly instead of using CX as a temporary register, which would make the new code a strict improvement on the old: 0x007f 00127 (append.go:9) MOVQ 48(SP), CX 0x0084 00132 (append.go:9) MOVQ 56(SP), BX 0x0089 00137 (append.go:9) ADDQ $3, CX 0x008d 00141 (append.go:9) MOVQ CX, DX 0x0090 00144 (append.go:8) MOVQ "".b+80(FP), CX Change-Id: I4ee50b18fa53865901d2d7f86c2cbb54c6fa6924 Reviewed-on: https://go-review.googlesource.com/21812 Run-TryBot: Josh Bleecher Snyder TryBot-Result: Gobot Gobot Reviewed-by: Keith Randall --- diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go index 7c5f906d76..d69559d945 100644 --- a/src/cmd/compile/internal/gc/ssa.go +++ b/src/cmd/compile/internal/gc/ssa.go @@ -337,12 +337,13 @@ var ( memVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "mem"}} // dummy nodes for temporary variables - ptrVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "ptr"}} - capVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "cap"}} - typVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "typ"}} - idataVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "idata"}} - okVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "ok"}} - deltaVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "delta"}} + ptrVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "ptr"}} + newlenVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "newlen"}} + capVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "cap"}} + typVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "typ"}} + idataVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "idata"}} + okVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "ok"}} + deltaVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "delta"}} ) // startBlock sets the current block we're generating code in to b. @@ -2089,15 +2090,16 @@ func (s *state) expr(n *Node) *ssa.Value { // exprAppend converts an OAPPEND node n to an ssa.Value, adds it to s, and returns the Value. func (s *state) exprAppend(n *Node) *ssa.Value { // append(s, e1, e2, e3). Compile like: - // ptr,len,cap := s + // ptr, len, cap := s // newlen := len + 3 // if newlen > s.cap { - // ptr,_,cap = growslice(s, newlen) + // ptr, len, cap = growslice(s, newlen) + // newlen = len + 3 // recalculate to avoid a spill // } // *(ptr+len) = e1 // *(ptr+len+1) = e2 // *(ptr+len+2) = e3 - // makeslice(ptr,newlen,cap) + // makeslice(ptr, newlen, cap) et := n.Type.Elem() pt := Ptrto(et) @@ -2117,6 +2119,7 @@ func (s *state) exprAppend(n *Node) *ssa.Value { nl := s.newValue2(s.ssaOp(OADD, Types[TINT]), Types[TINT], l, s.constInt(Types[TINT], nargs)) cmp := s.newValue2(s.ssaOp(OGT, Types[TINT]), Types[TBOOL], nl, c) s.vars[&ptrVar] = p + s.vars[&newlenVar] = nl s.vars[&capVar] = c b := s.endBlock() b.Kind = ssa.BlockIf @@ -2132,8 +2135,7 @@ func (s *state) exprAppend(n *Node) *ssa.Value { r := s.rtcall(growslice, true, []*Type{pt, Types[TINT], Types[TINT]}, taddr, p, l, c, nl) s.vars[&ptrVar] = r[0] - // Note: we don't need to read r[1], the result's length. It will be nl. - // (or maybe we should, we just have to spill/restore nl otherwise?) + s.vars[&newlenVar] = s.newValue2(s.ssaOp(OADD, Types[TINT]), Types[TINT], r[1], s.constInt(Types[TINT], nargs)) s.vars[&capVar] = r[2] b = s.endBlock() b.AddEdgeTo(assign) @@ -2154,8 +2156,9 @@ func (s *state) exprAppend(n *Node) *ssa.Value { } } - p = s.variable(&ptrVar, pt) // generates phi for ptr - c = s.variable(&capVar, Types[TINT]) // generates phi for cap + p = s.variable(&ptrVar, pt) // generates phi for ptr + nl = s.variable(&newlenVar, Types[TINT]) // generates phi for nl + c = s.variable(&capVar, Types[TINT]) // generates phi for cap p2 := s.newValue2(ssa.OpPtrIndex, pt, p, l) // TODO: just one write barrier call for all of these writes? // TODO: maybe just one writeBarrier.enabled check? @@ -2178,6 +2181,7 @@ func (s *state) exprAppend(n *Node) *ssa.Value { // make result delete(s.vars, &ptrVar) + delete(s.vars, &newlenVar) delete(s.vars, &capVar) return s.newValue3(ssa.OpSliceMake, n.Type, p, nl, c) } diff --git a/src/runtime/slice.go b/src/runtime/slice.go index 0bc0299f72..4ab221056c 100644 --- a/src/runtime/slice.go +++ b/src/runtime/slice.go @@ -37,6 +37,12 @@ func makeslice(t *slicetype, len64, cap64 int64) slice { // It is passed the slice type, the old slice, and the desired new minimum capacity, // and it returns a new slice with at least that capacity, with the old data // copied into it. +// The new slice's length is set to the old slice's length, +// NOT to the new requested capacity. +// This is for codegen convenience. The old slice's length is used immediately +// to calculate where to write new values during an append. +// TODO: When the old backend is gone, reconsider this decision. +// The SSA backend might prefer the new length or to return only ptr/cap and save stack space. func growslice(t *slicetype, old slice, cap int) slice { if raceenabled { callerpc := getcallerpc(unsafe.Pointer(&t))