From: Joe Tsai
Date: Thu, 8 Dec 2022 11:51:04 +0000 (-0800)
Subject: bytes, strings: avoid unnecessary zero initialization
X-Git-Tag: go1.21rc1~1458
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=132fae93b789ce512068ff4300c665b40635b74e;p=gostls13.git

bytes, strings: avoid unnecessary zero initialization

Add bytealg.MakeNoZero, which allocates a []byte without zeroing it.
It assumes the caller will populate every byte.

From within the bytes and strings packages, we use bytealg.MakeNoZero
only where our logic guarantees that the entire slice is overwritten,
so that uninitialized bytes are never leaked to the end user.

We use bytealg.MakeNoZero from within the following functions:

* bytes.Join
* bytes.Repeat
* bytes.ToUpper
* bytes.ToLower
* strings.Builder.Grow

The optimization in strings.Builder transitively benefits the following:

* strings.Join
* strings.Map
* strings.Repeat
* strings.ToUpper
* strings.ToLower
* strings.ToValidUTF8
* strings.Replace
* any user logic that depends on strings.Builder

This optimization is especially notable for large buffers that do not
fit in the CPU cache, where the costs of runtime.memclr and
runtime.memmove are non-trivial because both are limited by the
relatively slow speed of physical RAM.

Performance:

    RepeatLarge/256/1  66.0ns ± 3%  64.5ns ± 1%  ~  (p=0.095 n=5+5)
    RepeatLarge/256/16  55.4ns ± 5%  53.1ns ± 3%  -4.17%  (p=0.016 n=5+5)
    RepeatLarge/512/1  95.5ns ± 7%  87.1ns ± 2%  -8.78%  (p=0.008 n=5+5)
    RepeatLarge/512/16  84.4ns ± 9%  76.2ns ± 5%  -9.73%  (p=0.016 n=5+5)
    RepeatLarge/1024/1  161ns ± 4%  144ns ± 7%  -10.45%  (p=0.016 n=5+5)
    RepeatLarge/1024/16  148ns ± 3%  141ns ± 5%  ~  (p=0.095 n=5+5)
    RepeatLarge/2048/1  296ns ± 7%  288ns ± 5%  ~  (p=0.841 n=5+5)
    RepeatLarge/2048/16  298ns ± 8%  281ns ± 5%  ~  (p=0.151 n=5+5)
    RepeatLarge/4096/1  593ns ± 8%  539ns ± 8%  -8.99%  (p=0.032 n=5+5)
    RepeatLarge/4096/16  568ns ±12%  526ns ± 7%  ~  (p=0.056 n=5+5)
    RepeatLarge/8192/1  1.15µs ± 8%  1.08µs ±12%  ~  (p=0.095 n=5+5)
    RepeatLarge/8192/16  1.12µs ± 4%  1.07µs ± 7%  ~  (p=0.310 n=5+5)
    RepeatLarge/8192/4097  1.77ns ± 1%  1.76ns ± 2%  ~  (p=0.310 n=5+5)
    RepeatLarge/16384/1  2.06µs ± 7%  1.94µs ± 5%  ~  (p=0.222 n=5+5)
    RepeatLarge/16384/16  2.02µs ± 4%  1.92µs ± 6%  ~  (p=0.095 n=5+5)
    RepeatLarge/16384/4097  1.50µs ±15%  1.44µs ±11%  ~  (p=0.802 n=5+5)
    RepeatLarge/32768/1  3.90µs ± 8%  3.65µs ±11%  ~  (p=0.151 n=5+5)
    RepeatLarge/32768/16  3.92µs ±14%  3.68µs ±12%  ~  (p=0.222 n=5+5)
    RepeatLarge/32768/4097  3.71µs ± 5%  3.43µs ± 4%  -7.54%  (p=0.032 n=5+5)
    RepeatLarge/65536/1  7.47µs ± 8%  6.88µs ± 9%  ~  (p=0.056 n=5+5)
    RepeatLarge/65536/16  7.29µs ± 4%  6.74µs ± 6%  -7.60%  (p=0.016 n=5+5)
    RepeatLarge/65536/4097  7.90µs ±11%  6.34µs ± 5%  -19.81%  (p=0.008 n=5+5)
    RepeatLarge/131072/1  17.0µs ±18%  14.1µs ± 6%  -17.32%  (p=0.008 n=5+5)
    RepeatLarge/131072/16  15.2µs ± 2%  16.2µs ±17%  ~  (p=0.151 n=5+5)
    RepeatLarge/131072/4097  15.7µs ± 6%  14.8µs ±11%  ~  (p=0.095 n=5+5)
    RepeatLarge/262144/1  30.4µs ± 5%  31.4µs ±13%  ~  (p=0.548 n=5+5)
    RepeatLarge/262144/16  30.1µs ± 4%  30.7µs ±11%  ~  (p=1.000 n=5+5)
    RepeatLarge/262144/4097  31.2µs ± 7%  32.7µs ±13%  ~  (p=0.310 n=5+5)
    RepeatLarge/524288/1  67.5µs ± 9%  63.7µs ± 3%  ~  (p=0.095 n=5+5)
    RepeatLarge/524288/16  67.2µs ± 5%  62.9µs ± 6%  ~  (p=0.151 n=5+5)
    RepeatLarge/524288/4097  65.5µs ± 4%  65.2µs ±18%  ~  (p=0.548 n=5+5)
    RepeatLarge/1048576/1  141µs ± 6%  137µs ±14%  ~  (p=0.421 n=5+5)
    RepeatLarge/1048576/16  140µs ± 2%  134µs ±11%  ~  (p=0.222 n=5+5)
    RepeatLarge/1048576/4097  141µs ± 3%  134µs ±10%  ~  (p=0.151 n=5+5)
    RepeatLarge/2097152/1  258µs ± 2%  271µs ±10%  ~  (p=0.222 n=5+5)
    RepeatLarge/2097152/16  263µs ± 6%  273µs ± 9%  ~  (p=0.151 n=5+5)
    RepeatLarge/2097152/4097  270µs ± 2%  277µs ± 6%  ~  (p=0.690 n=5+5)
    RepeatLarge/4194304/1  684µs ± 3%  467µs ± 6%  -31.69%  (p=0.008 n=5+5)
    RepeatLarge/4194304/16  682µs ± 1%  471µs ± 7%  -30.91%  (p=0.008 n=5+5)
    RepeatLarge/4194304/4097  685µs ± 2%  465µs ±20%  -32.12%  (p=0.008 n=5+5)
    RepeatLarge/8388608/1  1.50ms ± 1%  1.16ms ± 8%  -22.63%  (p=0.008 n=5+5)
    RepeatLarge/8388608/16  1.50ms ± 2%  1.22ms ±17%  -18.49%  (p=0.008 n=5+5)
    RepeatLarge/8388608/4097  1.51ms ± 7%  1.33ms ±11%  -11.56%  (p=0.008 n=5+5)
    RepeatLarge/16777216/1  3.48ms ± 4%  2.66ms ±13%  -23.76%  (p=0.008 n=5+5)
    RepeatLarge/16777216/16  3.37ms ± 3%  2.57ms ±13%  -23.72%  (p=0.008 n=5+5)
    RepeatLarge/16777216/4097  3.38ms ± 9%  2.50ms ±11%  -26.16%  (p=0.008 n=5+5)
    RepeatLarge/33554432/1  7.74ms ± 1%  4.70ms ±19%  -39.31%  (p=0.016 n=4+5)
    RepeatLarge/33554432/16  7.90ms ± 4%  4.78ms ± 9%  -39.50%  (p=0.008 n=5+5)
    RepeatLarge/33554432/4097  7.80ms ± 2%  4.86ms ±11%  -37.60%  (p=0.008 n=5+5)
    RepeatLarge/67108864/1  16.4ms ± 3%  9.7ms ±15%  -41.29%  (p=0.008 n=5+5)
    RepeatLarge/67108864/16  16.5ms ± 1%  9.9ms ±15%  -39.83%  (p=0.008 n=5+5)
    RepeatLarge/67108864/4097  16.5ms ± 1%  11.0ms ±18%  -32.95%  (p=0.008 n=5+5)
    RepeatLarge/134217728/1  35.2ms ±12%  19.2ms ±10%  -45.58%  (p=0.008 n=5+5)
    RepeatLarge/134217728/16  34.6ms ± 6%  19.3ms ± 7%  -44.07%  (p=0.008 n=5+5)
    RepeatLarge/134217728/4097  33.2ms ± 2%  19.3ms ±14%  -41.79%  (p=0.008 n=5+5)
    RepeatLarge/268435456/1  70.9ms ± 2%  36.2ms ± 5%  -48.87%  (p=0.008 n=5+5)
    RepeatLarge/268435456/16  77.4ms ± 7%  36.1ms ± 8%  -53.33%  (p=0.008 n=5+5)
    RepeatLarge/268435456/4097  75.8ms ± 4%  37.0ms ± 4%  -51.15%  (p=0.008 n=5+5)
    RepeatLarge/536870912/1  163ms ±14%  77ms ± 9%  -52.94%  (p=0.008 n=5+5)
    RepeatLarge/536870912/16  156ms ± 4%  76ms ± 6%  -51.42%  (p=0.008 n=5+5)
    RepeatLarge/536870912/4097  151ms ± 2%  76ms ± 6%  -49.64%  (p=0.008 n=5+5)
    RepeatLarge/1073741824/1  293ms ± 5%  149ms ± 8%  -49.18%  (p=0.008 n=5+5)
    RepeatLarge/1073741824/16  308ms ± 9%  150ms ± 8%  -51.19%  (p=0.008 n=5+5)
    RepeatLarge/1073741824/4097  299ms ± 5%  151ms ± 6%  -49.51%  (p=0.008 n=5+5)

Updates #57153

Change-Id: I024553b7e676d6da6408278109ac1fa8def0a802
Reviewed-on: https://go-review.googlesource.com/c/go/+/456336
Reviewed-by: Dmitri Shuralyov
Reviewed-by: Ian Lance Taylor
Run-TryBot: Joseph Tsai
TryBot-Result: Gopher Robot
Reviewed-by: Daniel Martí
---

diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go
index ea8146c166..1b2dbd4c33 100644
--- a/src/bytes/bytes.go
+++ b/src/bytes/bytes.go
@@ -533,12 +533,22 @@ func Join(s [][]byte, sep []byte) []byte {
 		// Just return a copy.
 		return append([]byte(nil), s[0]...)
 	}
-	n := len(sep) * (len(s) - 1)
+
+	var n int
+	if len(sep) > 0 {
+		if len(sep) >= maxInt/(len(s)-1) {
+			panic("bytes: Join output length overflow")
+		}
+		n += len(sep) * (len(s) - 1)
+	}
 	for _, v := range s {
+		if len(v) > maxInt-n {
+			panic("bytes: Join output length overflow")
+		}
 		n += len(v)
 	}
 
-	b := make([]byte, n)
+	b := bytealg.MakeNoZero(n)
 	bp := copy(b, s[0])
 	for _, v := range s[1:] {
 		bp += copy(b[bp:], sep)
@@ -589,22 +599,22 @@ func Repeat(b []byte, count int) []byte {
 	if count == 0 {
 		return []byte{}
 	}
+
 	// Since we cannot return an error on overflow,
-	// we should panic if the repeat will generate
-	// an overflow.
+	// we should panic if the repeat will generate an overflow.
 	// See golang.org/issue/16237.
 	if count < 0 {
 		panic("bytes: negative Repeat count")
-	} else if len(b)*count/count != len(b) {
-		panic("bytes: Repeat count causes overflow")
 	}
+	if len(b) >= maxInt/count {
+		panic("bytes: Repeat output length overflow")
+	}
+	n := len(b) * count
 
 	if len(b) == 0 {
 		return []byte{}
 	}
 
-	n := len(b) * count
-
 	// Past a certain chunk size it is counterproductive to use
 	// larger chunks as the source of the write, as when the source
 	// is too large we are basically just thrashing the CPU D-cache.
@@ -623,9 +633,9 @@ func Repeat(b []byte, count int) []byte {
 			chunkMax = len(b)
 		}
 	}
-	nb := make([]byte, n)
+	nb := bytealg.MakeNoZero(n)
 	bp := copy(nb, b)
-	for bp < len(nb) {
+	for bp < n {
 		chunk := bp
 		if chunk > chunkMax {
 			chunk = chunkMax
@@ -653,7 +663,7 @@ func ToUpper(s []byte) []byte {
 			// Just return a copy.
 			return append([]byte(""), s...)
 		}
-		b := make([]byte, len(s))
+		b := bytealg.MakeNoZero(len(s))
 		for i := 0; i < len(s); i++ {
 			c := s[i]
 			if 'a' <= c && c <= 'z' {
@@ -683,7 +693,7 @@ func ToLower(s []byte) []byte {
 		if !hasUpper {
 			return append([]byte(""), s...)
 		}
-		b := make([]byte, len(s))
+		b := bytealg.MakeNoZero(len(s))
 		for i := 0; i < len(s); i++ {
 			c := s[i]
 			if 'A' <= c && c <= 'Z' {
diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go
index ebebce75fe..28f2742c0e 100644
--- a/src/internal/bytealg/bytealg.go
+++ b/src/internal/bytealg/bytealg.go
@@ -148,3 +148,8 @@ func IndexRabinKarp(s, substr string) int {
 	}
 	return -1
 }
+
+// MakeNoZero makes a slice of length and capacity n without zeroing the bytes.
+// It is the caller's responsibility to ensure uninitialized bytes
+// do not leak to the end user.
+func MakeNoZero(n int) []byte
diff --git a/src/runtime/slice.go b/src/runtime/slice.go
index 459dc8891e..04062f59fc 100644
--- a/src/runtime/slice.go
+++ b/src/runtime/slice.go
@@ -345,3 +345,11 @@ func slicecopy(toPtr unsafe.Pointer, toLen int, fromPtr unsafe.Pointer, fromLen
 	}
 	return n
 }
+
+//go:linkname bytealg_MakeNoZero internal/bytealg.MakeNoZero
+func bytealg_MakeNoZero(len int) []byte {
+	if uintptr(len) > maxAlloc {
+		panicmakeslicelen()
+	}
+	return unsafe.Slice((*byte)(mallocgc(uintptr(len), nil, false)), len)
+}
diff --git a/src/strings/builder.go b/src/strings/builder.go
index 7710464a0d..299ad51255 100644
--- a/src/strings/builder.go
+++ b/src/strings/builder.go
@@ -5,6 +5,7 @@
 package strings
 
 import (
+	"internal/bytealg"
 	"unicode/utf8"
 	"unsafe"
 )
@@ -65,7 +66,7 @@ func (b *Builder) Reset() {
 // grow copies the buffer to a new, larger buffer so that there are at least n
 // bytes of capacity beyond len(b.buf).
 func (b *Builder) grow(n int) {
-	buf := make([]byte, len(b.buf), 2*cap(b.buf)+n)
+	buf := bytealg.MakeNoZero(2*cap(b.buf) + n)[:len(b.buf)]
 	copy(buf, b.buf)
 	b.buf = buf
 }
diff --git a/src/strings/strings.go b/src/strings/strings.go
index 3f7d6fd1a2..2dd4321142 100644
--- a/src/strings/strings.go
+++ b/src/strings/strings.go
@@ -13,6 +13,8 @@ import (
 	"unicode/utf8"
 )
 
+const maxInt = int(^uint(0) >> 1)
+
 // explode splits s into a slice of UTF-8 strings,
 // one string per Unicode character up to a maximum of n (n < 0 means no limit).
 // Invalid UTF-8 bytes are sliced individually.
@@ -436,9 +438,19 @@ func Join(elems []string, sep string) string {
 	case 1:
 		return elems[0]
 	}
-	n := len(sep) * (len(elems) - 1)
-	for i := 0; i < len(elems); i++ {
-		n += len(elems[i])
+
+	var n int
+	if len(sep) > 0 {
+		if len(sep) >= maxInt/(len(elems)-1) {
+			panic("strings: Join output length overflow")
+		}
+		n += len(sep) * (len(elems) - 1)
+	}
+	for _, elem := range elems {
+		if len(elem) > maxInt-n {
+			panic("strings: Join output length overflow")
+		}
+		n += len(elem)
 	}
 
 	var b Builder
@@ -536,21 +548,20 @@ func Repeat(s string, count int) string {
 	}
 
 	// Since we cannot return an error on overflow,
-	// we should panic if the repeat will generate
-	// an overflow.
+	// we should panic if the repeat will generate an overflow.
 	// See golang.org/issue/16237.
 	if count < 0 {
 		panic("strings: negative Repeat count")
-	} else if len(s)*count/count != len(s) {
-		panic("strings: Repeat count causes overflow")
 	}
+	if len(s) >= maxInt/count {
+		panic("strings: Repeat output length overflow")
+	}
+	n := len(s) * count
 
 	if len(s) == 0 {
 		return ""
 	}
 
-	n := len(s) * count
-
 	// Past a certain chunk size it is counterproductive to use
 	// larger chunks as the source of the write, as when the source
 	// is too large we are basically just thrashing the CPU D-cache.
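
As a rough way to observe the effect of this change outside the Go tree, the following is a minimal benchmark sketch. The package name, file name, and benchmark name are invented for illustration and are not part of the CL above; only the public bytes and testing APIs are used. It drives bytes.Repeat on an output far larger than a typical CPU cache, which is the regime the commit message highlights. Saved as repeat_bench_test.go in its own package, it can be run with "go test -bench=Repeat64MiB" on toolchains from before and after this change, and the results compared with a tool such as benchstat.

// repeat_bench_test.go (hypothetical): times bytes.Repeat on a 64 MiB output,
// the case where skipping the redundant zeroing of the destination buffer
// is most visible.
package repeatbench

import (
	"bytes"
	"testing"
)

func BenchmarkRepeat64MiB(b *testing.B) {
	src := bytes.Repeat([]byte{'x'}, 4096) // 4 KiB source chunk
	const outLen = 64 << 20                // 64 MiB result, well beyond typical cache sizes
	b.SetBytes(outLen)
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		out := bytes.Repeat(src, outLen/len(src))
		if len(out) != outLen {
			b.Fatalf("got %d bytes, want %d", len(out), outLen)
		}
	}
}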