From: korzhao Date: Wed, 5 Jul 2023 12:23:35 +0000 (+0800) Subject: encoding/base32: optimize Encode X-Git-Tag: go1.22rc1~1400 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=20b5f3ae8b01a14195057af7e082cdcf93fa2bce;p=gostls13.git encoding/base32: optimize Encode Convert each group of 5 x 8-bit source bytes into two 32-bit integers. This reduces the number of shift operations. benchmark old ns/op new ns/op delta BenchmarkEncode-10 9005 4426 -50.85% BenchmarkEncodeToString-10 10739 6155 -42.69% benchmark old MB/s new MB/s speedup BenchmarkEncode-10 909.69 1850.81 2.03x BenchmarkEncodeToString-10 762.84 1331.02 1.74x Change-Id: I9418d3436b73f94a4eb4b2b525e4f83612ff4d47 Reviewed-on: https://go-review.googlesource.com/c/go/+/514095 Reviewed-by: Michael Knyszek Run-TryBot: Ian Lance Taylor TryBot-Result: Gopher Robot Run-TryBot: Ian Lance Taylor Reviewed-by: Ian Lance Taylor Auto-Submit: Ian Lance Taylor --- diff --git a/src/encoding/base32/base32.go b/src/encoding/base32/base32.go index a4d515edbd..69ced9ca3c 100644 --- a/src/encoding/base32/base32.go +++ b/src/encoding/base32/base32.go @@ -109,77 +109,70 @@ func (enc Encoding) WithPadding(padding rune) *Encoding { // so Encode is not appropriate for use on individual blocks // of a large data stream. Use NewEncoder() instead. 
func (enc *Encoding) Encode(dst, src []byte) { - for len(src) > 0 { - var b [8]byte - - // Unpack 8x 5-bit source blocks into a 5 byte - // destination quantum - switch len(src) { - default: - b[7] = src[4] & 0x1F - b[6] = src[4] >> 5 - fallthrough - case 4: - b[6] |= (src[3] << 3) & 0x1F - b[5] = (src[3] >> 2) & 0x1F - b[4] = src[3] >> 7 - fallthrough - case 3: - b[4] |= (src[2] << 1) & 0x1F - b[3] = (src[2] >> 4) & 0x1F - fallthrough - case 2: - b[3] |= (src[1] << 4) & 0x1F - b[2] = (src[1] >> 1) & 0x1F - b[1] = (src[1] >> 6) & 0x1F - fallthrough - case 1: - b[1] |= (src[0] << 2) & 0x1F - b[0] = src[0] >> 3 - } - - // Encode 5-bit blocks using the base32 alphabet - size := len(dst) - if size >= 8 { - // Common case, unrolled for extra performance - dst[0] = enc.encode[b[0]&31] - dst[1] = enc.encode[b[1]&31] - dst[2] = enc.encode[b[2]&31] - dst[3] = enc.encode[b[3]&31] - dst[4] = enc.encode[b[4]&31] - dst[5] = enc.encode[b[5]&31] - dst[6] = enc.encode[b[6]&31] - dst[7] = enc.encode[b[7]&31] - } else { - for i := 0; i < size; i++ { - dst[i] = enc.encode[b[i]&31] - } - } + if len(src) == 0 { + return + } + // enc is a pointer receiver, so the use of enc.encode within the hot + // loop below means a nil check at every operation. Lift that nil check + // outside of the loop to speed up the encoder. + _ = enc.encode + + di, si := 0, 0 + n := (len(src) / 5) * 5 + for si < n { + // Combining two 32 bit loads allows the same code to be used + // for 32 and 64 bit platforms. 
+ hi := uint32(src[si+0])<<24 | uint32(src[si+1])<<16 | uint32(src[si+2])<<8 | uint32(src[si+3]) + lo := hi<<8 | uint32(src[si+4]) + + dst[di+0] = enc.encode[(hi>>27)&0x1F] + dst[di+1] = enc.encode[(hi>>22)&0x1F] + dst[di+2] = enc.encode[(hi>>17)&0x1F] + dst[di+3] = enc.encode[(hi>>12)&0x1F] + dst[di+4] = enc.encode[(hi>>7)&0x1F] + dst[di+5] = enc.encode[(hi>>2)&0x1F] + dst[di+6] = enc.encode[(lo>>5)&0x1F] + dst[di+7] = enc.encode[(lo)&0x1F] + + si += 5 + di += 8 + } - // Pad the final quantum - if len(src) < 5 { - if enc.padChar == NoPadding { - break - } + // Add the remaining small block + remain := len(src) - si + if remain == 0 { + return + } - dst[7] = byte(enc.padChar) - if len(src) < 4 { - dst[6] = byte(enc.padChar) - dst[5] = byte(enc.padChar) - if len(src) < 3 { - dst[4] = byte(enc.padChar) - if len(src) < 2 { - dst[3] = byte(enc.padChar) - dst[2] = byte(enc.padChar) - } - } - } + // Encode the remaining bytes in reverse order. + val := uint32(0) + switch remain { + case 4: + val |= uint32(src[si+3]) + dst[di+6] = enc.encode[val<<3&0x1F] + dst[di+5] = enc.encode[val>>2&0x1F] + fallthrough + case 3: + val |= uint32(src[si+2]) << 8 + dst[di+4] = enc.encode[val>>7&0x1F] + fallthrough + case 2: + val |= uint32(src[si+1]) << 16 + dst[di+3] = enc.encode[val>>12&0x1F] + dst[di+2] = enc.encode[val>>17&0x1F] + fallthrough + case 1: + val |= uint32(src[si+0]) << 24 + dst[di+1] = enc.encode[val>>22&0x1F] + dst[di+0] = enc.encode[val>>27&0x1F] + } - break + // Pad the final quantum + if enc.padChar != NoPadding { + nPad := (remain * 8 / 5) + 1 + for i := nPad; i < 8; i++ { + dst[di+i] = byte(enc.padChar) } - - src = src[5:] - dst = dst[8:] } }