From: korzhao <korzhao95@gmail.com>
Date: Wed, 5 Jul 2023 12:23:35 +0000 (+0800)
Subject: encoding/base32: optimize Encode
X-Git-Tag: go1.22rc1~1400
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=20b5f3ae8b01a14195057af7e082cdcf93fa2bce;p=gostls13.git

encoding/base32: optimize Encode

Converts the 5 x 8-bit source byte to two 32-bit integers.
This will reduce the number of shift operations.

benchmark                      old ns/op     new ns/op     delta
BenchmarkEncode-10             9005          4426          -50.85%
BenchmarkEncodeToString-10     10739         6155          -42.69%

benchmark                      old MB/s     new MB/s     speedup
BenchmarkEncode-10             909.69       1850.81      2.03x
BenchmarkEncodeToString-10     762.84       1331.02      1.74x

Change-Id: I9418d3436b73f94a4eb4b2b525e4f83612ff4d47
Reviewed-on: https://go-review.googlesource.com/c/go/+/514095
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Ian Lance Taylor <iant@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Ian Lance Taylor <iant@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Auto-Submit: Ian Lance Taylor <iant@google.com>
---

diff --git a/src/encoding/base32/base32.go b/src/encoding/base32/base32.go
index a4d515edbd..69ced9ca3c 100644
--- a/src/encoding/base32/base32.go
+++ b/src/encoding/base32/base32.go
@@ -109,77 +109,70 @@ func (enc Encoding) WithPadding(padding rune) *Encoding {
 // so Encode is not appropriate for use on individual blocks
 // of a large data stream. Use NewEncoder() instead.
 func (enc *Encoding) Encode(dst, src []byte) {
-	for len(src) > 0 {
-		var b [8]byte
-
-		// Unpack 8x 5-bit source blocks into a 5 byte
-		// destination quantum
-		switch len(src) {
-		default:
-			b[7] = src[4] & 0x1F
-			b[6] = src[4] >> 5
-			fallthrough
-		case 4:
-			b[6] |= (src[3] << 3) & 0x1F
-			b[5] = (src[3] >> 2) & 0x1F
-			b[4] = src[3] >> 7
-			fallthrough
-		case 3:
-			b[4] |= (src[2] << 1) & 0x1F
-			b[3] = (src[2] >> 4) & 0x1F
-			fallthrough
-		case 2:
-			b[3] |= (src[1] << 4) & 0x1F
-			b[2] = (src[1] >> 1) & 0x1F
-			b[1] = (src[1] >> 6) & 0x1F
-			fallthrough
-		case 1:
-			b[1] |= (src[0] << 2) & 0x1F
-			b[0] = src[0] >> 3
-		}
-
-		// Encode 5-bit blocks using the base32 alphabet
-		size := len(dst)
-		if size >= 8 {
-			// Common case, unrolled for extra performance
-			dst[0] = enc.encode[b[0]&31]
-			dst[1] = enc.encode[b[1]&31]
-			dst[2] = enc.encode[b[2]&31]
-			dst[3] = enc.encode[b[3]&31]
-			dst[4] = enc.encode[b[4]&31]
-			dst[5] = enc.encode[b[5]&31]
-			dst[6] = enc.encode[b[6]&31]
-			dst[7] = enc.encode[b[7]&31]
-		} else {
-			for i := 0; i < size; i++ {
-				dst[i] = enc.encode[b[i]&31]
-			}
-		}
+	if len(src) == 0 {
+		return
+	}
+	// enc is a pointer receiver, so the use of enc.encode within the hot
+	// loop below means a nil check at every operation. Lift that nil check
+	// outside of the loop to speed up the encoder.
+	_ = enc.encode
+
+	di, si := 0, 0
+	n := (len(src) / 5) * 5
+	for si < n {
+		// Combining two 32 bit loads allows the same code to be used
+		// for 32 and 64 bit platforms.
+		hi := uint32(src[si+0])<<24 | uint32(src[si+1])<<16 | uint32(src[si+2])<<8 | uint32(src[si+3])
+		lo := hi<<8 | uint32(src[si+4])
+
+		dst[di+0] = enc.encode[(hi>>27)&0x1F]
+		dst[di+1] = enc.encode[(hi>>22)&0x1F]
+		dst[di+2] = enc.encode[(hi>>17)&0x1F]
+		dst[di+3] = enc.encode[(hi>>12)&0x1F]
+		dst[di+4] = enc.encode[(hi>>7)&0x1F]
+		dst[di+5] = enc.encode[(hi>>2)&0x1F]
+		dst[di+6] = enc.encode[(lo>>5)&0x1F]
+		dst[di+7] = enc.encode[(lo)&0x1F]
+
+		si += 5
+		di += 8
+	}
 
-		// Pad the final quantum
-		if len(src) < 5 {
-			if enc.padChar == NoPadding {
-				break
-			}
+	// Add the remaining small block
+	remain := len(src) - si
+	if remain == 0 {
+		return
+	}
 
-			dst[7] = byte(enc.padChar)
-			if len(src) < 4 {
-				dst[6] = byte(enc.padChar)
-				dst[5] = byte(enc.padChar)
-				if len(src) < 3 {
-					dst[4] = byte(enc.padChar)
-					if len(src) < 2 {
-						dst[3] = byte(enc.padChar)
-						dst[2] = byte(enc.padChar)
-					}
-				}
-			}
+	// Encode the remaining bytes in reverse order.
+	val := uint32(0)
+	switch remain {
+	case 4:
+		val |= uint32(src[si+3])
+		dst[di+6] = enc.encode[val<<3&0x1F]
+		dst[di+5] = enc.encode[val>>2&0x1F]
+		fallthrough
+	case 3:
+		val |= uint32(src[si+2]) << 8
+		dst[di+4] = enc.encode[val>>7&0x1F]
+		fallthrough
+	case 2:
+		val |= uint32(src[si+1]) << 16
+		dst[di+3] = enc.encode[val>>12&0x1F]
+		dst[di+2] = enc.encode[val>>17&0x1F]
+		fallthrough
+	case 1:
+		val |= uint32(src[si+0]) << 24
+		dst[di+1] = enc.encode[val>>22&0x1F]
+		dst[di+0] = enc.encode[val>>27&0x1F]
+	}
 
-			break
+	// Pad the final quantum
+	if enc.padChar != NoPadding {
+		nPad := (remain * 8 / 5) + 1
+		for i := nPad; i < 8; i++ {
+			dst[di+i] = byte(enc.padChar)
 		}
-
-		src = src[5:]
-		dst = dst[8:]
 	}
 }