From 41402b59bd7401bf0b4ff3043dfe3d3fa712e05c Mon Sep 17 00:00:00 2001 From: Michael Munday Date: Fri, 23 Mar 2018 13:00:47 +0000 Subject: [PATCH] crypto/rc4: optimize generic implementation slightly MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The compiler can't currently figure out that it can eliminate both c.s loads (using store to load forwarding) in the second line of the following code: ... c.s[i], c.s[j] = c.s[j], c.s[i] x := c.s[j] + c.s[i] ... The compiler eliminates the second load of c.s[j] (using the original value of c.s[i]), however the load of c.s[i] remains because the compiler doesn't know that c.s[i] and c.s[j] either overlap completely or not at all. Introducing temporaries to make this explicit improves the performance of the generic code slightly, the goal being to remove the assembly in this package in the future. This change also hoists a bounds check out of the main loop which gives a slight performance boost and also makes the behaviour identical to the assembly implementation when len(dst) < len(src). name old speed new speed delta RC4_128-4 491MB/s ± 3% 596MB/s ± 5% +21.51% (p=0.000 n=9+9) RC4_1K-4 504MB/s ± 2% 616MB/s ± 1% +22.33% (p=0.000 n=10+10) RC4_8K-4 509MB/s ± 1% 630MB/s ± 2% +23.85% (p=0.000 n=8+9) Change-Id: I27adc775713b2e74a1a94e0c1de0909fb4379463 Reviewed-on: https://go-review.googlesource.com/102335 Run-TryBot: Michael Munday TryBot-Result: Gobot Gobot Reviewed-by: Brad Fitzpatrick --- src/crypto/rc4/rc4.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/crypto/rc4/rc4.go b/src/crypto/rc4/rc4.go index 8274325c81..cf08ba7f8c 100644 --- a/src/crypto/rc4/rc4.go +++ b/src/crypto/rc4/rc4.go @@ -57,12 +57,19 @@ func (c *Cipher) Reset() { // This is the pure Go version. rc4_{amd64,386,arm}* contain assembly // implementations. This is here for tests and to prevent bitrot. func (c *Cipher) xorKeyStreamGeneric(dst, src []byte) { + if len(src) == 0 { + return + } i, j := c.i, c.j + _ = dst[len(src)-1] + dst = dst[:len(src)] // eliminate bounds check from loop for k, v := range src { i += 1 - j += uint8(c.s[i]) - c.s[i], c.s[j] = c.s[j], c.s[i] - dst[k] = v ^ uint8(c.s[uint8(c.s[i]+c.s[j])]) + x := c.s[i] + j += uint8(x) + y := c.s[j] + c.s[i], c.s[j] = y, x + dst[k] = v ^ uint8(c.s[uint8(x+y)]) } c.i, c.j = i, j } -- 2.50.0