From: Andreas Auernhammer Date: Tue, 21 Aug 2018 14:12:36 +0000 (+0200) Subject: crypto/rc4: remove assembler implementations X-Git-Tag: go1.12beta1~1362 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=30eda6715c6578de2086f03df36c4a8def838ec2;p=gostls13.git crypto/rc4: remove assembler implementations This CL removes the RC4 assembler implementations. RC4 is broken and should not be used for encryption anymore. Therefore it's not worth maintaining platform-specific assembler implementations. The native Go implementation may be slower or faster depending on the CPU: name old time/op new time/op delta RC4_128-4 256ns ± 0% 196ns ± 0% -23.78% (p=0.029 n=4+4) RC4_1K-4 2.38µs ± 0% 1.54µs ± 0% -35.22% (p=0.029 n=4+4) RC4_8K-4 19.4µs ± 1% 12.0µs ± 0% -38.35% (p=0.029 n=4+4) name old speed new speed delta RC4_128-4 498MB/s ± 0% 654MB/s ± 0% +31.12% (p=0.029 n=4+4) RC4_1K-4 431MB/s ± 0% 665MB/s ± 0% +54.34% (p=0.029 n=4+4) RC4_8K-4 418MB/s ± 1% 677MB/s ± 0% +62.18% (p=0.029 n=4+4) vendor_id : GenuineIntel cpu family : 6 model : 142 model name : Intel(R) Core(TM) i5-7Y54 CPU @ 1.20GHz stepping : 9 microcode : 0x84 cpu MHz : 800.036 cache size : 4096 KB name old time/op new time/op delta RC4_128-4 235ns ± 1% 431ns ± 0% +83.00% (p=0.000 n=10+10) RC4_1K-4 1.74µs ± 0% 3.41µs ± 0% +96.74% (p=0.000 n=10+10) RC4_8K-4 13.6µs ± 1% 26.8µs ± 0% +97.58% (p=0.000 n=10+9) name old speed new speed delta RC4_128-4 543MB/s ± 0% 297MB/s ± 1% -45.29% (p=0.000 n=10+10) RC4_1K-4 590MB/s ± 0% 300MB/s ± 0% -49.16% (p=0.000 n=10+10) RC4_8K-4 596MB/s ± 1% 302MB/s ± 0% -49.39% (p=0.000 n=10+9) vendor_id : GenuineIntel cpu family : 6 model : 63 model name : Intel(R) Xeon(R) CPU @ 2.30GHz stepping : 0 microcode : 0x1 cpu MHz : 2300.000 cache size : 46080 KB Fixes #25417 Change-Id: I4124037154aaaa8e48d300c23974f125b6055a1c Reviewed-on: https://go-review.googlesource.com/130397 Run-TryBot: Filippo Valsorda Reviewed-by: Brad Fitzpatrick Reviewed-by: Filippo Valsorda TryBot-Result: Gobot Gobot --- diff --git a/src/crypto/rc4/rc4.go b/src/crypto/rc4/rc4.go index c445bb078f..d5e6ebcd71 100644 --- a/src/crypto/rc4/rc4.go +++ b/src/crypto/rc4/rc4.go @@ -54,12 +54,9 @@ func (c *Cipher) Reset() { c.i, c.j = 0, 0 } -// xorKeyStreamGeneric sets dst to the result of XORing src with the -// key stream. Dst and src must overlap entirely or not at all. -// -// This is the pure Go version. rc4_{amd64,386,arm}* contain assembly -// implementations. This is here for tests and to prevent bitrot. -func (c *Cipher) xorKeyStreamGeneric(dst, src []byte) { +// XORKeyStream sets dst to the result of XORing src with the key stream. +// Dst and src must overlap entirely or not at all. +func (c *Cipher) XORKeyStream(dst, src []byte) { if len(src) == 0 { return } diff --git a/src/crypto/rc4/rc4_386.s b/src/crypto/rc4/rc4_386.s deleted file mode 100644 index 54221036ba..0000000000 --- a/src/crypto/rc4/rc4_386.s +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2013 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "textflag.h" - -// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8) -TEXT ·xorKeyStream(SB),NOSPLIT,$0 - MOVL dst+0(FP), DI - MOVL src+4(FP), SI - MOVL state+12(FP), BP - - MOVL i+16(FP), AX - MOVBLZX (AX), AX - MOVL j+20(FP), BX - MOVBLZX (BX), BX - CMPL n+8(FP), $0 - JEQ done - -loop: - // i += 1 - INCB AX - - // j += c.s[i] - MOVBLZX (BP)(AX*4), DX - ADDB DX, BX - MOVBLZX BX, BX - - // c.s[i], c.s[j] = c.s[j], c.s[i] - MOVBLZX (BP)(BX*4), CX - MOVB CX, (BP)(AX*4) - MOVB DX, (BP)(BX*4) - - // *dst = *src ^ c.s[c.s[i]+c.s[j]] - ADDB DX, CX - MOVBLZX CX, CX - MOVB (BP)(CX*4), CX - XORB (SI), CX - MOVBLZX CX, CX - MOVB CX, (DI) - - INCL SI - INCL DI - DECL n+8(FP) - JNE loop - -done: - MOVL i+16(FP), CX - MOVB AX, (CX) - MOVL j+20(FP), CX - MOVB BX, (CX) - - RET diff --git a/src/crypto/rc4/rc4_amd64.s b/src/crypto/rc4/rc4_amd64.s deleted file mode 100644 index 57d941c8f3..0000000000 --- a/src/crypto/rc4/rc4_amd64.s +++ /dev/null @@ -1,179 +0,0 @@ -// Original source: -// http://www.zorinaq.com/papers/rc4-amd64.html -// http://www.zorinaq.com/papers/rc4-amd64.tar.bz2 - -#include "textflag.h" - -// Local modifications: -// -// Transliterated from GNU to 6a assembly syntax by the Go authors. -// The comments and spacing are from the original. -// -// The new EXTEND macros avoid a bad stall on some systems after 8-bit math. -// -// The original code accumulated 64 bits of key stream in an integer -// register and then XOR'ed the key stream into the data 8 bytes at a time. -// Modified to accumulate 128 bits of key stream into an XMM register -// and then XOR the key stream into the data 16 bytes at a time. -// Approximately doubles throughput. - -// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5 -// but makes the code run 2.0x slower on Xeon. -#define EXTEND(r) MOVBLZX r, r - -/* -** RC4 implementation optimized for AMD64. -** -** Author: Marc Bevand -** Licence: I hereby disclaim the copyright on this code and place it -** in the public domain. -** -** The code has been designed to be easily integrated into openssl: -** the exported RC4() function can replace the actual implementations -** openssl already contains. Please note that when linking with openssl, -** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled -** with -DRC4_INT='unsigned long'. -** -** The throughput achieved by this code is about 320 MBytes/sec, on -** a 1.8 GHz AMD Opteron (rev C0) processor. -*/ - -TEXT ·xorKeyStream(SB),NOSPLIT,$0 - MOVQ n+16(FP), BX // rbx = ARG(len) - MOVQ src+8(FP), SI // in = ARG(in) - MOVQ dst+0(FP), DI // out = ARG(out) - MOVQ state+24(FP), BP // d = ARG(data) - MOVQ i+32(FP), AX - MOVBQZX 0(AX), CX // x = *xp - MOVQ j+40(FP), AX - MOVBQZX 0(AX), DX // y = *yp - - LEAQ (SI)(BX*1), R9 // limit = in+len - -l1: CMPQ SI, R9 // cmp in with in+len - JGE finished // jump if (in >= in+len) - - INCB CX - EXTEND(CX) - TESTL $15, CX - JZ wordloop - - MOVBLZX (BP)(CX*4), AX - - ADDB AX, DX // y += tx - EXTEND(DX) - MOVBLZX (BP)(DX*4), BX // ty = d[y] - MOVB BX, (BP)(CX*4) // d[x] = ty - ADDB AX, BX // val = ty+tx - EXTEND(BX) - MOVB AX, (BP)(DX*4) // d[y] = tx - MOVBLZX (BP)(BX*4), R8 // val = d[val] - XORB (SI), R8 // xor 1 byte - MOVB R8, (DI) - INCQ SI // in++ - INCQ DI // out++ - JMP l1 - -wordloop: - SUBQ $16, R9 - CMPQ SI, R9 - JGT end - -start: - ADDQ $16, SI // increment in - ADDQ $16, DI // increment out - - // Each KEYROUND generates one byte of key and - // inserts it into an XMM register at the given 16-bit index. - // The key state array is uint32 words only using the bottom - // byte of each word, so the 16-bit OR only copies 8 useful bits. - // We accumulate alternating bytes into X0 and X1, and then at - // the end we OR X1<<8 into X0 to produce the actual key. - // - // At the beginning of the loop, CX%16 == 0, so the 16 loads - // at state[CX], state[CX+1], ..., state[CX+15] can precompute - // (state+CX) as R12 and then become R12[0], R12[1], ... R12[15], - // without fear of the byte computation CX+15 wrapping around. - // - // The first round needs R12[0], the second needs R12[1], and so on. - // We can avoid memory stalls by starting the load for round n+1 - // before the end of round n, using the LOAD macro. - LEAQ (BP)(CX*4), R12 - -#define KEYROUND(xmm, load, off, r1, r2, index) \ - MOVBLZX (BP)(DX*4), R8; \ - MOVB r1, (BP)(DX*4); \ - load((off+1), r2); \ - MOVB R8, (off*4)(R12); \ - ADDB r1, R8; \ - EXTEND(R8); \ - PINSRW $index, (BP)(R8*4), xmm - -#define LOAD(off, reg) \ - MOVBLZX (off*4)(R12), reg; \ - ADDB reg, DX; \ - EXTEND(DX) - -#define SKIP(off, reg) - - LOAD(0, AX) - KEYROUND(X0, LOAD, 0, AX, BX, 0) - KEYROUND(X1, LOAD, 1, BX, AX, 0) - KEYROUND(X0, LOAD, 2, AX, BX, 1) - KEYROUND(X1, LOAD, 3, BX, AX, 1) - KEYROUND(X0, LOAD, 4, AX, BX, 2) - KEYROUND(X1, LOAD, 5, BX, AX, 2) - KEYROUND(X0, LOAD, 6, AX, BX, 3) - KEYROUND(X1, LOAD, 7, BX, AX, 3) - KEYROUND(X0, LOAD, 8, AX, BX, 4) - KEYROUND(X1, LOAD, 9, BX, AX, 4) - KEYROUND(X0, LOAD, 10, AX, BX, 5) - KEYROUND(X1, LOAD, 11, BX, AX, 5) - KEYROUND(X0, LOAD, 12, AX, BX, 6) - KEYROUND(X1, LOAD, 13, BX, AX, 6) - KEYROUND(X0, LOAD, 14, AX, BX, 7) - KEYROUND(X1, SKIP, 15, BX, AX, 7) - - ADDB $16, CX - - PSLLQ $8, X1 - PXOR X1, X0 - MOVOU -16(SI), X2 - PXOR X0, X2 - MOVOU X2, -16(DI) - - CMPQ SI, R9 // cmp in with in+len-16 - JLE start // jump if (in <= in+len-16) - -end: - DECB CX - ADDQ $16, R9 // tmp = in+len - - // handle the last bytes, one by one -l2: CMPQ SI, R9 // cmp in with in+len - JGE finished // jump if (in >= in+len) - - INCB CX - EXTEND(CX) - MOVBLZX (BP)(CX*4), AX - - ADDB AX, DX // y += tx - EXTEND(DX) - MOVBLZX (BP)(DX*4), BX // ty = d[y] - MOVB BX, (BP)(CX*4) // d[x] = ty - ADDB AX, BX // val = ty+tx - EXTEND(BX) - MOVB AX, (BP)(DX*4) // d[y] = tx - MOVBLZX (BP)(BX*4), R8 // val = d[val] - XORB (SI), R8 // xor 1 byte - MOVB R8, (DI) - INCQ SI // in++ - INCQ DI // out++ - JMP l2 - -finished: - MOVQ j+40(FP), BX - MOVB DX, 0(BX) - MOVQ i+32(FP), AX - MOVB CX, 0(AX) - RET diff --git a/src/crypto/rc4/rc4_amd64p32.s b/src/crypto/rc4/rc4_amd64p32.s deleted file mode 100644 index 970b34e08e..0000000000 --- a/src/crypto/rc4/rc4_amd64p32.s +++ /dev/null @@ -1,192 +0,0 @@ -// Original source: -// http://www.zorinaq.com/papers/rc4-amd64.html -// http://www.zorinaq.com/papers/rc4-amd64.tar.bz2 - -#include "textflag.h" - -// Local modifications: -// -// Transliterated from GNU to 6a assembly syntax by the Go authors. -// The comments and spacing are from the original. -// -// The new EXTEND macros avoid a bad stall on some systems after 8-bit math. -// -// The original code accumulated 64 bits of key stream in an integer -// register and then XOR'ed the key stream into the data 8 bytes at a time. -// Modified to accumulate 128 bits of key stream into an XMM register -// and then XOR the key stream into the data 16 bytes at a time. -// Approximately doubles throughput. -// -// Converted to amd64p32. -// -// To make safe for Native Client, avoid use of BP, R15, -// and two-register addressing modes. - -// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5 -// but makes the code run 2.0x slower on Xeon. -#define EXTEND(r) MOVBLZX r, r - -/* -** RC4 implementation optimized for AMD64. -** -** Author: Marc Bevand -** Licence: I hereby disclaim the copyright on this code and place it -** in the public domain. -** -** The code has been designed to be easily integrated into openssl: -** the exported RC4() function can replace the actual implementations -** openssl already contains. Please note that when linking with openssl, -** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled -** with -DRC4_INT='unsigned long'. -** -** The throughput achieved by this code is about 320 MBytes/sec, on -** a 1.8 GHz AMD Opteron (rev C0) processor. -*/ - -TEXT ·xorKeyStream(SB),NOSPLIT,$0 - MOVL n+8(FP), BX // rbx = ARG(len) - MOVL src+4(FP), SI // in = ARG(in) - MOVL dst+0(FP), DI // out = ARG(out) - MOVL state+12(FP), R10 // d = ARG(data) - MOVL i+16(FP), AX - MOVBQZX 0(AX), CX // x = *xp - MOVL j+20(FP), AX - MOVBQZX 0(AX), DX // y = *yp - - LEAQ (SI)(BX*1), R9 // limit = in+len - -l1: CMPQ SI, R9 // cmp in with in+len - JGE finished // jump if (in >= in+len) - - INCB CX - EXTEND(CX) - TESTL $15, CX - JZ wordloop - LEAL (R10)(CX*4), R12 - - MOVBLZX (R12), AX - - ADDB AX, DX // y += tx - EXTEND(DX) - LEAL (R10)(DX*4), R11 - MOVBLZX (R11), BX // ty = d[y] - MOVB BX, (R12) // d[x] = ty - ADDB AX, BX // val = ty+tx - EXTEND(BX) - LEAL (R10)(BX*4), R13 - MOVB AX, (R11) // d[y] = tx - MOVBLZX (R13), R8 // val = d[val] - XORB (SI), R8 // xor 1 byte - MOVB R8, (DI) - INCQ SI // in++ - INCQ DI // out++ - JMP l1 - -wordloop: - SUBQ $16, R9 - CMPQ SI, R9 - JGT end - -start: - ADDQ $16, SI // increment in - ADDQ $16, DI // increment out - - // Each KEYROUND generates one byte of key and - // inserts it into an XMM register at the given 16-bit index. - // The key state array is uint32 words only using the bottom - // byte of each word, so the 16-bit OR only copies 8 useful bits. - // We accumulate alternating bytes into X0 and X1, and then at - // the end we OR X1<<8 into X0 to produce the actual key. - // - // At the beginning of the loop, CX%16 == 0, so the 16 loads - // at state[CX], state[CX+1], ..., state[CX+15] can precompute - // (state+CX) as R12 and then become R12[0], R12[1], ... R12[15], - // without fear of the byte computation CX+15 wrapping around. - // - // The first round needs R12[0], the second needs R12[1], and so on. - // We can avoid memory stalls by starting the load for round n+1 - // before the end of round n, using the LOAD macro. - LEAQ (R10)(CX*4), R12 - -#define KEYROUND(xmm, load, off, r1, r2, index) \ - LEAL (R10)(DX*4), R11; \ - MOVBLZX (R11), R8; \ - MOVB r1, (R11); \ - load((off+1), r2); \ - MOVB R8, (off*4)(R12); \ - ADDB r1, R8; \ - EXTEND(R8); \ - LEAL (R10)(R8*4), R14; \ - PINSRW $index, (R14), xmm - -#define LOAD(off, reg) \ - MOVBLZX (off*4)(R12), reg; \ - ADDB reg, DX; \ - EXTEND(DX) - -#define SKIP(off, reg) - - LOAD(0, AX) - KEYROUND(X0, LOAD, 0, AX, BX, 0) - KEYROUND(X1, LOAD, 1, BX, AX, 0) - KEYROUND(X0, LOAD, 2, AX, BX, 1) - KEYROUND(X1, LOAD, 3, BX, AX, 1) - KEYROUND(X0, LOAD, 4, AX, BX, 2) - KEYROUND(X1, LOAD, 5, BX, AX, 2) - KEYROUND(X0, LOAD, 6, AX, BX, 3) - KEYROUND(X1, LOAD, 7, BX, AX, 3) - KEYROUND(X0, LOAD, 8, AX, BX, 4) - KEYROUND(X1, LOAD, 9, BX, AX, 4) - KEYROUND(X0, LOAD, 10, AX, BX, 5) - KEYROUND(X1, LOAD, 11, BX, AX, 5) - KEYROUND(X0, LOAD, 12, AX, BX, 6) - KEYROUND(X1, LOAD, 13, BX, AX, 6) - KEYROUND(X0, LOAD, 14, AX, BX, 7) - KEYROUND(X1, SKIP, 15, BX, AX, 7) - - ADDB $16, CX - - PSLLQ $8, X1 - PXOR X1, X0 - MOVOU -16(SI), X2 - PXOR X0, X2 - MOVOU X2, -16(DI) - - CMPQ SI, R9 // cmp in with in+len-16 - JLE start // jump if (in <= in+len-16) - -end: - DECB CX - ADDQ $16, R9 // tmp = in+len - - // handle the last bytes, one by one -l2: CMPQ SI, R9 // cmp in with in+len - JGE finished // jump if (in >= in+len) - - INCB CX - EXTEND(CX) - LEAL (R10)(CX*4), R12 - MOVBLZX (R12), AX - - ADDB AX, DX // y += tx - EXTEND(DX) - LEAL (R10)(DX*4), R11 - MOVBLZX (R11), BX // ty = d[y] - MOVB BX, (R12) // d[x] = ty - ADDB AX, BX // val = ty+tx - EXTEND(BX) - LEAL (R10)(BX*4), R13 - MOVB AX, (R11) // d[y] = tx - MOVBLZX (R13), R8 // val = d[val] - XORB (SI), R8 // xor 1 byte - MOVB R8, (DI) - INCQ SI // in++ - INCQ DI // out++ - JMP l2 - -finished: - MOVL j+20(FP), BX - MOVB DX, 0(BX) - MOVL i+16(FP), AX - MOVB CX, 0(AX) - RET diff --git a/src/crypto/rc4/rc4_arm.s b/src/crypto/rc4/rc4_arm.s deleted file mode 100644 index c726d6d1c0..0000000000 --- a/src/crypto/rc4/rc4_arm.s +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2013 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !nacl - -#include "textflag.h" - -// Registers -#define Rdst R0 -#define Rsrc R1 -#define Rn R2 -#define Rstate R3 -#define Rpi R4 -#define Rpj R5 -#define Ri R6 -#define Rj R7 -#define Rk R8 -#define Rt R11 -#define Rt2 R12 - -// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8) -TEXT ·xorKeyStream(SB),NOSPLIT,$0 - MOVW dst+0(FP), Rdst - MOVW src+4(FP), Rsrc - MOVW n+8(FP), Rn - MOVW state+12(FP), Rstate - MOVW i+16(FP), Rpi - MOVW j+20(FP), Rpj - MOVBU (Rpi), Ri - MOVBU (Rpj), Rj - MOVW $0, Rk - -loop: - // i += 1; j += state[i] - ADD $1, Ri - AND $0xff, Ri - MOVBU Ri<<2(Rstate), Rt - ADD Rt, Rj - AND $0xff, Rj - - // swap state[i] <-> state[j] - MOVBU Rj<<2(Rstate), Rt2 - MOVB Rt2, Ri<<2(Rstate) - MOVB Rt, Rj<<2(Rstate) - - // dst[k] = src[k] ^ state[state[i] + state[j]] - ADD Rt2, Rt - AND $0xff, Rt - MOVBU Rt<<2(Rstate), Rt - MOVBU Rk<<0(Rsrc), Rt2 - EOR Rt, Rt2 - MOVB Rt2, Rk<<0(Rdst) - - ADD $1, Rk - CMP Rk, Rn - BNE loop - -done: - MOVB Ri, (Rpi) - MOVB Rj, (Rpj) - RET diff --git a/src/crypto/rc4/rc4_asm.go b/src/crypto/rc4/rc4_asm.go deleted file mode 100644 index fc79e7ffc7..0000000000 --- a/src/crypto/rc4/rc4_asm.go +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2013 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build amd64 amd64p32 arm,!nacl 386 - -package rc4 - -import "crypto/internal/subtle" - -func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8) - -// XORKeyStream sets dst to the result of XORing src with the key stream. -// Dst and src must overlap entirely or not at all. -func (c *Cipher) XORKeyStream(dst, src []byte) { - if len(src) == 0 { - return - } - if len(dst) < len(src) { - panic("crypto/cipher: output smaller than input") - } - if subtle.InexactOverlap(dst[:len(src)], src) { - panic("crypto/cipher: invalid buffer overlap") - } - xorKeyStream(&dst[0], &src[0], len(src), &c.s, &c.i, &c.j) -} diff --git a/src/crypto/rc4/rc4_ref.go b/src/crypto/rc4/rc4_ref.go deleted file mode 100644 index 9b98fc49e7..0000000000 --- a/src/crypto/rc4/rc4_ref.go +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright 2013 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !amd64,!amd64p32,!arm,!386 arm,nacl - -package rc4 - -// XORKeyStream sets dst to the result of XORing src with the key stream. -// Dst and src must overlap entirely or not at all. -func (c *Cipher) XORKeyStream(dst, src []byte) { - c.xorKeyStreamGeneric(dst, src) -} diff --git a/src/crypto/rc4/rc4_test.go b/src/crypto/rc4/rc4_test.go index 1fc08b8593..e7356aa45d 100644 --- a/src/crypto/rc4/rc4_test.go +++ b/src/crypto/rc4/rc4_test.go @@ -117,30 +117,19 @@ func TestGolden(t *testing.T) { } func TestBlock(t *testing.T) { - testBlock(t, (*Cipher).XORKeyStream) -} - -// Test the pure Go version. -// Because we have assembly for amd64, 386, and arm, this prevents -// bitrot of the reference implementations. -func TestBlockGeneric(t *testing.T) { - testBlock(t, (*Cipher).xorKeyStreamGeneric) -} - -func testBlock(t *testing.T, xor func(c *Cipher, dst, src []byte)) { c1a, _ := NewCipher(golden[0].key) c1b, _ := NewCipher(golden[1].key) data1 := make([]byte, 1<<20) for i := range data1 { - xor(c1a, data1[i:i+1], data1[i:i+1]) - xor(c1b, data1[i:i+1], data1[i:i+1]) + c1a.XORKeyStream(data1[i:i+1], data1[i:i+1]) + c1b.XORKeyStream(data1[i:i+1], data1[i:i+1]) } c2a, _ := NewCipher(golden[0].key) c2b, _ := NewCipher(golden[1].key) data2 := make([]byte, 1<<20) - xor(c2a, data2, data2) - xor(c2b, data2, data2) + c2a.XORKeyStream(data2, data2) + c2b.XORKeyStream(data2, data2) if !bytes.Equal(data1, data2) { t.Fatalf("bad block")