From 5168fcf63f5001b38f9ac64ce5c5e3c2d397363d Mon Sep 17 00:00:00 2001 From: templexxx Date: Sat, 21 Jul 2018 04:18:08 +0800 Subject: [PATCH] crypto/cipher: use SIMD for xor on amd64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit cpu: Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz Benchmark: xor name old time/op new time/op delta XORBytes/8Bytes-8 8.21ns ± 1% 6.35ns ± 3% -22.66% (p=0.008 n=5+5) XORBytes/128Bytes-8 17.9ns ± 1% 10.4ns ± 1% -41.68% (p=0.008 n=5+5) XORBytes/2048Bytes-8 187ns ± 1% 78ns ± 0% -58.44% (p=0.008 n=5+5) XORBytes/32768Bytes-8 2.87µs ± 1% 1.38µs ± 0% -52.05% (p=0.008 n=5+5) name old speed new speed delta XORBytes/8Bytes-8 974MB/s ± 1% 1260MB/s ± 2% +29.33% (p=0.008 n=5+5) XORBytes/128Bytes-8 7.15GB/s ± 0% 12.25GB/s ± 1% +71.17% (p=0.008 n=5+5) XORBytes/2048Bytes-8 10.9GB/s ± 1% 26.4GB/s ± 0% +140.99% (p=0.008 n=5+5) XORBytes/32768Bytes-8 11.4GB/s ± 1% 23.8GB/s ± 0% +108.52% (p=0.008 n=5+5) Benchmark: cipher name old time/op new time/op delta AESGCMSeal1K-8 269ns ± 6% 261ns ± 2% ~ (p=0.246 n=5+5) AESGCMOpen1K-8 242ns ± 1% 240ns ± 2% ~ (p=0.190 n=5+5) AESGCMSign8K-8 869ns ± 0% 870ns ± 1% ~ (p=0.683 n=5+5) AESGCMSeal8K-8 1.64µs ± 6% 1.59µs ± 7% ~ (p=0.151 n=5+5) AESGCMOpen8K-8 1.48µs ± 2% 1.46µs ± 0% -1.39% (p=0.008 n=5+5) AESCFBEncrypt1K-8 1.88µs ± 5% 1.62µs ± 1% -13.52% (p=0.008 n=5+5) AESCFBDecrypt1K-8 1.76µs ± 1% 1.58µs ± 1% -10.24% (p=0.016 n=4+5) AESOFB1K-8 1.10µs ± 4% 1.03µs ± 2% -6.36% (p=0.008 n=5+5) AESCTR1K-8 1.24µs ± 1% 1.17µs ± 0% -5.96% (p=0.008 n=5+5) AESCBCEncrypt1K-8 1.74µs ± 0% 1.14µs ± 1% -34.36% (p=0.008 n=5+5) AESCBCDecrypt1K-8 1.28µs ± 1% 1.10µs ± 1% -14.04% (p=0.008 n=5+5) name old speed new speed delta AESGCMSeal1K-8 3.81GB/s ± 6% 3.91GB/s ± 2% ~ (p=0.310 n=5+5) AESGCMOpen1K-8 4.23GB/s ± 1% 4.27GB/s ± 2% ~ (p=0.222 n=5+5) AESGCMSign8K-8 9.43GB/s ± 0% 9.41GB/s ± 1% ~ (p=0.841 n=5+5) AESGCMSeal8K-8 5.01GB/s ± 6% 5.16GB/s ± 6% ~ (p=0.151 n=5+5) AESGCMOpen8K-8 5.54GB/s ± 2% 5.62GB/s ± 0% +1.41% (p=0.008 n=5+5) AESCFBEncrypt1K-8 543MB/s ± 5% 627MB/s ± 1% +15.55% (p=0.008 n=5+5) AESCFBDecrypt1K-8 580MB/s ± 1% 646MB/s ± 1% +11.40% (p=0.016 n=4+5) AESOFB1K-8 925MB/s ± 4% 988MB/s ± 2% +6.73% (p=0.008 n=5+5) AESCTR1K-8 821MB/s ± 1% 873MB/s ± 1% +6.34% (p=0.008 n=5+5) AESCBCEncrypt1K-8 588MB/s ± 1% 897MB/s ± 1% +52.36% (p=0.008 n=5+5) AESCBCDecrypt1K-8 799MB/s ± 1% 929MB/s ± 1% +16.32% (p=0.008 n=5+5) Change-Id: I42e6ba66c23dad853d33c924fca7b0ed805cefdd Reviewed-on: https://go-review.googlesource.com/c/125316 Reviewed-by: Ilya Tocar Run-TryBot: Ilya Tocar TryBot-Result: Gobot Gobot --- src/crypto/cipher/export_test.go | 8 +++ src/crypto/cipher/xor_amd64.go | 27 ++++++++ src/crypto/cipher/xor_amd64.s | 54 ++++++++++++++++ src/crypto/cipher/{xor.go => xor_generic.go} | 62 +++++++++--------- src/crypto/cipher/xor_test.go | 68 ++++++++++++++++---- 5 files changed, 177 insertions(+), 42 deletions(-) create mode 100644 src/crypto/cipher/export_test.go create mode 100644 src/crypto/cipher/xor_amd64.go create mode 100644 src/crypto/cipher/xor_amd64.s rename src/crypto/cipher/{xor.go => xor_generic.go} (75%) diff --git a/src/crypto/cipher/export_test.go b/src/crypto/cipher/export_test.go new file mode 100644 index 0000000000..cf8007ab49 --- /dev/null +++ b/src/crypto/cipher/export_test.go @@ -0,0 +1,8 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cipher + +// Export internal functions for testing. +var XorBytes = xorBytes diff --git a/src/crypto/cipher/xor_amd64.go b/src/crypto/cipher/xor_amd64.go new file mode 100644 index 0000000000..a595acc017 --- /dev/null +++ b/src/crypto/cipher/xor_amd64.go @@ -0,0 +1,27 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cipher + +// xorBytes xors the bytes in a and b. The destination should have enough +// space, otherwise xorBytes will panic. Returns the number of bytes xor'd. +func xorBytes(dst, a, b []byte) int { + n := len(a) + if len(b) < n { + n = len(b) + } + if n == 0 { + return 0 + } + _ = dst[n-1] + xorBytesSSE2(&dst[0], &a[0], &b[0], n) // amd64 must have SSE2 + return n +} + +func xorWords(dst, a, b []byte) { + xorBytes(dst, a, b) +} + +//go:noescape +func xorBytesSSE2(dst, a, b *byte, n int) diff --git a/src/crypto/cipher/xor_amd64.s b/src/crypto/cipher/xor_amd64.s new file mode 100644 index 0000000000..780d37a06e --- /dev/null +++ b/src/crypto/cipher/xor_amd64.s @@ -0,0 +1,54 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// func xorBytesSSE2(dst, a, b *byte, n int) +TEXT ·xorBytesSSE2(SB), NOSPLIT, $0 + MOVQ dst+0(FP), BX + MOVQ a+8(FP), SI + MOVQ b+16(FP), CX + MOVQ n+24(FP), DX + TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned. + JNZ not_aligned + +aligned: + MOVQ $0, AX // position in slices + +loop16b: + MOVOU (SI)(AX*1), X0 // XOR 16byte forwards. + MOVOU (CX)(AX*1), X1 + PXOR X1, X0 + MOVOU X0, (BX)(AX*1) + ADDQ $16, AX + CMPQ DX, AX + JNE loop16b + RET + +loop_1b: + SUBQ $1, DX // XOR 1byte backwards. + MOVB (SI)(DX*1), DI + MOVB (CX)(DX*1), AX + XORB AX, DI + MOVB DI, (BX)(DX*1) + TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b. + JNZ loop_1b + CMPQ DX, $0 // if len is 0, ret. + JE ret + TESTQ $15, DX // AND 15 & len, if zero jump to aligned. + JZ aligned + +not_aligned: + TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b. + JNE loop_1b + SUBQ $8, DX // XOR 8bytes backwards. + MOVQ (SI)(DX*1), DI + MOVQ (CX)(DX*1), AX + XORQ AX, DI + MOVQ DI, (BX)(DX*1) + CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned. + JGE aligned + +ret: + RET diff --git a/src/crypto/cipher/xor.go b/src/crypto/cipher/xor_generic.go similarity index 75% rename from src/crypto/cipher/xor.go rename to src/crypto/cipher/xor_generic.go index 5b26eace09..4d660b0a75 100644 --- a/src/crypto/cipher/xor.go +++ b/src/crypto/cipher/xor_generic.go @@ -2,6 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +// +build !amd64 + package cipher import ( @@ -9,12 +11,9 @@ import ( "unsafe" ) -const wordSize = int(unsafe.Sizeof(uintptr(0))) -const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x" - -// fastXORBytes xors in bulk. It only works on architectures that -// support unaligned read/writes. -func fastXORBytes(dst, a, b []byte) int { +// xorBytes xors the bytes in a and b. The destination should have enough +// space, otherwise xorBytes will panic. Returns the number of bytes xor'd. +func xorBytes(dst, a, b []byte) int { n := len(a) if len(b) < n { n = len(b) @@ -22,6 +21,28 @@ func fastXORBytes(dst, a, b []byte) int { if n == 0 { return 0 } + + switch { + case supportsUnaligned: + fastXORBytes(dst, a, b, n) + default: + // TODO(hanwen): if (dst, a, b) have common alignment + // we could still try fastXORBytes. It is not clear + // how often this happens, and it's only worth it if + // the block encryption itself is hardware + // accelerated. + safeXORBytes(dst, a, b, n) + } + return n +} + +const wordSize = int(unsafe.Sizeof(uintptr(0))) +const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x" + +// fastXORBytes xors in bulk. It only works on architectures that +// support unaligned read/writes. +// n needs to be smaller or equal than the length of a and b. +func fastXORBytes(dst, a, b []byte, n int) { // Assert dst has enough space _ = dst[n-1] @@ -38,34 +59,13 @@ func fastXORBytes(dst, a, b []byte) int { for i := (n - n%wordSize); i < n; i++ { dst[i] = a[i] ^ b[i] } - - return n } -func safeXORBytes(dst, a, b []byte) int { - n := len(a) - if len(b) < n { - n = len(b) - } +// n needs to be smaller or equal than the length of a and b. +func safeXORBytes(dst, a, b []byte, n int) { for i := 0; i < n; i++ { dst[i] = a[i] ^ b[i] } - return n -} - -// xorBytes xors the bytes in a and b. The destination should have enough -// space, otherwise xorBytes will panic. Returns the number of bytes xor'd. -func xorBytes(dst, a, b []byte) int { - if supportsUnaligned { - return fastXORBytes(dst, a, b) - } else { - // TODO(hanwen): if (dst, a, b) have common alignment - // we could still try fastXORBytes. It is not clear - // how often this happens, and it's only worth it if - // the block encryption itself is hardware - // accelerated. - return safeXORBytes(dst, a, b) - } } // fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.) @@ -80,10 +80,12 @@ func fastXORWords(dst, a, b []byte) { } } +// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.) +// The slice arguments a and b are assumed to be of equal length. func xorWords(dst, a, b []byte) { if supportsUnaligned { fastXORWords(dst, a, b) } else { - safeXORBytes(dst, a, b) + safeXORBytes(dst, a, b, len(b)) } } diff --git a/src/crypto/cipher/xor_test.go b/src/crypto/cipher/xor_test.go index d9187eb726..24877efc36 100644 --- a/src/crypto/cipher/xor_test.go +++ b/src/crypto/cipher/xor_test.go @@ -2,27 +2,71 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -package cipher +package cipher_test import ( "bytes" + "crypto/cipher" + "crypto/rand" + "fmt" + "io" "testing" ) func TestXOR(t *testing.T) { - for alignP := 0; alignP < 2; alignP++ { - for alignQ := 0; alignQ < 2; alignQ++ { - for alignD := 0; alignD < 2; alignD++ { - p := make([]byte, 1024)[alignP:] - q := make([]byte, 1024)[alignQ:] - d1 := make([]byte, 1024+alignD)[alignD:] - d2 := make([]byte, 1024+alignD)[alignD:] - xorBytes(d1, p, q) - safeXORBytes(d2, p, q) - if !bytes.Equal(d1, d2) { - t.Error("not equal") + for j := 1; j <= 1024; j++ { + for alignP := 0; alignP < 2; alignP++ { + for alignQ := 0; alignQ < 2; alignQ++ { + for alignD := 0; alignD < 2; alignD++ { + p := make([]byte, j)[alignP:] + q := make([]byte, j)[alignQ:] + d1 := make([]byte, j+alignD)[alignD:] + d2 := make([]byte, j+alignD)[alignD:] + if _, err := io.ReadFull(rand.Reader, p); err != nil { + t.Fatal(err) + } + if _, err := io.ReadFull(rand.Reader, q); err != nil { + t.Fatal(err) + } + cipher.XorBytes(d1, p, q) + n := min(p, q) + for i := 0; i < n; i++ { + d2[i] = p[i] ^ q[i] + } + if !bytes.Equal(d1, d2) { + t.Logf("p: %#v", p) + t.Logf("q: %#v", q) + t.Logf("expect: %#v", d2) + t.Logf("result: %#v", d1) + t.Fatal("not equal") + } } } } } } + +func min(a, b []byte) int { + n := len(a) + if len(b) < n { + n = len(b) + } + return n +} + +func BenchmarkXORBytes(b *testing.B) { + dst := make([]byte, 1<<15) + data0 := make([]byte, 1<<15) + data1 := make([]byte, 1<<15) + sizes := []int64{1 << 3, 1 << 7, 1 << 11, 1 << 15} + for _, size := range sizes { + b.Run(fmt.Sprintf("%dBytes", size), func(b *testing.B) { + s0 := data0[:size] + s1 := data1[:size] + b.SetBytes(int64(size)) + for i := 0; i < b.N; i++ { + cipher.XorBytes(dst, s0, s1) + } + }) + } +} -- 2.48.1