]> Cypherpunks repositories - gostls13.git/commitdiff
crypto/cipher: use SIMD for xor on amd64
authortemplexxx <lucas1x1x@gmail.com>
Fri, 20 Jul 2018 20:18:08 +0000 (04:18 +0800)
committerBrad Fitzpatrick <bradfitz@golang.org>
Tue, 30 Oct 2018 17:09:42 +0000 (17:09 +0000)
cpu: Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz

Benchmark: xor

name                   old time/op    new time/op     delta
XORBytes/8Bytes-8        8.21ns ± 1%     6.35ns ± 3%   -22.66%  (p=0.008 n=5+5)
XORBytes/128Bytes-8      17.9ns ± 1%     10.4ns ± 1%   -41.68%  (p=0.008 n=5+5)
XORBytes/2048Bytes-8      187ns ± 1%       78ns ± 0%   -58.44%  (p=0.008 n=5+5)
XORBytes/32768Bytes-8    2.87µs ± 1%     1.38µs ± 0%   -52.05%  (p=0.008 n=5+5)

name                   old speed      new speed       delta
XORBytes/8Bytes-8       974MB/s ± 1%   1260MB/s ± 2%   +29.33%  (p=0.008 n=5+5)
XORBytes/128Bytes-8    7.15GB/s ± 0%  12.25GB/s ± 1%   +71.17%  (p=0.008 n=5+5)
XORBytes/2048Bytes-8   10.9GB/s ± 1%   26.4GB/s ± 0%  +140.99%  (p=0.008 n=5+5)
XORBytes/32768Bytes-8  11.4GB/s ± 1%   23.8GB/s ± 0%  +108.52%  (p=0.008 n=5+5)

Benchmark: cipher

name               old time/op    new time/op    delta
AESGCMSeal1K-8        269ns ± 6%     261ns ± 2%     ~     (p=0.246 n=5+5)
AESGCMOpen1K-8        242ns ± 1%     240ns ± 2%     ~     (p=0.190 n=5+5)
AESGCMSign8K-8        869ns ± 0%     870ns ± 1%     ~     (p=0.683 n=5+5)
AESGCMSeal8K-8       1.64µs ± 6%    1.59µs ± 7%     ~     (p=0.151 n=5+5)
AESGCMOpen8K-8       1.48µs ± 2%    1.46µs ± 0%   -1.39%  (p=0.008 n=5+5)
AESCFBEncrypt1K-8    1.88µs ± 5%    1.62µs ± 1%  -13.52%  (p=0.008 n=5+5)
AESCFBDecrypt1K-8    1.76µs ± 1%    1.58µs ± 1%  -10.24%  (p=0.016 n=4+5)
AESOFB1K-8           1.10µs ± 4%    1.03µs ± 2%   -6.36%  (p=0.008 n=5+5)
AESCTR1K-8           1.24µs ± 1%    1.17µs ± 0%   -5.96%  (p=0.008 n=5+5)
AESCBCEncrypt1K-8    1.74µs ± 0%    1.14µs ± 1%  -34.36%  (p=0.008 n=5+5)
AESCBCDecrypt1K-8    1.28µs ± 1%    1.10µs ± 1%  -14.04%  (p=0.008 n=5+5)

name               old speed      new speed      delta
AESGCMSeal1K-8     3.81GB/s ± 6%  3.91GB/s ± 2%     ~     (p=0.310 n=5+5)
AESGCMOpen1K-8     4.23GB/s ± 1%  4.27GB/s ± 2%     ~     (p=0.222 n=5+5)
AESGCMSign8K-8     9.43GB/s ± 0%  9.41GB/s ± 1%     ~     (p=0.841 n=5+5)
AESGCMSeal8K-8     5.01GB/s ± 6%  5.16GB/s ± 6%     ~     (p=0.151 n=5+5)
AESGCMOpen8K-8     5.54GB/s ± 2%  5.62GB/s ± 0%   +1.41%  (p=0.008 n=5+5)
AESCFBEncrypt1K-8   543MB/s ± 5%   627MB/s ± 1%  +15.55%  (p=0.008 n=5+5)
AESCFBDecrypt1K-8   580MB/s ± 1%   646MB/s ± 1%  +11.40%  (p=0.016 n=4+5)
AESOFB1K-8          925MB/s ± 4%   988MB/s ± 2%   +6.73%  (p=0.008 n=5+5)
AESCTR1K-8          821MB/s ± 1%   873MB/s ± 1%   +6.34%  (p=0.008 n=5+5)
AESCBCEncrypt1K-8   588MB/s ± 1%   897MB/s ± 1%  +52.36%  (p=0.008 n=5+5)
AESCBCDecrypt1K-8   799MB/s ± 1%   929MB/s ± 1%  +16.32%  (p=0.008 n=5+5)

Change-Id: I42e6ba66c23dad853d33c924fca7b0ed805cefdd
Reviewed-on: https://go-review.googlesource.com/c/125316
Reviewed-by: Ilya Tocar <ilya.tocar@intel.com>
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>

src/crypto/cipher/export_test.go [new file with mode: 0644]
src/crypto/cipher/xor_amd64.go [new file with mode: 0644]
src/crypto/cipher/xor_amd64.s [new file with mode: 0644]
src/crypto/cipher/xor_generic.go [moved from src/crypto/cipher/xor.go with 75% similarity]
src/crypto/cipher/xor_test.go

diff --git a/src/crypto/cipher/export_test.go b/src/crypto/cipher/export_test.go
new file mode 100644 (file)
index 0000000..cf8007a
--- /dev/null
@@ -0,0 +1,8 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cipher
+
+// Export internal functions for testing.
+var XorBytes = xorBytes
diff --git a/src/crypto/cipher/xor_amd64.go b/src/crypto/cipher/xor_amd64.go
new file mode 100644 (file)
index 0000000..a595acc
--- /dev/null
@@ -0,0 +1,27 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cipher
+
+// xorBytes xors the bytes in a and b. The destination should have enough
+// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
+func xorBytes(dst, a, b []byte) int {
+       n := len(a)
+       if len(b) < n {
+               n = len(b)
+       }
+       if n == 0 {
+               return 0
+       }
+       _ = dst[n-1]
+       xorBytesSSE2(&dst[0], &a[0], &b[0], n) // amd64 must have SSE2
+       return n
+}
+
+func xorWords(dst, a, b []byte) {
+       xorBytes(dst, a, b)
+}
+
+//go:noescape
+func xorBytesSSE2(dst, a, b *byte, n int)
diff --git a/src/crypto/cipher/xor_amd64.s b/src/crypto/cipher/xor_amd64.s
new file mode 100644 (file)
index 0000000..780d37a
--- /dev/null
@@ -0,0 +1,54 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func xorBytesSSE2(dst, a, b *byte, n int)
+TEXT ·xorBytesSSE2(SB), NOSPLIT, $0
+       MOVQ  dst+0(FP), BX
+       MOVQ  a+8(FP), SI
+       MOVQ  b+16(FP), CX
+       MOVQ  n+24(FP), DX
+       TESTQ $15, DX            // AND 15 & len, if not zero jump to not_aligned.
+       JNZ   not_aligned
+
+aligned:
+       MOVQ $0, AX // position in slices
+
+loop16b:
+       MOVOU (SI)(AX*1), X0   // XOR 16byte forwards.
+       MOVOU (CX)(AX*1), X1
+       PXOR  X1, X0
+       MOVOU X0, (BX)(AX*1)
+       ADDQ  $16, AX
+       CMPQ  DX, AX
+       JNE   loop16b
+       RET
+
+loop_1b:
+       SUBQ  $1, DX           // XOR 1byte backwards.
+       MOVB  (SI)(DX*1), DI
+       MOVB  (CX)(DX*1), AX
+       XORB  AX, DI
+       MOVB  DI, (BX)(DX*1)
+       TESTQ $7, DX           // AND 7 & len, if not zero jump to loop_1b.
+       JNZ   loop_1b
+       CMPQ  DX, $0           // if len is 0, ret.
+       JE    ret
+       TESTQ $15, DX          // AND 15 & len, if zero jump to aligned.
+       JZ    aligned
+
+not_aligned:
+       TESTQ $7, DX           // AND $7 & len, if not zero jump to loop_1b.
+       JNE   loop_1b
+       SUBQ  $8, DX           // XOR 8bytes backwards.
+       MOVQ  (SI)(DX*1), DI
+       MOVQ  (CX)(DX*1), AX
+       XORQ  AX, DI
+       MOVQ  DI, (BX)(DX*1)
+       CMPQ  DX, $16          // if len is greater or equal 16 here, it must be aligned.
+       JGE   aligned
+
+ret:
+       RET
similarity index 75%
rename from src/crypto/cipher/xor.go
rename to src/crypto/cipher/xor_generic.go
index 5b26eace09d8bd98919c8335ab1f060bb937fa48..4d660b0a75a50d95bb26742977d310987758bfe1 100644 (file)
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build !amd64
+
 package cipher
 
 import (
@@ -9,12 +11,9 @@ import (
        "unsafe"
 )
 
-const wordSize = int(unsafe.Sizeof(uintptr(0)))
-const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
-
-// fastXORBytes xors in bulk. It only works on architectures that
-// support unaligned read/writes.
-func fastXORBytes(dst, a, b []byte) int {
+// xorBytes xors the bytes in a and b. The destination should have enough
+// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
+func xorBytes(dst, a, b []byte) int {
        n := len(a)
        if len(b) < n {
                n = len(b)
@@ -22,6 +21,28 @@ func fastXORBytes(dst, a, b []byte) int {
        if n == 0 {
                return 0
        }
+
+       switch {
+       case supportsUnaligned:
+               fastXORBytes(dst, a, b, n)
+       default:
+               // TODO(hanwen): if (dst, a, b) have common alignment
+               // we could still try fastXORBytes. It is not clear
+               // how often this happens, and it's only worth it if
+               // the block encryption itself is hardware
+               // accelerated.
+               safeXORBytes(dst, a, b, n)
+       }
+       return n
+}
+
+const wordSize = int(unsafe.Sizeof(uintptr(0)))
+const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
+
+// fastXORBytes xors in bulk. It only works on architectures that
+// support unaligned read/writes.
+// n needs to be smaller or equal than the length of a and b.
+func fastXORBytes(dst, a, b []byte, n int) {
        // Assert dst has enough space
        _ = dst[n-1]
 
@@ -38,34 +59,13 @@ func fastXORBytes(dst, a, b []byte) int {
        for i := (n - n%wordSize); i < n; i++ {
                dst[i] = a[i] ^ b[i]
        }
-
-       return n
 }
 
-func safeXORBytes(dst, a, b []byte) int {
-       n := len(a)
-       if len(b) < n {
-               n = len(b)
-       }
+// n needs to be smaller or equal than the length of a and b.
+func safeXORBytes(dst, a, b []byte, n int) {
        for i := 0; i < n; i++ {
                dst[i] = a[i] ^ b[i]
        }
-       return n
-}
-
-// xorBytes xors the bytes in a and b. The destination should have enough
-// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
-func xorBytes(dst, a, b []byte) int {
-       if supportsUnaligned {
-               return fastXORBytes(dst, a, b)
-       } else {
-               // TODO(hanwen): if (dst, a, b) have common alignment
-               // we could still try fastXORBytes. It is not clear
-               // how often this happens, and it's only worth it if
-               // the block encryption itself is hardware
-               // accelerated.
-               return safeXORBytes(dst, a, b)
-       }
 }
 
 // fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
@@ -80,10 +80,12 @@ func fastXORWords(dst, a, b []byte) {
        }
 }
 
+// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
+// The slice arguments a and b are assumed to be of equal length.
 func xorWords(dst, a, b []byte) {
        if supportsUnaligned {
                fastXORWords(dst, a, b)
        } else {
-               safeXORBytes(dst, a, b)
+               safeXORBytes(dst, a, b, len(b))
        }
 }
index d9187eb72638508dfa14baab795d338cc7bc463a..24877efc367e24478c357ea38632abf051a2926d 100644 (file)
@@ -2,27 +2,71 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-package cipher
+package cipher_test
 
 import (
        "bytes"
+       "crypto/cipher"
+       "crypto/rand"
+       "fmt"
+       "io"
        "testing"
 )
 
 func TestXOR(t *testing.T) {
-       for alignP := 0; alignP < 2; alignP++ {
-               for alignQ := 0; alignQ < 2; alignQ++ {
-                       for alignD := 0; alignD < 2; alignD++ {
-                               p := make([]byte, 1024)[alignP:]
-                               q := make([]byte, 1024)[alignQ:]
-                               d1 := make([]byte, 1024+alignD)[alignD:]
-                               d2 := make([]byte, 1024+alignD)[alignD:]
-                               xorBytes(d1, p, q)
-                               safeXORBytes(d2, p, q)
-                               if !bytes.Equal(d1, d2) {
-                                       t.Error("not equal")
+       for j := 1; j <= 1024; j++ {
+               for alignP := 0; alignP < 2; alignP++ {
+                       for alignQ := 0; alignQ < 2; alignQ++ {
+                               for alignD := 0; alignD < 2; alignD++ {
+                                       p := make([]byte, j)[alignP:]
+                                       q := make([]byte, j)[alignQ:]
+                                       d1 := make([]byte, j+alignD)[alignD:]
+                                       d2 := make([]byte, j+alignD)[alignD:]
+                                       if _, err := io.ReadFull(rand.Reader, p); err != nil {
+                                               t.Fatal(err)
+                                       }
+                                       if _, err := io.ReadFull(rand.Reader, q); err != nil {
+                                               t.Fatal(err)
+                                       }
+                                       cipher.XorBytes(d1, p, q)
+                                       n := min(p, q)
+                                       for i := 0; i < n; i++ {
+                                               d2[i] = p[i] ^ q[i]
+                                       }
+                                       if !bytes.Equal(d1, d2) {
+                                               t.Logf("p: %#v", p)
+                                               t.Logf("q: %#v", q)
+                                               t.Logf("expect: %#v", d2)
+                                               t.Logf("result: %#v", d1)
+                                               t.Fatal("not equal")
+                                       }
                                }
                        }
                }
        }
 }
+
+func min(a, b []byte) int {
+       n := len(a)
+       if len(b) < n {
+               n = len(b)
+       }
+       return n
+}
+
+func BenchmarkXORBytes(b *testing.B) {
+       dst := make([]byte, 1<<15)
+       data0 := make([]byte, 1<<15)
+       data1 := make([]byte, 1<<15)
+       sizes := []int64{1 << 3, 1 << 7, 1 << 11, 1 << 15}
+       for _, size := range sizes {
+               b.Run(fmt.Sprintf("%dBytes", size), func(b *testing.B) {
+                       s0 := data0[:size]
+                       s1 := data1[:size]
+                       b.SetBytes(int64(size))
+                       for i := 0; i < b.N; i++ {
+                               cipher.XorBytes(dst, s0, s1)
+                       }
+               })
+       }
+}