From: Carlos Eduardo Seo Date: Wed, 19 Sep 2018 21:37:08 +0000 (-0300) Subject: crypto/cipher: add VSX implementation of xorBytes for ppc64x X-Git-Tag: go1.12beta1~552 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=0ff6e5f1b48dfdff9ebf77796eeca1a5ec097def;p=gostls13.git crypto/cipher: add VSX implementation of xorBytes for ppc64x This change adds asm implementations of xorBytes for ppc64x that takes advantage of VSX registers and instructions. name old time/op new time/op delta XORBytes/8Bytes-8 16.4ns ± 0% 11.1ns ± 0% -32.32% (p=0.000 n=5+4) XORBytes/128Bytes-8 45.6ns ± 0% 16.2ns ± 0% -64.50% (p=0.008 n=5+5) XORBytes/2048Bytes-8 433ns ±13% 129ns ± 1% -70.29% (p=0.000 n=5+4) XORBytes/32768Bytes-8 7.16µs ± 0% 1.83µs ± 0% -74.39% (p=0.008 n=5+5) name old speed new speed delta XORBytes/8Bytes-8 488MB/s ± 0% 721MB/s ± 0% +47.75% (p=0.016 n=5+4) XORBytes/128Bytes-8 2.80GB/s ± 0% 7.89GB/s ± 0% +181.33% (p=0.008 n=5+5) XORBytes/2048Bytes-8 4.77GB/s ±13% 15.87GB/s ± 0% +232.68% (p=0.016 n=5+4) XORBytes/32768Bytes-8 4.58GB/s ± 0% 17.88GB/s ± 0% +290.47% (p=0.008 n=5+5) Change-Id: Ic27d9b858f8ec2d597fdabc68a288d6844eba701 Reviewed-on: https://go-review.googlesource.com/c/145997 Run-TryBot: Carlos Eduardo Seo TryBot-Result: Gobot Gobot Reviewed-by: Lynn Boger --- diff --git a/src/crypto/cipher/xor_generic.go b/src/crypto/cipher/xor_generic.go index 4d660b0a75..b7de60873c 100644 --- a/src/crypto/cipher/xor_generic.go +++ b/src/crypto/cipher/xor_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64 +// +build !amd64,!ppc64,!ppc64le package cipher diff --git a/src/crypto/cipher/xor_ppc64x.go b/src/crypto/cipher/xor_ppc64x.go new file mode 100644 index 0000000000..8d2e43d327 --- /dev/null +++ b/src/crypto/cipher/xor_ppc64x.go @@ -0,0 +1,29 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ppc64 ppc64le + +package cipher + +// xorBytes xors the bytes in a and b. The destination should have enough +// space, otherwise xorBytes will panic. Returns the number of bytes xor'd. +func xorBytes(dst, a, b []byte) int { + n := len(a) + if len(b) < n { + n = len(b) + } + if n == 0 { + return 0 + } + _ = dst[n-1] + xorBytesVSX(&dst[0], &a[0], &b[0], n) + return n +} + +func xorWords(dst, a, b []byte) { + xorBytes(dst, a, b) +} + +//go:noescape +func xorBytesVSX(dst, a, b *byte, n int) diff --git a/src/crypto/cipher/xor_ppc64x.s b/src/crypto/cipher/xor_ppc64x.s new file mode 100644 index 0000000000..af4d08bda3 --- /dev/null +++ b/src/crypto/cipher/xor_ppc64x.s @@ -0,0 +1,66 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ppc64 ppc64le + +#include "textflag.h" + +// func xorBytesVSX(dst, a, b *byte, n int) +TEXT ·xorBytesVSX(SB), NOSPLIT, $0 + MOVD dst+0(FP), R3 // R3 = dst + MOVD a+8(FP), R4 // R4 = a + MOVD b+16(FP), R5 // R5 = b + MOVD n+24(FP), R6 // R6 = n + + CMPU R6, $16, CR7 // Check if n ≥ 16 bytes + MOVD R0, R8 // R8 = index + CMPU R6, $8, CR6 // Check if 8 ≤ n < 16 bytes + BGE CR7, preloop16 + BLT CR6, small + + // Case for 8 ≤ n < 16 bytes + MOVD (R4)(R8), R14 // R14 = a[i,...,i+7] + MOVD (R5)(R8), R15 // R15 = b[i,...,i+7] + XOR R14, R15, R16 // R16 = a[] ^ b[] + SUB $8, R6 // n = n - 8 + MOVD R16, (R3)(R8) // Store to dst + ADD $8, R8 + + // Check if we're finished + CMP R6, R0 + BGT small + JMP done + + // Case for n ≥ 16 bytes +preloop16: + SRD $4, R6, R7 // Setup loop counter + MOVD R7, CTR + ANDCC $15, R6, R9 // Check for tailing bytes for later +loop16: + LXVD2X (R4)(R8), VS32 // VS32 = a[i,...,i+15] + LXVD2X (R5)(R8), VS33 // VS33 = b[i,...,i+15] + XXLXOR VS32, VS33, VS34 // VS34 = a[] ^ b[] + STXVD2X VS34, (R3)(R8) // Store to dst + ADD $16, R8 // Update index + BC 16, 0, loop16 // bdnz loop16 + + BEQ CR0, done + SLD $4, R7 + SUB R7, R6 // R6 = n - (R7 * 16) + + // Case for n < 8 bytes and tailing bytes from the + // previous cases. +small: + MOVD R6, CTR // Setup loop counter + +loop: + MOVBZ (R4)(R8), R14 // R14 = a[i] + MOVBZ (R5)(R8), R15 // R15 = b[i] + XOR R14, R15, R16 // R16 = a[i] ^ b[i] + MOVB R16, (R3)(R8) // Store to dst + ADD $1, R8 + BC 16, 0, loop // bdnz loop + +done: + RET