From fcd1ec0c822b31b0849e98941b54a12a3431bedd Mon Sep 17 00:00:00 2001 From: Lynn Boger Date: Thu, 25 Oct 2018 11:13:16 -0400 Subject: [PATCH] crypto/md5: fix md5block asm to work on big endian ppc64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This handles a TODO in the md5block_ppc64le.s file to make use of byte reverse loads so the function works for big endian as well as little endian. File name is now md5block_ppc64x.s. name old time/op new time/op delta Hash8Bytes 537ns ± 0% 299ns ± 0% -44.32% (p=0.000 n=8+1) Hash1K 4.74µs ± 0% 3.24µs ± 0% -31.64% (p=0.250 n=7+1) Hash8K 34.4µs ± 0% 23.6µs ± 0% -31.56% (p=0.222 n=8+1) Hash8BytesUnaligned 537ns ± 0% 298ns ± 0% -44.51% (p=0.000 n=8+1) Hash1KUnaligned 4.74µs ± 0% 3.24µs ± 0% -31.48% (p=0.222 n=8+1) Hash8KUnaligned 34.4µs ± 0% 23.6µs ± 0% -31.39% (p=0.222 n=8+1) name old speed new speed delta Hash8Bytes 14.9MB/s ± 0% 26.8MB/s ± 0% +79.76% (p=0.222 n=8+1) Hash1K 216MB/s ± 0% 316MB/s ± 0% +46.29% (p=0.250 n=7+1) Hash8K 238MB/s ± 0% 348MB/s ± 0% +46.11% (p=0.222 n=8+1) Hash8BytesUnaligned 14.9MB/s ± 0% 26.8MB/s ± 0% +79.76% (p=0.222 n=8+1) Hash1KUnaligned 216MB/s ± 0% 316MB/s ± 0% +45.95% (p=0.222 n=8+1) Hash8KUnaligned 238MB/s ± 0% 347MB/s ± 0% +45.75% (p=0.222 n=8+1) Change-Id: I2e226bf7e69e0acd49db1af42e4fd8b87b155606 Reviewed-on: https://go-review.googlesource.com/c/144599 Run-TryBot: Lynn Boger TryBot-Result: Gobot Gobot Reviewed-by: Carlos Eduardo Seo Reviewed-by: Michael Munday --- src/crypto/md5/md5block_decl.go | 2 +- src/crypto/md5/md5block_generic.go | 2 +- .../{md5block_ppc64le.s => md5block_ppc64x.s} | 37 ++++++++++++------- 3 files changed, 25 insertions(+), 16 deletions(-) rename src/crypto/md5/{md5block_ppc64le.s => md5block_ppc64x.s} (87%) diff --git a/src/crypto/md5/md5block_decl.go b/src/crypto/md5/md5block_decl.go index 40bca49a0e..1ac82cf08c 100644 --- a/src/crypto/md5/md5block_decl.go +++ b/src/crypto/md5/md5block_decl.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build amd64 amd64p32 386 arm ppc64le s390x arm64 +// +build amd64 amd64p32 386 arm ppc64le ppc64 s390x arm64 package md5 diff --git a/src/crypto/md5/md5block_generic.go b/src/crypto/md5/md5block_generic.go index c744cf72e7..86e3b64e9f 100644 --- a/src/crypto/md5/md5block_generic.go +++ b/src/crypto/md5/md5block_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!amd64p32,!386,!arm,!ppc64le,!s390x,!arm64 +// +build !amd64,!amd64p32,!386,!arm,!ppc64le,!ppc64,!s390x,!arm64 package md5 diff --git a/src/crypto/md5/md5block_ppc64le.s b/src/crypto/md5/md5block_ppc64x.s similarity index 87% rename from src/crypto/md5/md5block_ppc64le.s rename to src/crypto/md5/md5block_ppc64x.s index 3b95da5e8a..f309a1413d 100644 --- a/src/crypto/md5/md5block_ppc64le.s +++ b/src/crypto/md5/md5block_ppc64x.s @@ -10,14 +10,23 @@ // Licence: I hereby disclaim the copyright on this code and place it // in the public domain. -#include "textflag.h" +// +build ppc64 ppc64le -// TODO: Could be updated for ppc64 big endian -// by using the correct byte reverse instruction. -// Changes required in the Go assembler to make -// that instruction work. +#include "textflag.h" -#define MOVE_LITTLE_ENDIAN MOVWZ +// ENDIAN_MOVE generates the appropriate +// 4 byte load for big or little endian. +// The 4 bytes at ptr+off is loaded into dst. +// The idx reg is only needed for big endian +// and is clobbered when used. +#ifdef GOARCH_ppc64le +#define ENDIAN_MOVE(off, ptr, dst, idx) \ + MOVWZ off(ptr),dst +#else +#define ENDIAN_MOVE(off, ptr, dst, idx) \ + MOVD $off,idx; \ + MOVWBR (idx)(ptr), dst +#endif TEXT ·block(SB),NOSPLIT,$0-32 MOVD dig+0(FP), R10 @@ -40,7 +49,7 @@ loop: MOVWZ R4, R16 MOVWZ R5, R17 - MOVE_LITTLE_ENDIAN 0(R6), R8 + ENDIAN_MOVE(0,R6,R8,R21) MOVWZ R5, R9 #define ROUND1(a, b, c, d, index, const, shift) \ @@ -49,7 +58,7 @@ loop: ADD R8, a; \ AND b, R9; \ XOR d, R9; \ - MOVE_LITTLE_ENDIAN (index*4)(R6), R8; \ + ENDIAN_MOVE(index*4,R6,R8,R21); \ ADD R9, a; \ RLWMI $shift, a, $0xffffffff, a; \ MOVWZ c, R9; \ @@ -73,7 +82,7 @@ loop: ROUND1(R4,R5,R22,R3,15,0xa679438e,17); ROUND1(R3,R4,R5,R22, 0,0x49b40821,22); - MOVE_LITTLE_ENDIAN (1*4)(R6), R8 + ENDIAN_MOVE(1*4,R6,R8,R21) MOVWZ R5, R9 MOVWZ R5, R10 @@ -83,7 +92,7 @@ loop: ADD R8, a; \ AND b, R10; \ AND c, R9; \ - MOVE_LITTLE_ENDIAN (index*4)(R6), R8; \ + ENDIAN_MOVE(index*4,R6,R8,R21); \ OR R9, R10; \ MOVWZ c, R9; \ ADD R10, a; \ @@ -109,13 +118,13 @@ loop: ROUND2(R4,R5,R22,R3,12,0x676f02d9,14); ROUND2(R3,R4,R5,R22, 0,0x8d2a4c8a,20); - MOVE_LITTLE_ENDIAN (5*4)(R6), R8 + ENDIAN_MOVE(5*4,R6,R8,R21) MOVWZ R4, R9 #define ROUND3(a, b, c, d, index, const, shift) \ ADD $const, a; \ ADD R8, a; \ - MOVE_LITTLE_ENDIAN (index*4)(R6), R8; \ + ENDIAN_MOVE(index*4,R6,R8,R21); \ XOR d, R9; \ XOR b, R9; \ ADD R9, a; \ @@ -141,7 +150,7 @@ loop: ROUND3(R4,R5,R22,R3, 2,0x1fa27cf8,16); ROUND3(R3,R4,R5,R22, 0,0xc4ac5665,23); - MOVE_LITTLE_ENDIAN (0*4)(R6), R8 + ENDIAN_MOVE(0,R6,R8,R21) MOVWZ $0xffffffff, R9 XOR R5, R9 @@ -151,7 +160,7 @@ loop: OR b, R9; \ XOR c, R9; \ ADD R9, a; \ - MOVE_LITTLE_ENDIAN (index*4)(R6), R8; \ + ENDIAN_MOVE(index*4,R6,R8,R21); \ MOVWZ $0xffffffff, R9; \ RLWMI $shift, a, $0xffffffff, a; \ XOR c, R9; \ -- 2.50.0