From 6ac8ccf4b3b7ffe946b99e5031b88edc611e32ec Mon Sep 17 00:00:00 2001 From: Ilya Tocar Date: Wed, 8 Feb 2017 15:31:54 -0600 Subject: [PATCH] crypto/sha512: Add AVX2 version for AMD64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit name old time/op new time/op delta Hash8Bytes-6 913ns ± 0% 667ns ± 0% -26.91% (p=0.000 n=10+10) Hash1K-6 6.58µs ± 0% 4.23µs ± 0% -35.69% (p=0.000 n=10+9) Hash8K-6 45.9µs ± 0% 28.1µs ± 0% -38.93% (p=0.000 n=10+10) name old speed new speed delta Hash8Bytes-6 8.76MB/s ± 0% 11.99MB/s ± 0% +36.87% (p=0.000 n=10+8) Hash1K-6 156MB/s ± 0% 242MB/s ± 0% +55.49% (p=0.000 n=10+9) Hash8K-6 178MB/s ± 0% 292MB/s ± 0% +63.74% (p=0.000 n=10+10) Change-Id: Ic9211d68b02935b2195995f264ec57d6bc36f713 Reviewed-on: https://go-review.googlesource.com/36630 Run-TryBot: Ilya Tocar TryBot-Result: Gobot Gobot Reviewed-by: Russ Cox --- src/crypto/sha512/sha512block_amd64.go | 26 + src/crypto/sha512/sha512block_amd64.s | 1215 +++++++++++++++++++++++- src/crypto/sha512/sha512block_decl.go | 2 +- 3 files changed, 1241 insertions(+), 2 deletions(-) create mode 100644 src/crypto/sha512/sha512block_amd64.go diff --git a/src/crypto/sha512/sha512block_amd64.go b/src/crypto/sha512/sha512block_amd64.go new file mode 100644 index 0000000000..526d85d433 --- /dev/null +++ b/src/crypto/sha512/sha512block_amd64.go @@ -0,0 +1,26 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64 + +package sha512 + +//go:noescape +func blockAVX2(dig *digest, p []byte) + +//go:noescape +func blockAMD64(dig *digest, p []byte) + +//go:noescape +func checkAVX2() bool + +var hasAVX2 = checkAVX2() + +func block(dig *digest, p []byte) { + if hasAVX2 { + blockAVX2(dig, p) + } else { + blockAMD64(dig, p) + } +} diff --git a/src/crypto/sha512/sha512block_amd64.s b/src/crypto/sha512/sha512block_amd64.s index 87502cdfc6..9c9fd98cd8 100644 --- a/src/crypto/sha512/sha512block_amd64.s +++ b/src/crypto/sha512/sha512block_amd64.s @@ -141,7 +141,7 @@ MSGSCHEDULE1(index); \ SHA512ROUND(index, const, a, b, c, d, e, f, g, h) -TEXT ·block(SB),0,$648-32 +TEXT ·blockAMD64(SB),0,$648-32 MOVQ p_base+8(FP), SI MOVQ p_len+16(FP), DX SHRQ $7, DX @@ -271,3 +271,1216 @@ loop: end: RET + +// Version bellow is based on "Fast SHA512 Implementations on Intel +// Architecture Processors" White-paper +// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf +// AVX2 version by Intel, same algorithm in Linux kernel: +// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S + +// James Guilford +// Kirk Yap +// Tim Chen +// David Cote +// Aleksey Sidorov + +#define YFER_SIZE (4*8) +#define SRND_SIZE (1*8) +#define INP_SIZE (1*8) + +#define frame_YFER (0) +#define frame_SRND (frame_YFER + YFER_SIZE) +#define frame_INP (frame_SRND + SRND_SIZE) +#define frame_INPEND (frame_INP + INP_SIZE) + +#define addm(p1, p2) \ + ADDQ p1, p2; \ + MOVQ p2, p1 + +#define COPY_YMM_AND_BSWAP(p1, p2, p3) \ + VMOVDQU p2, p1; \ + VPSHUFB p3, p1, p1 + +#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \ + VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \ + VPALIGNR $RVAL, YSRC2, YDST, YDST + +DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607 +DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f +DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617 +DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, 
$0x18191a1b1c1d1e1f + +GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32 + +DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000 +DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000 +DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF +DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF + +GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32 + +TEXT ·blockAVX2(SB), NOSPLIT, $56-32 + MOVQ dig+0(FP), SI + MOVQ p_base+8(FP), DI + MOVQ p_len+16(FP), DX + + SHRQ $7, DX + SHLQ $7, DX + + JZ done_hash + ADDQ DI, DX + MOVQ DX, frame_INPEND(SP) + + MOVQ (0*8)(SI), AX + MOVQ (1*8)(SI), BX + MOVQ (2*8)(SI), CX + MOVQ (3*8)(SI), R8 + MOVQ (4*8)(SI), DX + MOVQ (5*8)(SI), R9 + MOVQ (6*8)(SI), R10 + MOVQ (7*8)(SI), R11 + + MOVQ $PSHUFFLE_BYTE_FLIP_MASK<>(SB), R12 + VMOVDQU (R12), Y9 + +loop0: + MOVQ ·_K+0(SB), BP + + // byte swap first 16 dwords + COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9) + COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9) + COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9) + COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9) + + MOVQ DI, frame_INP(SP) + + // schedule 64 input dwords, by doing 12 rounds of 4 each + MOVQ $4, frame_SRND(SP) + +loop1: + VPADDQ (BP), Y4, Y0 + VMOVDQU Y0, frame_YFER(SP) + + MY_VPALIGNR(Y0, Y7, Y6, 8) + + VPADDQ Y4, Y0, Y0 + + MY_VPALIGNR(Y1, Y5, Y4, 8) + + VPSRLQ $1, Y1, Y2 + VPSLLQ $(64-1), Y1, Y3 + VPOR Y2, Y3, Y3 + + VPSRLQ $7, Y1, Y8 + + MOVQ AX, DI + RORXQ $41, DX, R13 + RORXQ $18, DX, R14 + ADDQ frame_YFER(SP), R11 + ORQ CX, DI + MOVQ R9, R15 + RORXQ $34, AX, R12 + + XORQ R14, R13 + XORQ R10, R15 + RORXQ $14, DX, R14 + + ANDQ DX, R15 + XORQ R14, R13 + RORXQ $39, AX, R14 + ADDQ R11, R8 + + ANDQ BX, DI + XORQ R12, R14 + RORXQ $28, AX, R12 + + XORQ R10, R15 + XORQ R12, R14 + MOVQ AX, R12 + ANDQ CX, R12 + + ADDQ R13, R15 + ORQ R12, DI + ADDQ R14, R11 + + ADDQ R15, R8 + + ADDQ R15, R11 + ADDQ DI, R11 + + VPSRLQ $8, Y1, Y2 + VPSLLQ $(64-8), Y1, Y1 + VPOR Y2, Y1, Y1 + + VPXOR Y8, Y3, Y3 + VPXOR Y1, Y3, Y1 + + VPADDQ Y1, Y0, Y0 + + VPERM2F128 $0x0, Y0, Y0, Y4 + + MOVQ $MASK_YMM_LO<>(SB), R13 + + VPAND (R13), Y0, Y0 + + VPERM2F128 $0x11, Y7, Y7, Y2 + VPSRLQ $6, Y2, Y8 + + MOVQ R11, DI + RORXQ $41, R8, R13 + RORXQ $18, R8, R14 + ADDQ 1*8+frame_YFER(SP), R10 + ORQ BX, DI + + MOVQ DX, R15 + RORXQ $34, R11, R12 + XORQ R14, R13 + XORQ R9, R15 + + RORXQ $14, R8, R14 + XORQ R14, R13 + RORXQ $39, R11, R14 + ANDQ R8, R15 + ADDQ R10, CX + + ANDQ AX, DI + XORQ R12, R14 + + RORXQ $28, R11, R12 + XORQ R9, R15 + + XORQ R12, R14 + MOVQ R11, R12 + ANDQ BX, R12 + ADDQ R13, R15 + + ORQ R12, DI + ADDQ R14, R10 + + ADDQ R15, CX + ADDQ R15, R10 + ADDQ DI, R10 + + VPSRLQ $19, Y2, Y3 + VPSLLQ $(64-19), Y2, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSRLQ $61, Y2, Y3 + VPSLLQ $(64-61), Y2, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + + VPADDQ Y8, Y4, Y4 + + VPSRLQ $6, Y4, Y8 + + MOVQ R10, DI + RORXQ $41, CX, R13 + ADDQ 2*8+frame_YFER(SP), R9 + + RORXQ $18, CX, R14 + ORQ AX, DI + MOVQ R8, R15 + XORQ DX, R15 + + RORXQ $34, R10, R12 + XORQ R14, R13 + ANDQ CX, R15 + + RORXQ $14, CX, R14 + ADDQ R9, BX + ANDQ R11, DI + + XORQ R14, R13 + RORXQ $39, R10, R14 + XORQ DX, R15 + + XORQ R12, R14 + RORXQ $28, R10, R12 + + XORQ R12, R14 + MOVQ R10, R12 + ANDQ AX, R12 + ADDQ R13, R15 + + ORQ R12, DI + ADDQ R14, R9 + ADDQ R15, BX + ADDQ R15, R9 + + ADDQ DI, R9 + + VPSRLQ $19, Y4, Y3 + VPSLLQ $(64-19), Y4, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSRLQ $61, Y4, Y3 + VPSLLQ $(64-61), Y4, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + + VPADDQ Y8, Y0, Y2 + + VPBLENDD $0xF0, Y2, Y4, Y4 + + MOVQ R9, DI + RORXQ $41, BX, R13 + RORXQ $18, BX, R14 + ADDQ 
3*8+frame_YFER(SP), DX + ORQ R11, DI + + MOVQ CX, R15 + RORXQ $34, R9, R12 + XORQ R14, R13 + XORQ R8, R15 + + RORXQ $14, BX, R14 + ANDQ BX, R15 + ADDQ DX, AX + ANDQ R10, DI + + XORQ R14, R13 + XORQ R8, R15 + + RORXQ $39, R9, R14 + ADDQ R13, R15 + + XORQ R12, R14 + ADDQ R15, AX + + RORXQ $28, R9, R12 + + XORQ R12, R14 + MOVQ R9, R12 + ANDQ R11, R12 + ORQ R12, DI + + ADDQ R14, DX + ADDQ R15, DX + ADDQ DI, DX + + VPADDQ 1*32(BP), Y5, Y0 + VMOVDQU Y0, frame_YFER(SP) + + MY_VPALIGNR(Y0, Y4, Y7, 8) + + VPADDQ Y5, Y0, Y0 + + MY_VPALIGNR(Y1, Y6, Y5, 8) + + VPSRLQ $1, Y1, Y2 + VPSLLQ $(64-1), Y1, Y3 + VPOR Y2, Y3, Y3 + + VPSRLQ $7, Y1, Y8 + + MOVQ DX, DI + RORXQ $41, AX, R13 + RORXQ $18, AX, R14 + ADDQ frame_YFER(SP), R8 + ORQ R10, DI + MOVQ BX, R15 + RORXQ $34, DX, R12 + + XORQ R14, R13 + XORQ CX, R15 + RORXQ $14, AX, R14 + + ANDQ AX, R15 + XORQ R14, R13 + RORXQ $39, DX, R14 + ADDQ R8, R11 + + ANDQ R9, DI + XORQ R12, R14 + RORXQ $28, DX, R12 + + XORQ CX, R15 + XORQ R12, R14 + MOVQ DX, R12 + ANDQ R10, R12 + + ADDQ R13, R15 + ORQ R12, DI + ADDQ R14, R8 + + ADDQ R15, R11 + + ADDQ R15, R8 + ADDQ DI, R8 + + VPSRLQ $8, Y1, Y2 + VPSLLQ $(64-8), Y1, Y1 + VPOR Y2, Y1, Y1 + + VPXOR Y8, Y3, Y3 + VPXOR Y1, Y3, Y1 + + VPADDQ Y1, Y0, Y0 + + VPERM2F128 $0x0, Y0, Y0, Y5 + + MOVQ $MASK_YMM_LO<>(SB), R13 + VPAND (R13), Y0, Y0 + + VPERM2F128 $0x11, Y4, Y4, Y2 + VPSRLQ $6, Y2, Y8 + + MOVQ R8, DI + RORXQ $41, R11, R13 + RORXQ $18, R11, R14 + ADDQ 1*8+frame_YFER(SP), CX + ORQ R9, DI + + MOVQ AX, R15 + RORXQ $34, R8, R12 + XORQ R14, R13 + XORQ BX, R15 + + RORXQ $14, R11, R14 + XORQ R14, R13 + RORXQ $39, R8, R14 + ANDQ R11, R15 + ADDQ CX, R10 + + ANDQ DX, DI + XORQ R12, R14 + + RORXQ $28, R8, R12 + XORQ BX, R15 + + XORQ R12, R14 + MOVQ R8, R12 + ANDQ R9, R12 + ADDQ R13, R15 + + ORQ R12, DI + ADDQ R14, CX + + ADDQ R15, R10 + ADDQ R15, CX + ADDQ DI, CX + + VPSRLQ $19, Y2, Y3 + VPSLLQ $(64-19), Y2, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSRLQ $61, Y2, Y3 + VPSLLQ $(64-61), Y2, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + + VPADDQ Y8, Y5, Y5 + + VPSRLQ $6, Y5, Y8 + + MOVQ CX, DI + RORXQ $41, R10, R13 + ADDQ 2*8+frame_YFER(SP), BX + + RORXQ $18, R10, R14 + ORQ DX, DI + MOVQ R11, R15 + XORQ AX, R15 + + RORXQ $34, CX, R12 + XORQ R14, R13 + ANDQ R10, R15 + + RORXQ $14, R10, R14 + ADDQ BX, R9 + ANDQ R8, DI + + XORQ R14, R13 + RORXQ $39, CX, R14 + XORQ AX, R15 + + XORQ R12, R14 + RORXQ $28, CX, R12 + + XORQ R12, R14 + MOVQ CX, R12 + ANDQ DX, R12 + ADDQ R13, R15 + + ORQ R12, DI + ADDQ R14, BX + ADDQ R15, R9 + ADDQ R15, BX + + ADDQ DI, BX + + VPSRLQ $19, Y5, Y3 + VPSLLQ $(64-19), Y5, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSRLQ $61, Y5, Y3 + VPSLLQ $(64-61), Y5, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + + VPADDQ Y8, Y0, Y2 + + VPBLENDD $0xF0, Y2, Y5, Y5 + + MOVQ BX, DI + RORXQ $41, R9, R13 + RORXQ $18, R9, R14 + ADDQ 3*8+frame_YFER(SP), AX + ORQ R8, DI + + MOVQ R10, R15 + RORXQ $34, BX, R12 + XORQ R14, R13 + XORQ R11, R15 + + RORXQ $14, R9, R14 + ANDQ R9, R15 + ADDQ AX, DX + ANDQ CX, DI + + XORQ R14, R13 + XORQ R11, R15 + + RORXQ $39, BX, R14 + ADDQ R13, R15 + + XORQ R12, R14 + ADDQ R15, DX + + RORXQ $28, BX, R12 + + XORQ R12, R14 + MOVQ BX, R12 + ANDQ R8, R12 + ORQ R12, DI + + ADDQ R14, AX + ADDQ R15, AX + ADDQ DI, AX + + VPADDQ 2*32(BP), Y6, Y0 + VMOVDQU Y0, frame_YFER(SP) + + MY_VPALIGNR(Y0, Y5, Y4, 8) + + VPADDQ Y6, Y0, Y0 + + MY_VPALIGNR(Y1, Y7, Y6, 8) + + VPSRLQ $1, Y1, Y2 + VPSLLQ $(64-1), Y1, Y3 + VPOR Y2, Y3, Y3 + + VPSRLQ $7, Y1, Y8 + + MOVQ AX, DI + RORXQ $41, DX, R13 + RORXQ $18, DX, R14 + ADDQ frame_YFER(SP), R11 + 
ORQ CX, DI + MOVQ R9, R15 + RORXQ $34, AX, R12 + + XORQ R14, R13 + XORQ R10, R15 + RORXQ $14, DX, R14 + + ANDQ DX, R15 + XORQ R14, R13 + RORXQ $39, AX, R14 + ADDQ R11, R8 + + ANDQ BX, DI + XORQ R12, R14 + RORXQ $28, AX, R12 + + XORQ R10, R15 + XORQ R12, R14 + MOVQ AX, R12 + ANDQ CX, R12 + + ADDQ R13, R15 + ORQ R12, DI + ADDQ R14, R11 + + ADDQ R15, R8 + + ADDQ R15, R11 + ADDQ DI, R11 + + VPSRLQ $8, Y1, Y2 + VPSLLQ $(64-8), Y1, Y1 + VPOR Y2, Y1, Y1 + + VPXOR Y8, Y3, Y3 + VPXOR Y1, Y3, Y1 + + VPADDQ Y1, Y0, Y0 + + VPERM2F128 $0x0, Y0, Y0, Y6 + + MOVQ $MASK_YMM_LO<>(SB), R13 + VPAND (R13), Y0, Y0 + + VPERM2F128 $0x11, Y5, Y5, Y2 + VPSRLQ $6, Y2, Y8 + + MOVQ R11, DI + RORXQ $41, R8, R13 + RORXQ $18, R8, R14 + ADDQ 1*8+frame_YFER(SP), R10 + ORQ BX, DI + + MOVQ DX, R15 + RORXQ $34, R11, R12 + XORQ R14, R13 + XORQ R9, R15 + + RORXQ $14, R8, R14 + XORQ R14, R13 + RORXQ $39, R11, R14 + ANDQ R8, R15 + ADDQ R10, CX + + ANDQ AX, DI + XORQ R12, R14 + + RORXQ $28, R11, R12 + XORQ R9, R15 + + XORQ R12, R14 + MOVQ R11, R12 + ANDQ BX, R12 + ADDQ R13, R15 + + ORQ R12, DI + ADDQ R14, R10 + + ADDQ R15, CX + ADDQ R15, R10 + ADDQ DI, R10 + + VPSRLQ $19, Y2, Y3 + VPSLLQ $(64-19), Y2, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSRLQ $61, Y2, Y3 + VPSLLQ $(64-61), Y2, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + + VPADDQ Y8, Y6, Y6 + + VPSRLQ $6, Y6, Y8 + + MOVQ R10, DI + RORXQ $41, CX, R13 + ADDQ 2*8+frame_YFER(SP), R9 + + RORXQ $18, CX, R14 + ORQ AX, DI + MOVQ R8, R15 + XORQ DX, R15 + + RORXQ $34, R10, R12 + XORQ R14, R13 + ANDQ CX, R15 + + RORXQ $14, CX, R14 + ADDQ R9, BX + ANDQ R11, DI + + XORQ R14, R13 + RORXQ $39, R10, R14 + XORQ DX, R15 + + XORQ R12, R14 + RORXQ $28, R10, R12 + + XORQ R12, R14 + MOVQ R10, R12 + ANDQ AX, R12 + ADDQ R13, R15 + + ORQ R12, DI + ADDQ R14, R9 + ADDQ R15, BX + ADDQ R15, R9 + + ADDQ DI, R9 + + VPSRLQ $19, Y6, Y3 + VPSLLQ $(64-19), Y6, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSRLQ $61, Y6, Y3 + VPSLLQ $(64-61), Y6, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + + VPADDQ Y8, Y0, Y2 + + VPBLENDD $0xF0, Y2, Y6, Y6 + + MOVQ R9, DI + RORXQ $41, BX, R13 + RORXQ $18, BX, R14 + ADDQ 3*8+frame_YFER(SP), DX + ORQ R11, DI + + MOVQ CX, R15 + RORXQ $34, R9, R12 + XORQ R14, R13 + XORQ R8, R15 + + RORXQ $14, BX, R14 + ANDQ BX, R15 + ADDQ DX, AX + ANDQ R10, DI + + XORQ R14, R13 + XORQ R8, R15 + + RORXQ $39, R9, R14 + ADDQ R13, R15 + + XORQ R12, R14 + ADDQ R15, AX + + RORXQ $28, R9, R12 + + XORQ R12, R14 + MOVQ R9, R12 + ANDQ R11, R12 + ORQ R12, DI + + ADDQ R14, DX + ADDQ R15, DX + ADDQ DI, DX + + VPADDQ 3*32(BP), Y7, Y0 + VMOVDQU Y0, frame_YFER(SP) + ADDQ $(4*32), BP + + MY_VPALIGNR(Y0, Y6, Y5, 8) + + VPADDQ Y7, Y0, Y0 + + MY_VPALIGNR(Y1, Y4, Y7, 8) + + VPSRLQ $1, Y1, Y2 + VPSLLQ $(64-1), Y1, Y3 + VPOR Y2, Y3, Y3 + + VPSRLQ $7, Y1, Y8 + + MOVQ DX, DI + RORXQ $41, AX, R13 + RORXQ $18, AX, R14 + ADDQ frame_YFER(SP), R8 + ORQ R10, DI + MOVQ BX, R15 + RORXQ $34, DX, R12 + + XORQ R14, R13 + XORQ CX, R15 + RORXQ $14, AX, R14 + + ANDQ AX, R15 + XORQ R14, R13 + RORXQ $39, DX, R14 + ADDQ R8, R11 + + ANDQ R9, DI + XORQ R12, R14 + RORXQ $28, DX, R12 + + XORQ CX, R15 + XORQ R12, R14 + MOVQ DX, R12 + ANDQ R10, R12 + + ADDQ R13, R15 + ORQ R12, DI + ADDQ R14, R8 + + ADDQ R15, R11 + + ADDQ R15, R8 + ADDQ DI, R8 + + VPSRLQ $8, Y1, Y2 + VPSLLQ $(64-8), Y1, Y1 + VPOR Y2, Y1, Y1 + + VPXOR Y8, Y3, Y3 + VPXOR Y1, Y3, Y1 + + VPADDQ Y1, Y0, Y0 + + VPERM2F128 $0x0, Y0, Y0, Y7 + + MOVQ $MASK_YMM_LO<>(SB), R13 + VPAND (R13), Y0, Y0 + + VPERM2F128 $0x11, Y6, Y6, Y2 + VPSRLQ $6, Y2, Y8 + + MOVQ R8, DI + RORXQ $41, R11, R13 + RORXQ 
$18, R11, R14 + ADDQ 1*8+frame_YFER(SP), CX + ORQ R9, DI + + MOVQ AX, R15 + RORXQ $34, R8, R12 + XORQ R14, R13 + XORQ BX, R15 + + RORXQ $14, R11, R14 + XORQ R14, R13 + RORXQ $39, R8, R14 + ANDQ R11, R15 + ADDQ CX, R10 + + ANDQ DX, DI + XORQ R12, R14 + + RORXQ $28, R8, R12 + XORQ BX, R15 + + XORQ R12, R14 + MOVQ R8, R12 + ANDQ R9, R12 + ADDQ R13, R15 + + ORQ R12, DI + ADDQ R14, CX + + ADDQ R15, R10 + ADDQ R15, CX + ADDQ DI, CX + + VPSRLQ $19, Y2, Y3 + VPSLLQ $(64-19), Y2, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSRLQ $61, Y2, Y3 + VPSLLQ $(64-61), Y2, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + + VPADDQ Y8, Y7, Y7 + + VPSRLQ $6, Y7, Y8 + + MOVQ CX, DI + RORXQ $41, R10, R13 + ADDQ 2*8+frame_YFER(SP), BX + + RORXQ $18, R10, R14 + ORQ DX, DI + MOVQ R11, R15 + XORQ AX, R15 + + RORXQ $34, CX, R12 + XORQ R14, R13 + ANDQ R10, R15 + + RORXQ $14, R10, R14 + ADDQ BX, R9 + ANDQ R8, DI + + XORQ R14, R13 + RORXQ $39, CX, R14 + XORQ AX, R15 + + XORQ R12, R14 + RORXQ $28, CX, R12 + + XORQ R12, R14 + MOVQ CX, R12 + ANDQ DX, R12 + ADDQ R13, R15 + + ORQ R12, DI + ADDQ R14, BX + ADDQ R15, R9 + ADDQ R15, BX + + ADDQ DI, BX + + VPSRLQ $19, Y7, Y3 + VPSLLQ $(64-19), Y7, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSRLQ $61, Y7, Y3 + VPSLLQ $(64-61), Y7, Y1 + VPOR Y1, Y3, Y3 + VPXOR Y3, Y8, Y8 + + VPADDQ Y8, Y0, Y2 + + VPBLENDD $0xF0, Y2, Y7, Y7 + + MOVQ BX, DI + RORXQ $41, R9, R13 + RORXQ $18, R9, R14 + ADDQ 3*8+frame_YFER(SP), AX + ORQ R8, DI + + MOVQ R10, R15 + RORXQ $34, BX, R12 + XORQ R14, R13 + XORQ R11, R15 + + RORXQ $14, R9, R14 + ANDQ R9, R15 + ADDQ AX, DX + ANDQ CX, DI + + XORQ R14, R13 + XORQ R11, R15 + + RORXQ $39, BX, R14 + ADDQ R13, R15 + + XORQ R12, R14 + ADDQ R15, DX + + RORXQ $28, BX, R12 + + XORQ R12, R14 + MOVQ BX, R12 + ANDQ R8, R12 + ORQ R12, DI + + ADDQ R14, AX + ADDQ R15, AX + ADDQ DI, AX + + SUBQ $1, frame_SRND(SP) + JNE loop1 + + MOVQ $2, frame_SRND(SP) + +loop2: + VPADDQ (BP), Y4, Y0 + VMOVDQU Y0, frame_YFER(SP) + + MOVQ R9, R15 + RORXQ $41, DX, R13 + RORXQ $18, DX, R14 + XORQ R10, R15 + + XORQ R14, R13 + RORXQ $14, DX, R14 + ANDQ DX, R15 + + XORQ R14, R13 + RORXQ $34, AX, R12 + XORQ R10, R15 + RORXQ $39, AX, R14 + MOVQ AX, DI + + XORQ R12, R14 + RORXQ $28, AX, R12 + ADDQ frame_YFER(SP), R11 + ORQ CX, DI + + XORQ R12, R14 + MOVQ AX, R12 + ANDQ BX, DI + ANDQ CX, R12 + ADDQ R13, R15 + + ADDQ R11, R8 + ORQ R12, DI + ADDQ R14, R11 + + ADDQ R15, R8 + + ADDQ R15, R11 + MOVQ DX, R15 + RORXQ $41, R8, R13 + RORXQ $18, R8, R14 + XORQ R9, R15 + + XORQ R14, R13 + RORXQ $14, R8, R14 + ANDQ R8, R15 + ADDQ DI, R11 + + XORQ R14, R13 + RORXQ $34, R11, R12 + XORQ R9, R15 + RORXQ $39, R11, R14 + MOVQ R11, DI + + XORQ R12, R14 + RORXQ $28, R11, R12 + ADDQ 8*1+frame_YFER(SP), R10 + ORQ BX, DI + + XORQ R12, R14 + MOVQ R11, R12 + ANDQ AX, DI + ANDQ BX, R12 + ADDQ R13, R15 + + ADDQ R10, CX + ORQ R12, DI + ADDQ R14, R10 + + ADDQ R15, CX + + ADDQ R15, R10 + MOVQ R8, R15 + RORXQ $41, CX, R13 + RORXQ $18, CX, R14 + XORQ DX, R15 + + XORQ R14, R13 + RORXQ $14, CX, R14 + ANDQ CX, R15 + ADDQ DI, R10 + + XORQ R14, R13 + RORXQ $34, R10, R12 + XORQ DX, R15 + RORXQ $39, R10, R14 + MOVQ R10, DI + + XORQ R12, R14 + RORXQ $28, R10, R12 + ADDQ 8*2+frame_YFER(SP), R9 + ORQ AX, DI + + XORQ R12, R14 + MOVQ R10, R12 + ANDQ R11, DI + ANDQ AX, R12 + ADDQ R13, R15 + + ADDQ R9, BX + ORQ R12, DI + ADDQ R14, R9 + + ADDQ R15, BX + + ADDQ R15, R9 + MOVQ CX, R15 + RORXQ $41, BX, R13 + RORXQ $18, BX, R14 + XORQ R8, R15 + + XORQ R14, R13 + RORXQ $14, BX, R14 + ANDQ BX, R15 + ADDQ DI, R9 + + XORQ R14, R13 + RORXQ $34, R9, R12 + XORQ R8, R15 + 
RORXQ $39, R9, R14 + MOVQ R9, DI + + XORQ R12, R14 + RORXQ $28, R9, R12 + ADDQ 8*3+frame_YFER(SP), DX + ORQ R11, DI + + XORQ R12, R14 + MOVQ R9, R12 + ANDQ R10, DI + ANDQ R11, R12 + ADDQ R13, R15 + + ADDQ DX, AX + ORQ R12, DI + ADDQ R14, DX + + ADDQ R15, AX + + ADDQ R15, DX + + ADDQ DI, DX + + VPADDQ 1*32(BP), Y5, Y0 + VMOVDQU Y0, frame_YFER(SP) + ADDQ $(2*32), BP + + MOVQ BX, R15 + RORXQ $41, AX, R13 + RORXQ $18, AX, R14 + XORQ CX, R15 + + XORQ R14, R13 + RORXQ $14, AX, R14 + ANDQ AX, R15 + + XORQ R14, R13 + RORXQ $34, DX, R12 + XORQ CX, R15 + RORXQ $39, DX, R14 + MOVQ DX, DI + + XORQ R12, R14 + RORXQ $28, DX, R12 + ADDQ frame_YFER(SP), R8 + ORQ R10, DI + + XORQ R12, R14 + MOVQ DX, R12 + ANDQ R9, DI + ANDQ R10, R12 + ADDQ R13, R15 + + ADDQ R8, R11 + ORQ R12, DI + ADDQ R14, R8 + + ADDQ R15, R11 + + ADDQ R15, R8 + MOVQ AX, R15 + RORXQ $41, R11, R13 + RORXQ $18, R11, R14 + XORQ BX, R15 + + XORQ R14, R13 + RORXQ $14, R11, R14 + ANDQ R11, R15 + ADDQ DI, R8 + + XORQ R14, R13 + RORXQ $34, R8, R12 + XORQ BX, R15 + RORXQ $39, R8, R14 + MOVQ R8, DI + + XORQ R12, R14 + RORXQ $28, R8, R12 + ADDQ 8*1+frame_YFER(SP), CX + ORQ R9, DI + + XORQ R12, R14 + MOVQ R8, R12 + ANDQ DX, DI + ANDQ R9, R12 + ADDQ R13, R15 + + ADDQ CX, R10 + ORQ R12, DI + ADDQ R14, CX + + ADDQ R15, R10 + + ADDQ R15, CX + MOVQ R11, R15 + RORXQ $41, R10, R13 + RORXQ $18, R10, R14 + XORQ AX, R15 + + XORQ R14, R13 + RORXQ $14, R10, R14 + ANDQ R10, R15 + ADDQ DI, CX + + XORQ R14, R13 + RORXQ $34, CX, R12 + XORQ AX, R15 + RORXQ $39, CX, R14 + MOVQ CX, DI + + XORQ R12, R14 + RORXQ $28, CX, R12 + ADDQ 8*2+frame_YFER(SP), BX + ORQ DX, DI + + XORQ R12, R14 + MOVQ CX, R12 + ANDQ R8, DI + ANDQ DX, R12 + ADDQ R13, R15 + + ADDQ BX, R9 + ORQ R12, DI + ADDQ R14, BX + + ADDQ R15, R9 + + ADDQ R15, BX + MOVQ R10, R15 + RORXQ $41, R9, R13 + RORXQ $18, R9, R14 + XORQ R11, R15 + + XORQ R14, R13 + RORXQ $14, R9, R14 + ANDQ R9, R15 + ADDQ DI, BX + + XORQ R14, R13 + RORXQ $34, BX, R12 + XORQ R11, R15 + RORXQ $39, BX, R14 + MOVQ BX, DI + + XORQ R12, R14 + RORXQ $28, BX, R12 + ADDQ 8*3+frame_YFER(SP), AX + ORQ R8, DI + + XORQ R12, R14 + MOVQ BX, R12 + ANDQ CX, DI + ANDQ R8, R12 + ADDQ R13, R15 + + ADDQ AX, DX + ORQ R12, DI + ADDQ R14, AX + + ADDQ R15, DX + + ADDQ R15, AX + + ADDQ DI, AX + + VMOVDQU Y6, Y4 + VMOVDQU Y7, Y5 + + SUBQ $1, frame_SRND(SP) + JNE loop2 + + addm(8*0(SI),AX) + addm(8*1(SI),BX) + addm(8*2(SI),CX) + addm(8*3(SI),R8) + addm(8*4(SI),DX) + addm(8*5(SI),R9) + addm(8*6(SI),R10) + addm(8*7(SI),R11) + + MOVQ frame_INP(SP), DI + ADDQ $128, DI + CMPQ DI, frame_INPEND(SP) + JNE loop0 + +done_hash: + VZEROUPPER + RET + +// func checkAVX2() bool +// returns whether AVX2 is supported +TEXT ·checkAVX2(SB), NOSPLIT, $0 + MOVB runtime·support_avx2(SB), AX + CMPB AX,$0 + JNE check_bmi2 + MOVB AX, ret+0(FP) +check_bmi2: + MOVB runtime·support_bmi2(SB), AX + MOVB AX, ret+0(FP) + RET diff --git a/src/crypto/sha512/sha512block_decl.go b/src/crypto/sha512/sha512block_decl.go index 8194506bf6..613d1e02a3 100644 --- a/src/crypto/sha512/sha512block_decl.go +++ b/src/crypto/sha512/sha512block_decl.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build amd64 s390x ppc64le +// +build s390x ppc64le package sha512 -- 2.50.0
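
For readers following the assembly, the interleaved scalar and vector code above implements the standard SHA-512 round and message-schedule functions from FIPS 180-4: the RORXQ immediates 14/18/41 and 28/34/39 are the Σ1(e) and Σ0(a) rotations, and the VPSRLQ/VPSLLQ/VPOR sequences with counts 1/8/7 and 19/61/6 are σ0 and σ1. The sketch below is a minimal pure-Go restatement of those primitives for reference only; it is not part of the patch, and the names (sha512sketch, bigSigma0, smallSigma0, roundStep, and so on) are illustrative rather than identifiers used by the package.

package sha512sketch

import "math/bits"

// Σ0 and Σ1 operate on the working variables a and e; the rotation
// amounts match the RORXQ immediates used in the scalar rounds above.
func bigSigma0(a uint64) uint64 {
	return bits.RotateLeft64(a, -28) ^ bits.RotateLeft64(a, -34) ^ bits.RotateLeft64(a, -39)
}

func bigSigma1(e uint64) uint64 {
	return bits.RotateLeft64(e, -14) ^ bits.RotateLeft64(e, -18) ^ bits.RotateLeft64(e, -41)
}

// σ0 and σ1 feed the message schedule; the vector code builds the rotations
// from VPSRLQ/VPSLLQ/VPOR pairs and the plain shifts from VPSRLQ alone.
func smallSigma0(x uint64) uint64 {
	return bits.RotateLeft64(x, -1) ^ bits.RotateLeft64(x, -8) ^ (x >> 7)
}

func smallSigma1(x uint64) uint64 {
	return bits.RotateLeft64(x, -19) ^ bits.RotateLeft64(x, -61) ^ (x >> 6)
}

func ch(e, f, g uint64) uint64  { return (e & f) ^ (^e & g) }
func maj(a, b, c uint64) uint64 { return (a & b) ^ (a & c) ^ (b & c) }

// messageSchedule expands the 16 big-endian words of one 128-byte block into
// the 80 words consumed by the rounds; the assembly keeps only a sliding
// window of this schedule in Y4-Y7 and recomputes it four words at a time.
func messageSchedule(w *[80]uint64) {
	for t := 16; t < 80; t++ {
		w[t] = smallSigma1(w[t-2]) + w[t-7] + smallSigma0(w[t-15]) + w[t-16]
	}
}

// roundStep is one of the 80 SHA-512 rounds. kw is K[t]+W[t], which the
// assembly precomputes with VPADDQ against the ·_K table and spills to
// frame_YFER(SP). It returns the new values of a..h in order.
func roundStep(a, b, c, d, e, f, g, h, kw uint64) (uint64, uint64, uint64, uint64, uint64, uint64, uint64, uint64) {
	t1 := h + bigSigma1(e) + ch(e, f, g) + kw
	t2 := bigSigma0(a) + maj(a, b, c)
	return t1 + t2, a, b, c, d + t1, e, f, g
}

The scalar rounds compute the majority function through the equivalent form ((a|c)&b)|(a&c) (the MOVQ/ORQ/ANDQ sequence on DI and R12), which produces the same result as the XOR formulation above.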
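The checkAVX2 helper is meant to report the AVX2 path as usable only when the runtime also reports BMI2, since the scalar rounds rely on the BMI2 RORXQ instruction; on CPUs without both features block falls back to blockAMD64. Outside the standard library the same gate is commonly expressed with golang.org/x/sys/cpu. The sketch below is only an illustration of that equivalent check under the assumption that x/sys/cpu is available; it is not code from this patch.

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

// useAVX2 mirrors the condition the assembly helper is checking: the AVX2
// code path also executes RORXQ, so BMI2 must be present as well.
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2

func main() {
	fmt.Println("AVX2+BMI2 SHA-512 path available:", useAVX2)
}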