MSGSCHEDULE1(index); \
SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
-TEXT ·block(SB),0,$648-32
+TEXT ·blockAMD64(SB),0,$648-32
MOVQ p_base+8(FP), SI
MOVQ p_len+16(FP), DX
SHRQ $7, DX
end:
RET
+
+// The version below is based on the "Fast SHA512 Implementations on Intel
+// Architecture Processors" white paper:
+// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
+// The AVX2 version is by Intel; the same algorithm is used in the Linux kernel:
+// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S
+
+// James Guilford <james.guilford@intel.com>
+// Kirk Yap <kirk.s.yap@intel.com>
+// Tim Chen <tim.c.chen@linux.intel.com>
+// David Cote <david.m.cote@intel.com>
+// Aleksey Sidorov <aleksey.sidorov@intel.com>
+
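+// Stack frame layout: frame_YFER spills the four W[t]+K[t] values for the
+// current group of rounds, frame_SRND is the remaining round-group counter,
+// frame_INP is the current input pointer and frame_INPEND marks the end of
+// the whole-block input.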
+#define YFER_SIZE (4*8)
+#define SRND_SIZE (1*8)
+#define INP_SIZE (1*8)
+
+#define frame_YFER (0)
+#define frame_SRND (frame_YFER + YFER_SIZE)
+#define frame_INP (frame_SRND + SRND_SIZE)
+#define frame_INPEND (frame_INP + INP_SIZE)
+
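+// addm computes p2 += p1 and stores the sum back to the memory operand p1;
+// it is used to fold the working state back into the digest.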
+#define addm(p1, p2) \
+ ADDQ p1, p2; \
+ MOVQ p2, p1
+
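+// COPY_YMM_AND_BSWAP loads 32 bytes from p2 into p1 and byte-swaps each
+// 64-bit lane using the shuffle mask p3.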
+#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
+ VMOVDQU p2, p1; \
+ VPSHUFB p3, p1, p1
+
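+// MY_VPALIGNR emulates a 256-bit VPALIGNR: YDST = {YSRC1, YSRC2} >> RVAL*8,
+// built from a cross-lane VPERM2F128 followed by a per-lane VPALIGNR.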
+#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
+ VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
+ VPALIGNR $RVAL, YSRC2, YDST, YDST
+
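+// PSHUFFLE_BYTE_FLIP_MASK reverses the byte order within each 64-bit lane,
+// turning the big-endian message words into native integers after a VMOVDQU load.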
+DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
+DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
+DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
+DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f
+
+GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32
+
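+// MASK_YMM_LO zeroes the lower 128-bit lane and keeps the upper one when
+// ANDed with a YMM register.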
+DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
+DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
+DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
+DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
+
+GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
+
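+// blockAVX2 hashes all whole 128-byte blocks of p into the eight 64-bit
+// state words at dig, using AVX2 and BMI2 (RORXQ) instructions.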
+TEXT ·blockAVX2(SB), NOSPLIT, $56-32
+ MOVQ dig+0(FP), SI
+ MOVQ p_base+8(FP), DI
+ MOVQ p_len+16(FP), DX
+
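+ // round the length down to a whole number of 128-byte blocks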
+ SHRQ $7, DX
+ SHLQ $7, DX
+
+ JZ done_hash
+ ADDQ DI, DX
+ MOVQ DX, frame_INPEND(SP)
+
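+ // load the current digest state: a..h in AX, BX, CX, R8, DX, R9, R10, R11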
+ MOVQ (0*8)(SI), AX
+ MOVQ (1*8)(SI), BX
+ MOVQ (2*8)(SI), CX
+ MOVQ (3*8)(SI), R8
+ MOVQ (4*8)(SI), DX
+ MOVQ (5*8)(SI), R9
+ MOVQ (6*8)(SI), R10
+ MOVQ (7*8)(SI), R11
+
+ MOVQ $PSHUFFLE_BYTE_FLIP_MASK<>(SB), R12
+ VMOVDQU (R12), Y9
+
+loop0:
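+ // One 128-byte block per iteration. BP points at the round constants:
+ // ·_K is a []uint64 in this package, so the first word at its symbol is
+ // the slice's data pointer.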
+ MOVQ ·_K+0(SB), BP
+
+ // byte swap first 16 qwords
+ COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
+ COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
+ COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
+ COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)
+
+ MOVQ DI, frame_INP(SP)
+
+ // do the first 64 rounds in 4 iterations of loop1 (16 rounds each),
+ // scheduling the remaining 64 message qwords as we go
+ MOVQ $4, frame_SRND(SP)
+
+loop1:
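+ // Y0 = W[t..t+3] + K[t..t+3]; spill to frame_YFER for the four rounds
+ // interleaved below, before the schedule overwrites Y4.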
+ VPADDQ (BP), Y4, Y0
+ VMOVDQU Y0, frame_YFER(SP)
+
+ MY_VPALIGNR(Y0, Y7, Y6, 8)
+
+ VPADDQ Y4, Y0, Y0
+
+ MY_VPALIGNR(Y1, Y5, Y4, 8)
+
+ VPSRLQ $1, Y1, Y2
+ VPSLLQ $(64-1), Y1, Y3
+ VPOR Y2, Y3, Y3
+
+ VPSRLQ $7, Y1, Y8
+
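+ // One SHA-512 round (working variables are renamed rather than rotated):
+ // h accumulates T1 = h + W[t] + K[t] + Sigma1(e) + Ch(e,f,g), d gets T1
+ // added, and h then adds T2 = Sigma0(a) + Maj(a,b,c). Sigma0/Sigma1 use
+ // RORXQ; Ch is ((f^g)&e)^g and Maj is ((a|c)&b)|(a&c).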
+ MOVQ AX, DI
+ RORXQ $41, DX, R13
+ RORXQ $18, DX, R14
+ ADDQ frame_YFER(SP), R11
+ ORQ CX, DI
+ MOVQ R9, R15
+ RORXQ $34, AX, R12
+
+ XORQ R14, R13
+ XORQ R10, R15
+ RORXQ $14, DX, R14
+
+ ANDQ DX, R15
+ XORQ R14, R13
+ RORXQ $39, AX, R14
+ ADDQ R11, R8
+
+ ANDQ BX, DI
+ XORQ R12, R14
+ RORXQ $28, AX, R12
+
+ XORQ R10, R15
+ XORQ R12, R14
+ MOVQ AX, R12
+ ANDQ CX, R12
+
+ ADDQ R13, R15
+ ORQ R12, DI
+ ADDQ R14, R11
+
+ ADDQ R15, R8
+
+ ADDQ R15, R11
+ ADDQ DI, R11
+
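+ // finish sigma0 = ROTR1 ^ ROTR8 ^ SHR7 (rotates built from shift/OR pairs,
+ // since AVX2 has no 64-bit vector rotate) and add it into the schedule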
+ VPSRLQ $8, Y1, Y2
+ VPSLLQ $(64-8), Y1, Y1
+ VPOR Y2, Y1, Y1
+
+ VPXOR Y8, Y3, Y3
+ VPXOR Y1, Y3, Y1
+
+ VPADDQ Y1, Y0, Y0
+
+ VPERM2F128 $0x0, Y0, Y0, Y4
+
+ MOVQ $MASK_YMM_LO<>(SB), R13
+
+ VPAND (R13), Y0, Y0
+
+ VPERM2F128 $0x11, Y7, Y7, Y2
+ VPSRLQ $6, Y2, Y8
+
+ MOVQ R11, DI
+ RORXQ $41, R8, R13
+ RORXQ $18, R8, R14
+ ADDQ 1*8+frame_YFER(SP), R10
+ ORQ BX, DI
+
+ MOVQ DX, R15
+ RORXQ $34, R11, R12
+ XORQ R14, R13
+ XORQ R9, R15
+
+ RORXQ $14, R8, R14
+ XORQ R14, R13
+ RORXQ $39, R11, R14
+ ANDQ R8, R15
+ ADDQ R10, CX
+
+ ANDQ AX, DI
+ XORQ R12, R14
+
+ RORXQ $28, R11, R12
+ XORQ R9, R15
+
+ XORQ R12, R14
+ MOVQ R11, R12
+ ANDQ BX, R12
+ ADDQ R13, R15
+
+ ORQ R12, DI
+ ADDQ R14, R10
+
+ ADDQ R15, CX
+ ADDQ R15, R10
+ ADDQ DI, R10
+
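+ // sigma1 = ROTR19 ^ ROTR61 ^ SHR6, likewise built from shifts and ORs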
+ VPSRLQ $19, Y2, Y3
+ VPSLLQ $(64-19), Y2, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSRLQ $61, Y2, Y3
+ VPSLLQ $(64-61), Y2, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+
+ VPADDQ Y8, Y4, Y4
+
+ VPSRLQ $6, Y4, Y8
+
+ MOVQ R10, DI
+ RORXQ $41, CX, R13
+ ADDQ 2*8+frame_YFER(SP), R9
+
+ RORXQ $18, CX, R14
+ ORQ AX, DI
+ MOVQ R8, R15
+ XORQ DX, R15
+
+ RORXQ $34, R10, R12
+ XORQ R14, R13
+ ANDQ CX, R15
+
+ RORXQ $14, CX, R14
+ ADDQ R9, BX
+ ANDQ R11, DI
+
+ XORQ R14, R13
+ RORXQ $39, R10, R14
+ XORQ DX, R15
+
+ XORQ R12, R14
+ RORXQ $28, R10, R12
+
+ XORQ R12, R14
+ MOVQ R10, R12
+ ANDQ AX, R12
+ ADDQ R13, R15
+
+ ORQ R12, DI
+ ADDQ R14, R9
+ ADDQ R15, BX
+ ADDQ R15, R9
+
+ ADDQ DI, R9
+
+ VPSRLQ $19, Y4, Y3
+ VPSLLQ $(64-19), Y4, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSRLQ $61, Y4, Y3
+ VPSLLQ $(64-61), Y4, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+
+ VPADDQ Y8, Y0, Y2
+
+ VPBLENDD $0xF0, Y2, Y4, Y4
+
+ MOVQ R9, DI
+ RORXQ $41, BX, R13
+ RORXQ $18, BX, R14
+ ADDQ 3*8+frame_YFER(SP), DX
+ ORQ R11, DI
+
+ MOVQ CX, R15
+ RORXQ $34, R9, R12
+ XORQ R14, R13
+ XORQ R8, R15
+
+ RORXQ $14, BX, R14
+ ANDQ BX, R15
+ ADDQ DX, AX
+ ANDQ R10, DI
+
+ XORQ R14, R13
+ XORQ R8, R15
+
+ RORXQ $39, R9, R14
+ ADDQ R13, R15
+
+ XORQ R12, R14
+ ADDQ R15, AX
+
+ RORXQ $28, R9, R12
+
+ XORQ R12, R14
+ MOVQ R9, R12
+ ANDQ R11, R12
+ ORQ R12, DI
+
+ ADDQ R14, DX
+ ADDQ R15, DX
+ ADDQ DI, DX
+
+ VPADDQ 1*32(BP), Y5, Y0
+ VMOVDQU Y0, frame_YFER(SP)
+
+ MY_VPALIGNR(Y0, Y4, Y7, 8)
+
+ VPADDQ Y5, Y0, Y0
+
+ MY_VPALIGNR(Y1, Y6, Y5, 8)
+
+ VPSRLQ $1, Y1, Y2
+ VPSLLQ $(64-1), Y1, Y3
+ VPOR Y2, Y3, Y3
+
+ VPSRLQ $7, Y1, Y8
+
+ MOVQ DX, DI
+ RORXQ $41, AX, R13
+ RORXQ $18, AX, R14
+ ADDQ frame_YFER(SP), R8
+ ORQ R10, DI
+ MOVQ BX, R15
+ RORXQ $34, DX, R12
+
+ XORQ R14, R13
+ XORQ CX, R15
+ RORXQ $14, AX, R14
+
+ ANDQ AX, R15
+ XORQ R14, R13
+ RORXQ $39, DX, R14
+ ADDQ R8, R11
+
+ ANDQ R9, DI
+ XORQ R12, R14
+ RORXQ $28, DX, R12
+
+ XORQ CX, R15
+ XORQ R12, R14
+ MOVQ DX, R12
+ ANDQ R10, R12
+
+ ADDQ R13, R15
+ ORQ R12, DI
+ ADDQ R14, R8
+
+ ADDQ R15, R11
+
+ ADDQ R15, R8
+ ADDQ DI, R8
+
+ VPSRLQ $8, Y1, Y2
+ VPSLLQ $(64-8), Y1, Y1
+ VPOR Y2, Y1, Y1
+
+ VPXOR Y8, Y3, Y3
+ VPXOR Y1, Y3, Y1
+
+ VPADDQ Y1, Y0, Y0
+
+ VPERM2F128 $0x0, Y0, Y0, Y5
+
+ MOVQ $MASK_YMM_LO<>(SB), R13
+ VPAND (R13), Y0, Y0
+
+ VPERM2F128 $0x11, Y4, Y4, Y2
+ VPSRLQ $6, Y2, Y8
+
+ MOVQ R8, DI
+ RORXQ $41, R11, R13
+ RORXQ $18, R11, R14
+ ADDQ 1*8+frame_YFER(SP), CX
+ ORQ R9, DI
+
+ MOVQ AX, R15
+ RORXQ $34, R8, R12
+ XORQ R14, R13
+ XORQ BX, R15
+
+ RORXQ $14, R11, R14
+ XORQ R14, R13
+ RORXQ $39, R8, R14
+ ANDQ R11, R15
+ ADDQ CX, R10
+
+ ANDQ DX, DI
+ XORQ R12, R14
+
+ RORXQ $28, R8, R12
+ XORQ BX, R15
+
+ XORQ R12, R14
+ MOVQ R8, R12
+ ANDQ R9, R12
+ ADDQ R13, R15
+
+ ORQ R12, DI
+ ADDQ R14, CX
+
+ ADDQ R15, R10
+ ADDQ R15, CX
+ ADDQ DI, CX
+
+ VPSRLQ $19, Y2, Y3
+ VPSLLQ $(64-19), Y2, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSRLQ $61, Y2, Y3
+ VPSLLQ $(64-61), Y2, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+
+ VPADDQ Y8, Y5, Y5
+
+ VPSRLQ $6, Y5, Y8
+
+ MOVQ CX, DI
+ RORXQ $41, R10, R13
+ ADDQ 2*8+frame_YFER(SP), BX
+
+ RORXQ $18, R10, R14
+ ORQ DX, DI
+ MOVQ R11, R15
+ XORQ AX, R15
+
+ RORXQ $34, CX, R12
+ XORQ R14, R13
+ ANDQ R10, R15
+
+ RORXQ $14, R10, R14
+ ADDQ BX, R9
+ ANDQ R8, DI
+
+ XORQ R14, R13
+ RORXQ $39, CX, R14
+ XORQ AX, R15
+
+ XORQ R12, R14
+ RORXQ $28, CX, R12
+
+ XORQ R12, R14
+ MOVQ CX, R12
+ ANDQ DX, R12
+ ADDQ R13, R15
+
+ ORQ R12, DI
+ ADDQ R14, BX
+ ADDQ R15, R9
+ ADDQ R15, BX
+
+ ADDQ DI, BX
+
+ VPSRLQ $19, Y5, Y3
+ VPSLLQ $(64-19), Y5, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSRLQ $61, Y5, Y3
+ VPSLLQ $(64-61), Y5, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+
+ VPADDQ Y8, Y0, Y2
+
+ VPBLENDD $0xF0, Y2, Y5, Y5
+
+ MOVQ BX, DI
+ RORXQ $41, R9, R13
+ RORXQ $18, R9, R14
+ ADDQ 3*8+frame_YFER(SP), AX
+ ORQ R8, DI
+
+ MOVQ R10, R15
+ RORXQ $34, BX, R12
+ XORQ R14, R13
+ XORQ R11, R15
+
+ RORXQ $14, R9, R14
+ ANDQ R9, R15
+ ADDQ AX, DX
+ ANDQ CX, DI
+
+ XORQ R14, R13
+ XORQ R11, R15
+
+ RORXQ $39, BX, R14
+ ADDQ R13, R15
+
+ XORQ R12, R14
+ ADDQ R15, DX
+
+ RORXQ $28, BX, R12
+
+ XORQ R12, R14
+ MOVQ BX, R12
+ ANDQ R8, R12
+ ORQ R12, DI
+
+ ADDQ R14, AX
+ ADDQ R15, AX
+ ADDQ DI, AX
+
+ VPADDQ 2*32(BP), Y6, Y0
+ VMOVDQU Y0, frame_YFER(SP)
+
+ MY_VPALIGNR(Y0, Y5, Y4, 8)
+
+ VPADDQ Y6, Y0, Y0
+
+ MY_VPALIGNR(Y1, Y7, Y6, 8)
+
+ VPSRLQ $1, Y1, Y2
+ VPSLLQ $(64-1), Y1, Y3
+ VPOR Y2, Y3, Y3
+
+ VPSRLQ $7, Y1, Y8
+
+ MOVQ AX, DI
+ RORXQ $41, DX, R13
+ RORXQ $18, DX, R14
+ ADDQ frame_YFER(SP), R11
+ ORQ CX, DI
+ MOVQ R9, R15
+ RORXQ $34, AX, R12
+
+ XORQ R14, R13
+ XORQ R10, R15
+ RORXQ $14, DX, R14
+
+ ANDQ DX, R15
+ XORQ R14, R13
+ RORXQ $39, AX, R14
+ ADDQ R11, R8
+
+ ANDQ BX, DI
+ XORQ R12, R14
+ RORXQ $28, AX, R12
+
+ XORQ R10, R15
+ XORQ R12, R14
+ MOVQ AX, R12
+ ANDQ CX, R12
+
+ ADDQ R13, R15
+ ORQ R12, DI
+ ADDQ R14, R11
+
+ ADDQ R15, R8
+
+ ADDQ R15, R11
+ ADDQ DI, R11
+
+ VPSRLQ $8, Y1, Y2
+ VPSLLQ $(64-8), Y1, Y1
+ VPOR Y2, Y1, Y1
+
+ VPXOR Y8, Y3, Y3
+ VPXOR Y1, Y3, Y1
+
+ VPADDQ Y1, Y0, Y0
+
+ VPERM2F128 $0x0, Y0, Y0, Y6
+
+ MOVQ $MASK_YMM_LO<>(SB), R13
+ VPAND (R13), Y0, Y0
+
+ VPERM2F128 $0x11, Y5, Y5, Y2
+ VPSRLQ $6, Y2, Y8
+
+ MOVQ R11, DI
+ RORXQ $41, R8, R13
+ RORXQ $18, R8, R14
+ ADDQ 1*8+frame_YFER(SP), R10
+ ORQ BX, DI
+
+ MOVQ DX, R15
+ RORXQ $34, R11, R12
+ XORQ R14, R13
+ XORQ R9, R15
+
+ RORXQ $14, R8, R14
+ XORQ R14, R13
+ RORXQ $39, R11, R14
+ ANDQ R8, R15
+ ADDQ R10, CX
+
+ ANDQ AX, DI
+ XORQ R12, R14
+
+ RORXQ $28, R11, R12
+ XORQ R9, R15
+
+ XORQ R12, R14
+ MOVQ R11, R12
+ ANDQ BX, R12
+ ADDQ R13, R15
+
+ ORQ R12, DI
+ ADDQ R14, R10
+
+ ADDQ R15, CX
+ ADDQ R15, R10
+ ADDQ DI, R10
+
+ VPSRLQ $19, Y2, Y3
+ VPSLLQ $(64-19), Y2, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSRLQ $61, Y2, Y3
+ VPSLLQ $(64-61), Y2, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+
+ VPADDQ Y8, Y6, Y6
+
+ VPSRLQ $6, Y6, Y8
+
+ MOVQ R10, DI
+ RORXQ $41, CX, R13
+ ADDQ 2*8+frame_YFER(SP), R9
+
+ RORXQ $18, CX, R14
+ ORQ AX, DI
+ MOVQ R8, R15
+ XORQ DX, R15
+
+ RORXQ $34, R10, R12
+ XORQ R14, R13
+ ANDQ CX, R15
+
+ RORXQ $14, CX, R14
+ ADDQ R9, BX
+ ANDQ R11, DI
+
+ XORQ R14, R13
+ RORXQ $39, R10, R14
+ XORQ DX, R15
+
+ XORQ R12, R14
+ RORXQ $28, R10, R12
+
+ XORQ R12, R14
+ MOVQ R10, R12
+ ANDQ AX, R12
+ ADDQ R13, R15
+
+ ORQ R12, DI
+ ADDQ R14, R9
+ ADDQ R15, BX
+ ADDQ R15, R9
+
+ ADDQ DI, R9
+
+ VPSRLQ $19, Y6, Y3
+ VPSLLQ $(64-19), Y6, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSRLQ $61, Y6, Y3
+ VPSLLQ $(64-61), Y6, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+
+ VPADDQ Y8, Y0, Y2
+
+ VPBLENDD $0xF0, Y2, Y6, Y6
+
+ MOVQ R9, DI
+ RORXQ $41, BX, R13
+ RORXQ $18, BX, R14
+ ADDQ 3*8+frame_YFER(SP), DX
+ ORQ R11, DI
+
+ MOVQ CX, R15
+ RORXQ $34, R9, R12
+ XORQ R14, R13
+ XORQ R8, R15
+
+ RORXQ $14, BX, R14
+ ANDQ BX, R15
+ ADDQ DX, AX
+ ANDQ R10, DI
+
+ XORQ R14, R13
+ XORQ R8, R15
+
+ RORXQ $39, R9, R14
+ ADDQ R13, R15
+
+ XORQ R12, R14
+ ADDQ R15, AX
+
+ RORXQ $28, R9, R12
+
+ XORQ R12, R14
+ MOVQ R9, R12
+ ANDQ R11, R12
+ ORQ R12, DI
+
+ ADDQ R14, DX
+ ADDQ R15, DX
+ ADDQ DI, DX
+
+ VPADDQ 3*32(BP), Y7, Y0
+ VMOVDQU Y0, frame_YFER(SP)
+ ADDQ $(4*32), BP
+
+ MY_VPALIGNR(Y0, Y6, Y5, 8)
+
+ VPADDQ Y7, Y0, Y0
+
+ MY_VPALIGNR(Y1, Y4, Y7, 8)
+
+ VPSRLQ $1, Y1, Y2
+ VPSLLQ $(64-1), Y1, Y3
+ VPOR Y2, Y3, Y3
+
+ VPSRLQ $7, Y1, Y8
+
+ MOVQ DX, DI
+ RORXQ $41, AX, R13
+ RORXQ $18, AX, R14
+ ADDQ frame_YFER(SP), R8
+ ORQ R10, DI
+ MOVQ BX, R15
+ RORXQ $34, DX, R12
+
+ XORQ R14, R13
+ XORQ CX, R15
+ RORXQ $14, AX, R14
+
+ ANDQ AX, R15
+ XORQ R14, R13
+ RORXQ $39, DX, R14
+ ADDQ R8, R11
+
+ ANDQ R9, DI
+ XORQ R12, R14
+ RORXQ $28, DX, R12
+
+ XORQ CX, R15
+ XORQ R12, R14
+ MOVQ DX, R12
+ ANDQ R10, R12
+
+ ADDQ R13, R15
+ ORQ R12, DI
+ ADDQ R14, R8
+
+ ADDQ R15, R11
+
+ ADDQ R15, R8
+ ADDQ DI, R8
+
+ VPSRLQ $8, Y1, Y2
+ VPSLLQ $(64-8), Y1, Y1
+ VPOR Y2, Y1, Y1
+
+ VPXOR Y8, Y3, Y3
+ VPXOR Y1, Y3, Y1
+
+ VPADDQ Y1, Y0, Y0
+
+ VPERM2F128 $0x0, Y0, Y0, Y7
+
+ MOVQ $MASK_YMM_LO<>(SB), R13
+ VPAND (R13), Y0, Y0
+
+ VPERM2F128 $0x11, Y6, Y6, Y2
+ VPSRLQ $6, Y2, Y8
+
+ MOVQ R8, DI
+ RORXQ $41, R11, R13
+ RORXQ $18, R11, R14
+ ADDQ 1*8+frame_YFER(SP), CX
+ ORQ R9, DI
+
+ MOVQ AX, R15
+ RORXQ $34, R8, R12
+ XORQ R14, R13
+ XORQ BX, R15
+
+ RORXQ $14, R11, R14
+ XORQ R14, R13
+ RORXQ $39, R8, R14
+ ANDQ R11, R15
+ ADDQ CX, R10
+
+ ANDQ DX, DI
+ XORQ R12, R14
+
+ RORXQ $28, R8, R12
+ XORQ BX, R15
+
+ XORQ R12, R14
+ MOVQ R8, R12
+ ANDQ R9, R12
+ ADDQ R13, R15
+
+ ORQ R12, DI
+ ADDQ R14, CX
+
+ ADDQ R15, R10
+ ADDQ R15, CX
+ ADDQ DI, CX
+
+ VPSRLQ $19, Y2, Y3
+ VPSLLQ $(64-19), Y2, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSRLQ $61, Y2, Y3
+ VPSLLQ $(64-61), Y2, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+
+ VPADDQ Y8, Y7, Y7
+
+ VPSRLQ $6, Y7, Y8
+
+ MOVQ CX, DI
+ RORXQ $41, R10, R13
+ ADDQ 2*8+frame_YFER(SP), BX
+
+ RORXQ $18, R10, R14
+ ORQ DX, DI
+ MOVQ R11, R15
+ XORQ AX, R15
+
+ RORXQ $34, CX, R12
+ XORQ R14, R13
+ ANDQ R10, R15
+
+ RORXQ $14, R10, R14
+ ADDQ BX, R9
+ ANDQ R8, DI
+
+ XORQ R14, R13
+ RORXQ $39, CX, R14
+ XORQ AX, R15
+
+ XORQ R12, R14
+ RORXQ $28, CX, R12
+
+ XORQ R12, R14
+ MOVQ CX, R12
+ ANDQ DX, R12
+ ADDQ R13, R15
+
+ ORQ R12, DI
+ ADDQ R14, BX
+ ADDQ R15, R9
+ ADDQ R15, BX
+
+ ADDQ DI, BX
+
+ VPSRLQ $19, Y7, Y3
+ VPSLLQ $(64-19), Y7, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSRLQ $61, Y7, Y3
+ VPSLLQ $(64-61), Y7, Y1
+ VPOR Y1, Y3, Y3
+ VPXOR Y3, Y8, Y8
+
+ VPADDQ Y8, Y0, Y2
+
+ VPBLENDD $0xF0, Y2, Y7, Y7
+
+ MOVQ BX, DI
+ RORXQ $41, R9, R13
+ RORXQ $18, R9, R14
+ ADDQ 3*8+frame_YFER(SP), AX
+ ORQ R8, DI
+
+ MOVQ R10, R15
+ RORXQ $34, BX, R12
+ XORQ R14, R13
+ XORQ R11, R15
+
+ RORXQ $14, R9, R14
+ ANDQ R9, R15
+ ADDQ AX, DX
+ ANDQ CX, DI
+
+ XORQ R14, R13
+ XORQ R11, R15
+
+ RORXQ $39, BX, R14
+ ADDQ R13, R15
+
+ XORQ R12, R14
+ ADDQ R15, DX
+
+ RORXQ $28, BX, R12
+
+ XORQ R12, R14
+ MOVQ BX, R12
+ ANDQ R8, R12
+ ORQ R12, DI
+
+ ADDQ R14, AX
+ ADDQ R15, AX
+ ADDQ DI, AX
+
+ SUBQ $1, frame_SRND(SP)
+ JNE loop1
+
+ MOVQ $2, frame_SRND(SP)
+
+loop2:
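+ // Final 16 rounds (t = 64..79): Y4..Y7 already hold W[64..79], so each of
+ // the two iterations does 8 rounds with no further message scheduling and
+ // then rotates Y6, Y7 down into Y4, Y5.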
+ VPADDQ (BP), Y4, Y0
+ VMOVDQU Y0, frame_YFER(SP)
+
+ MOVQ R9, R15
+ RORXQ $41, DX, R13
+ RORXQ $18, DX, R14
+ XORQ R10, R15
+
+ XORQ R14, R13
+ RORXQ $14, DX, R14
+ ANDQ DX, R15
+
+ XORQ R14, R13
+ RORXQ $34, AX, R12
+ XORQ R10, R15
+ RORXQ $39, AX, R14
+ MOVQ AX, DI
+
+ XORQ R12, R14
+ RORXQ $28, AX, R12
+ ADDQ frame_YFER(SP), R11
+ ORQ CX, DI
+
+ XORQ R12, R14
+ MOVQ AX, R12
+ ANDQ BX, DI
+ ANDQ CX, R12
+ ADDQ R13, R15
+
+ ADDQ R11, R8
+ ORQ R12, DI
+ ADDQ R14, R11
+
+ ADDQ R15, R8
+
+ ADDQ R15, R11
+ MOVQ DX, R15
+ RORXQ $41, R8, R13
+ RORXQ $18, R8, R14
+ XORQ R9, R15
+
+ XORQ R14, R13
+ RORXQ $14, R8, R14
+ ANDQ R8, R15
+ ADDQ DI, R11
+
+ XORQ R14, R13
+ RORXQ $34, R11, R12
+ XORQ R9, R15
+ RORXQ $39, R11, R14
+ MOVQ R11, DI
+
+ XORQ R12, R14
+ RORXQ $28, R11, R12
+ ADDQ 8*1+frame_YFER(SP), R10
+ ORQ BX, DI
+
+ XORQ R12, R14
+ MOVQ R11, R12
+ ANDQ AX, DI
+ ANDQ BX, R12
+ ADDQ R13, R15
+
+ ADDQ R10, CX
+ ORQ R12, DI
+ ADDQ R14, R10
+
+ ADDQ R15, CX
+
+ ADDQ R15, R10
+ MOVQ R8, R15
+ RORXQ $41, CX, R13
+ RORXQ $18, CX, R14
+ XORQ DX, R15
+
+ XORQ R14, R13
+ RORXQ $14, CX, R14
+ ANDQ CX, R15
+ ADDQ DI, R10
+
+ XORQ R14, R13
+ RORXQ $34, R10, R12
+ XORQ DX, R15
+ RORXQ $39, R10, R14
+ MOVQ R10, DI
+
+ XORQ R12, R14
+ RORXQ $28, R10, R12
+ ADDQ 8*2+frame_YFER(SP), R9
+ ORQ AX, DI
+
+ XORQ R12, R14
+ MOVQ R10, R12
+ ANDQ R11, DI
+ ANDQ AX, R12
+ ADDQ R13, R15
+
+ ADDQ R9, BX
+ ORQ R12, DI
+ ADDQ R14, R9
+
+ ADDQ R15, BX
+
+ ADDQ R15, R9
+ MOVQ CX, R15
+ RORXQ $41, BX, R13
+ RORXQ $18, BX, R14
+ XORQ R8, R15
+
+ XORQ R14, R13
+ RORXQ $14, BX, R14
+ ANDQ BX, R15
+ ADDQ DI, R9
+
+ XORQ R14, R13
+ RORXQ $34, R9, R12
+ XORQ R8, R15
+ RORXQ $39, R9, R14
+ MOVQ R9, DI
+
+ XORQ R12, R14
+ RORXQ $28, R9, R12
+ ADDQ 8*3+frame_YFER(SP), DX
+ ORQ R11, DI
+
+ XORQ R12, R14
+ MOVQ R9, R12
+ ANDQ R10, DI
+ ANDQ R11, R12
+ ADDQ R13, R15
+
+ ADDQ DX, AX
+ ORQ R12, DI
+ ADDQ R14, DX
+
+ ADDQ R15, AX
+
+ ADDQ R15, DX
+
+ ADDQ DI, DX
+
+ VPADDQ 1*32(BP), Y5, Y0
+ VMOVDQU Y0, frame_YFER(SP)
+ ADDQ $(2*32), BP
+
+ MOVQ BX, R15
+ RORXQ $41, AX, R13
+ RORXQ $18, AX, R14
+ XORQ CX, R15
+
+ XORQ R14, R13
+ RORXQ $14, AX, R14
+ ANDQ AX, R15
+
+ XORQ R14, R13
+ RORXQ $34, DX, R12
+ XORQ CX, R15
+ RORXQ $39, DX, R14
+ MOVQ DX, DI
+
+ XORQ R12, R14
+ RORXQ $28, DX, R12
+ ADDQ frame_YFER(SP), R8
+ ORQ R10, DI
+
+ XORQ R12, R14
+ MOVQ DX, R12
+ ANDQ R9, DI
+ ANDQ R10, R12
+ ADDQ R13, R15
+
+ ADDQ R8, R11
+ ORQ R12, DI
+ ADDQ R14, R8
+
+ ADDQ R15, R11
+
+ ADDQ R15, R8
+ MOVQ AX, R15
+ RORXQ $41, R11, R13
+ RORXQ $18, R11, R14
+ XORQ BX, R15
+
+ XORQ R14, R13
+ RORXQ $14, R11, R14
+ ANDQ R11, R15
+ ADDQ DI, R8
+
+ XORQ R14, R13
+ RORXQ $34, R8, R12
+ XORQ BX, R15
+ RORXQ $39, R8, R14
+ MOVQ R8, DI
+
+ XORQ R12, R14
+ RORXQ $28, R8, R12
+ ADDQ 8*1+frame_YFER(SP), CX
+ ORQ R9, DI
+
+ XORQ R12, R14
+ MOVQ R8, R12
+ ANDQ DX, DI
+ ANDQ R9, R12
+ ADDQ R13, R15
+
+ ADDQ CX, R10
+ ORQ R12, DI
+ ADDQ R14, CX
+
+ ADDQ R15, R10
+
+ ADDQ R15, CX
+ MOVQ R11, R15
+ RORXQ $41, R10, R13
+ RORXQ $18, R10, R14
+ XORQ AX, R15
+
+ XORQ R14, R13
+ RORXQ $14, R10, R14
+ ANDQ R10, R15
+ ADDQ DI, CX
+
+ XORQ R14, R13
+ RORXQ $34, CX, R12
+ XORQ AX, R15
+ RORXQ $39, CX, R14
+ MOVQ CX, DI
+
+ XORQ R12, R14
+ RORXQ $28, CX, R12
+ ADDQ 8*2+frame_YFER(SP), BX
+ ORQ DX, DI
+
+ XORQ R12, R14
+ MOVQ CX, R12
+ ANDQ R8, DI
+ ANDQ DX, R12
+ ADDQ R13, R15
+
+ ADDQ BX, R9
+ ORQ R12, DI
+ ADDQ R14, BX
+
+ ADDQ R15, R9
+
+ ADDQ R15, BX
+ MOVQ R10, R15
+ RORXQ $41, R9, R13
+ RORXQ $18, R9, R14
+ XORQ R11, R15
+
+ XORQ R14, R13
+ RORXQ $14, R9, R14
+ ANDQ R9, R15
+ ADDQ DI, BX
+
+ XORQ R14, R13
+ RORXQ $34, BX, R12
+ XORQ R11, R15
+ RORXQ $39, BX, R14
+ MOVQ BX, DI
+
+ XORQ R12, R14
+ RORXQ $28, BX, R12
+ ADDQ 8*3+frame_YFER(SP), AX
+ ORQ R8, DI
+
+ XORQ R12, R14
+ MOVQ BX, R12
+ ANDQ CX, DI
+ ANDQ R8, R12
+ ADDQ R13, R15
+
+ ADDQ AX, DX
+ ORQ R12, DI
+ ADDQ R14, AX
+
+ ADDQ R15, DX
+
+ ADDQ R15, AX
+
+ ADDQ DI, AX
+
+ VMOVDQU Y6, Y4
+ VMOVDQU Y7, Y5
+
+ SUBQ $1, frame_SRND(SP)
+ JNE loop2
+
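+ // fold the working state for this block back into the digest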
+ addm(8*0(SI),AX)
+ addm(8*1(SI),BX)
+ addm(8*2(SI),CX)
+ addm(8*3(SI),R8)
+ addm(8*4(SI),DX)
+ addm(8*5(SI),R9)
+ addm(8*6(SI),R10)
+ addm(8*7(SI),R11)
+
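+ // advance to the next 128-byte block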
+ MOVQ frame_INP(SP), DI
+ ADDQ $128, DI
+ CMPQ DI, frame_INPEND(SP)
+ JNE loop0
+
+done_hash:
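+ // clear the upper halves of the YMM registers before returning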
+ VZEROUPPER
+ RET
+
+// func checkAVX2() bool
+// returns whether AVX2 and BMI2 (for RORXQ) are both supported
+TEXT ·checkAVX2(SB), NOSPLIT, $0
+ MOVB runtime·support_avx2(SB), AX
+ CMPB AX, $0
+ JNE check_bmi2
+ MOVB AX, ret+0(FP)
+ RET
+
+check_bmi2:
+ MOVB runtime·support_bmi2(SB), AX
+ MOVB AX, ret+0(FP)
+ RET