From: Ilya Tocar Date: Fri, 29 Apr 2016 13:17:14 +0000 (+0300) Subject: crypto/sha1: Add AVX2 version for AMD64 X-Git-Tag: go1.7beta1~301 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=fafadc521ede90f8abed73e8d209e130c456e983;p=gostls13.git crypto/sha1: Add AVX2 version for AMD64 name old time/op new time/op delta Hash8Bytes-48 271ns ± 8% 273ns ± 5% ~ (p=0.313 n=19+19) Hash320Bytes-48 1.04µs ± 7% 0.75µs ± 8% -27.66% (p=0.000 n=20+20) Hash1K-48 2.72µs ± 6% 1.75µs ± 6% -35.79% (p=0.000 n=19+20) Hash8K-48 19.9µs ± 7% 11.6µs ± 6% -41.84% (p=0.000 n=20+19) name old speed new speed delta Hash8Bytes-48 29.5MB/s ± 8% 29.3MB/s ± 5% ~ (p=0.314 n=19+19) Hash320Bytes-48 307MB/s ± 7% 424MB/s ± 8% +38.29% (p=0.000 n=20+20) Hash1K-48 377MB/s ± 6% 587MB/s ± 6% +55.76% (p=0.000 n=19+20) Hash8K-48 413MB/s ± 7% 709MB/s ± 6% +71.85% (p=0.000 n=20+19) Change-Id: I2963cf744eeb2e8191d4e4223fbf6f533a7fd405 Reviewed-on: https://go-review.googlesource.com/22607 Run-TryBot: Ilya Tocar Reviewed-by: Russ Cox Run-TryBot: Russ Cox TryBot-Result: Gobot Gobot --- diff --git a/src/crypto/sha1/sha1_test.go b/src/crypto/sha1/sha1_test.go index 9202e682a8..daab2aeaef 100644 --- a/src/crypto/sha1/sha1_test.go +++ b/src/crypto/sha1/sha1_test.go @@ -19,6 +19,7 @@ type sha1Test struct { } var golden = []sha1Test{ + {"76245dbf96f661bd221046197ab8b9f063f11bad", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"}, {"da39a3ee5e6b4b0d3255bfef95601890afd80709", ""}, {"86f7e437faa5a7fce15d1ddcb9eaeaea377667b8", "a"}, {"da23614e02469a0d7c7bd1bdab5c9c474b1904dc", "ab"}, @@ -120,6 +121,10 @@ func BenchmarkHash8Bytes(b *testing.B) { benchmarkSize(b, 8) } +func BenchmarkHash320Bytes(b *testing.B) { + benchmarkSize(b, 320) +} + func BenchmarkHash1K(b *testing.B) { benchmarkSize(b, 1024) } diff --git a/src/crypto/sha1/sha1block_amd64.go b/src/crypto/sha1/sha1block_amd64.go new file mode 100644 index 0000000000..84f8a52019 --- /dev/null +++ b/src/crypto/sha1/sha1block_amd64.go @@ -0,0 +1,23 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sha1 + +//go:noescape + +func blockAVX2(dig *digest, p []byte) + +//go:noescape +func blockAMD64(dig *digest, p []byte) +func checkAVX2() bool + +var hasAVX2 = checkAVX2() + +func block(dig *digest, p []byte) { + if hasAVX2 && len(p) >= 256 { + blockAVX2(dig, p) + } else { + blockAMD64(dig, p) + } +} diff --git a/src/crypto/sha1/sha1block_amd64.s b/src/crypto/sha1/sha1block_amd64.s index a504e14751..0cdb43b422 100644 --- a/src/crypto/sha1/sha1block_amd64.s +++ b/src/crypto/sha1/sha1block_amd64.s @@ -2,6 +2,15 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +// AVX2 version by Intel, same algorithm as code in Linux kernel: +// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S +// Authors: +// Ilya Albrekht +// Maxim Locktyukhin +// Ronen Zohar +// Chandramouli Narayanan + + #include "textflag.h" // SHA1 block routine. See sha1block.go for Go equivalent. @@ -87,7 +96,7 @@ FUNC4(a, b, c, d, e); \ MIX(a, b, c, d, e, 0xCA62C1D6) -TEXT ·block(SB),NOSPLIT,$64-32 +TEXT ·blockAMD64(SB),NOSPLIT,$64-32 MOVQ dig+0(FP), BP MOVQ p_base+8(FP), SI MOVQ p_len+16(FP), DX @@ -214,3 +223,1293 @@ end: MOVL DX, (3*4)(DI) MOVL BP, (4*4)(DI) RET + + +// This is the implementation using AVX2. It is based on: +// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions" +// From http://software.intel.com/en-us/articles +// (look for improving-the-performance-of-the-secure-hash-algorithm-1) +// This implementation is 2x unrolled, and interleaves vector instructions, +// used to precompute W, with scalar computation of current round +// for optimal scheduling. + +// Trivial helper macros. +#define UPDATE_HASH(A,TB,C,D,E) \ + ADDL (R9), A \ + MOVL A, (R9) \ + ADDL 4(R9), TB \ + MOVL TB, 4(R9) \ + ADDL 8(R9), C \ + MOVL C, 8(R9) \ + ADDL 12(R9), D \ + MOVL D, 12(R9) \ + ADDL 16(R9), E \ + MOVL E, 16(R9) + + + +// Helper macros for PRECALC, which does precomputations +#define PRECALC_0(OFFSET) \ + VMOVDQU OFFSET(R10),X0 + +#define PRECALC_1(OFFSET) \ + VINSERTI128 $1, OFFSET(R13), Y0, Y0 + +#define PRECALC_2(YREG) \ + VPSHUFB Y10, Y0, YREG + +#define PRECALC_4(YREG,K_OFFSET) \ + VPADDD K_OFFSET(R8), YREG, Y0 + +#define PRECALC_7(OFFSET) \ + VMOVDQU Y0, (OFFSET*2)(R14) + + +// Message scheduling pre-compute for rounds 0-15 +// R13 is a pointer to even 64-byte block +// R10 is a pointer to odd 64-byte block +// R14 is a pointer to temp buffer +// X0 is used as temp register +// YREG is clobbered as part of computation +// OFFSET chooses 16 byte chunk within a block +// R8 is a pointer to constants block +// K_OFFSET chooses K constants relevant to this round +// X10 holds swap mask +#define PRECALC_00_15(OFFSET,YREG) \ + PRECALC_0(OFFSET) \ + PRECALC_1(OFFSET) \ + PRECALC_2(YREG) \ + PRECALC_4(YREG,0x0) \ + PRECALC_7(OFFSET) + + +// Helper macros for PRECALC_16_31 +#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \ + VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \ // w[i-14] + VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3] + +#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \ + VPXOR REG_SUB_8, REG, REG \ + VPXOR REG_SUB_16, Y0, Y0 + +#define PRECALC_18(REG) \ + VPXOR Y0, REG, REG \ + VPSLLDQ $12, REG, Y9 + +#define PRECALC_19(REG) \ + VPSLLD $1, REG, Y0 \ + VPSRLD $31, REG, REG + +#define PRECALC_20(REG) \ + VPOR REG, Y0, Y0 \ + VPSLLD $2, Y9, REG + +#define PRECALC_21(REG) \ + VPSRLD $30, Y9, Y9 \ + VPXOR REG, Y0, Y0 + +#define PRECALC_23(REG,K_OFFSET,OFFSET) \ + VPXOR Y9, Y0, REG \ + VPADDD K_OFFSET(R8), REG, Y0 \ + VMOVDQU Y0, (OFFSET)(R14) + +// Message scheduling pre-compute for rounds 16-31 +// calculating last 32 w[i] values in 8 XMM registers +// pre-calculate K+w[i] values and store to mem +// for later load by ALU add instruction. +// "brute force" vectorization for rounds 16-31 only +// due to w[i]->w[i-3] dependency. +// clobbers 5 input ymm registers REG_SUB* +// uses X0 and X9 as temp registers +// As always, R8 is a pointer to constants block +// and R14 is a pointer to temp buffer +#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \ + PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \ + PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \ + PRECALC_18(REG) \ + PRECALC_19(REG) \ + PRECALC_20(REG) \ + PRECALC_21(REG) \ + PRECALC_23(REG,K_OFFSET,OFFSET) + + +// Helper macros for PRECALC_32_79 +#define PRECALC_32(REG_SUB_8,REG_SUB_4) \ + VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0 + +#define PRECALC_33(REG_SUB_28,REG) \ + VPXOR REG_SUB_28, REG, REG + +#define PRECALC_34(REG_SUB_16) \ + VPXOR REG_SUB_16, Y0, Y0 + +#define PRECALC_35(REG) \ + VPXOR Y0, REG, REG + +#define PRECALC_36(REG) \ + VPSLLD $2, REG, Y0 + +#define PRECALC_37(REG) \ + VPSRLD $30, REG, REG \ + VPOR REG, Y0, REG + +#define PRECALC_39(REG,K_OFFSET,OFFSET) \ + VPADDD K_OFFSET(R8), REG, Y0 \ + VMOVDQU Y0, (OFFSET)(R14) + +// Message scheduling pre-compute for rounds 32-79 +// In SHA-1 specification we have: +// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 +// Which is the same as: +// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 +// This allows for more efficient vectorization, +// since w[i]->w[i-3] dependency is broken +#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \ + PRECALC_32(REG_SUB_8,REG_SUB_4) \ + PRECALC_33(REG_SUB_28,REG) \ + PRECALC_34(REG_SUB_16) \ + PRECALC_35(REG) \ + PRECALC_36(REG) \ + PRECALC_37(REG) \ + PRECALC_39(REG,K_OFFSET,OFFSET) + +#define PRECALC \ + PRECALC_00_15(0,Y15) \ + PRECALC_00_15(0x10,Y14) \ + PRECALC_00_15(0x20,Y13) \ + PRECALC_00_15(0x30,Y12) \ + PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \ + PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \ + PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \ + PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \ + PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \ + PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \ + PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \ + PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \ + PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \ + PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \ + PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \ + PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \ + PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \ + PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \ + PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \ + PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260) + +// Macros calculating individual rounds have general forn +// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST +// CALC_ROUND_{PRE,POST} macros follow + +#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \ + ADDL OFFSET(R15),REG_E \ + ANDNL REG_C,REG_A,BP \ + LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round + RORXL $0x1b, REG_A, R12 \ + RORXL $2, REG_A, REG_B // for next round + +// Calculate F for the next round +#define CALC_F1_POST(REG_A,REG_B,REG_E) \ + ANDL REG_B,REG_A \ // b&c + XORL BP, REG_A \ // F1 = (b&c) ^ (~b&d) + LEAL (REG_E)(R12*1), REG_E // E += A >>> 5 + + +// Registers are cycleickly rotated DX -> AX -> DI -> SI -> BX -> CX +#define CALC_0 \ + MOVL SI, BX \ // Precalculating first round + RORXL $2, SI, SI \ + ANDNL AX, BX, BP \ + ANDL DI, BX \ + XORL BP, BX \ + CALC_F1_PRE(0x0,CX,BX,DI,DX) \ + PRECALC_0(0x80) \ + CALC_F1_POST(CX,SI,DX) + +#define CALC_1 \ + CALC_F1_PRE(0x4,DX,CX,SI,AX) \ + PRECALC_1(0x80) \ + CALC_F1_POST(DX,BX,AX) + +#define CALC_2 \ + CALC_F1_PRE(0x8,AX,DX,BX,DI) \ + PRECALC_2(Y15) \ + CALC_F1_POST(AX,CX,DI) + +#define CALC_3 \ + CALC_F1_PRE(0xc,DI,AX,CX,SI) \ + CALC_F1_POST(DI,DX,SI) + +#define CALC_4 \ + CALC_F1_PRE(0x20,SI,DI,DX,BX) \ + PRECALC_4(Y15,0x0) \ + CALC_F1_POST(SI,AX,BX) + +#define CALC_5 \ + CALC_F1_PRE(0x24,BX,SI,AX,CX) \ + CALC_F1_POST(BX,DI,CX) + +#define CALC_6 \ + CALC_F1_PRE(0x28,CX,BX,DI,DX) \ + CALC_F1_POST(CX,SI,DX) + +#define CALC_7 \ + CALC_F1_PRE(0x2c,DX,CX,SI,AX) \ + PRECALC_7(0x0) \ + CALC_F1_POST(DX,BX,AX) + +#define CALC_8 \ + CALC_F1_PRE(0x40,AX,DX,BX,DI) \ + PRECALC_0(0x90) \ + CALC_F1_POST(AX,CX,DI) + +#define CALC_9 \ + CALC_F1_PRE(0x44,DI,AX,CX,SI) \ + PRECALC_1(0x90) \ + CALC_F1_POST(DI,DX,SI) + +#define CALC_10 \ + CALC_F1_PRE(0x48,SI,DI,DX,BX) \ + PRECALC_2(Y14) \ + CALC_F1_POST(SI,AX,BX) + +#define CALC_11 \ + CALC_F1_PRE(0x4c,BX,SI,AX,CX) \ + CALC_F1_POST(BX,DI,CX) + +#define CALC_12 \ + CALC_F1_PRE(0x60,CX,BX,DI,DX) \ + PRECALC_4(Y14,0x0) \ + CALC_F1_POST(CX,SI,DX) + +#define CALC_13 \ + CALC_F1_PRE(0x64,DX,CX,SI,AX) \ + CALC_F1_POST(DX,BX,AX) + +#define CALC_14 \ + CALC_F1_PRE(0x68,AX,DX,BX,DI) \ + CALC_F1_POST(AX,CX,DI) + +#define CALC_15 \ + CALC_F1_PRE(0x6c,DI,AX,CX,SI) \ + PRECALC_7(0x10) \ + CALC_F1_POST(DI,DX,SI) + +#define CALC_16 \ + CALC_F1_PRE(0x80,SI,DI,DX,BX) \ + PRECALC_0(0xa0) \ + CALC_F1_POST(SI,AX,BX) + +#define CALC_17 \ + CALC_F1_PRE(0x84,BX,SI,AX,CX) \ + PRECALC_1(0xa0) \ + CALC_F1_POST(BX,DI,CX) + +#define CALC_18 \ + CALC_F1_PRE(0x88,CX,BX,DI,DX) \ + PRECALC_2(Y13) \ + CALC_F1_POST(CX,SI,DX) + + +#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \ + ADDL OFFSET(R15),REG_E \ + LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round + RORXL $0x1b, REG_A, R12 \ + RORXL $2, REG_A, REG_B // for next round + +#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \ + XORL REG_B, REG_A \ + ADDL R12, REG_E \ + XORL REG_C, REG_A + +#define CALC_19 \ + CALC_F2_PRE(0x8c,DX,CX,AX) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_20 \ + CALC_F2_PRE(0xa0,AX,DX,DI) \ + PRECALC_4(Y13,0x0) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_21 \ + CALC_F2_PRE(0xa4,DI,AX,SI) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_22 \ + CALC_F2_PRE(0xa8,SI,DI,BX) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_23 \ + CALC_F2_PRE(0xac,BX,SI,CX) \ + PRECALC_7(0x20) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_24 \ + CALC_F2_PRE(0xc0,CX,BX,DX) \ + PRECALC_0(0xb0) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_25 \ + CALC_F2_PRE(0xc4,DX,CX,AX) \ + PRECALC_1(0xb0) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_26 \ + CALC_F2_PRE(0xc8,AX,DX,DI) \ + PRECALC_2(Y12) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_27 \ + CALC_F2_PRE(0xcc,DI,AX,SI) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_28 \ + CALC_F2_PRE(0xe0,SI,DI,BX) \ + PRECALC_4(Y12,0x0) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_29 \ + CALC_F2_PRE(0xe4,BX,SI,CX) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_30 \ + CALC_F2_PRE(0xe8,CX,BX,DX) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_31 \ + CALC_F2_PRE(0xec,DX,CX,AX) \ + PRECALC_7(0x30) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_32 \ + CALC_F2_PRE(0x100,AX,DX,DI) \ + PRECALC_16(Y15,Y14,Y12,Y8) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_33 \ + CALC_F2_PRE(0x104,DI,AX,SI) \ + PRECALC_17(Y15,Y13,Y8) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_34 \ + CALC_F2_PRE(0x108,SI,DI,BX) \ + PRECALC_18(Y8) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_35 \ + CALC_F2_PRE(0x10c,BX,SI,CX) \ + PRECALC_19(Y8) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_36 \ + CALC_F2_PRE(0x120,CX,BX,DX) \ + PRECALC_20(Y8) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_37 \ + CALC_F2_PRE(0x124,DX,CX,AX) \ + PRECALC_21(Y8) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_38 \ + CALC_F2_PRE(0x128,AX,DX,DI) \ + CALC_F2_POST(AX,CX,BX,DI) + + +#define CALC_F3_PRE(OFFSET,REG_E) \ + ADDL OFFSET(R15),REG_E + +#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \ + LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round + MOVL REG_B, BP \ + ORL REG_A, BP \ + RORXL $0x1b, REG_A, R12 \ + RORXL $2, REG_A, REG_TB \ + ANDL REG_C, BP \ // Calculate F for the next round + ANDL REG_B, REG_A \ + ORL BP, REG_A \ + ADDL R12, REG_E + +#define CALC_39 \ + CALC_F3_PRE(0x12c,SI) \ + PRECALC_23(Y8,0x0,0x80) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_40 \ + CALC_F3_PRE(0x140,BX) \ + PRECALC_16(Y14,Y13,Y8,Y7) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_41 \ + CALC_F3_PRE(0x144,CX) \ + PRECALC_17(Y14,Y12,Y7) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_42 \ + CALC_F3_PRE(0x148,DX) \ + PRECALC_18(Y7) \ + CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_43 \ + CALC_F3_PRE(0x14c,AX) \ + PRECALC_19(Y7) \ + CALC_F3_POST(DX,BX,SI,AX,CX) + +#define CALC_44 \ + CALC_F3_PRE(0x160,DI) \ + PRECALC_20(Y7) \ + CALC_F3_POST(AX,CX,BX,DI,DX) + +#define CALC_45 \ + CALC_F3_PRE(0x164,SI) \ + PRECALC_21(Y7) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_46 \ + CALC_F3_PRE(0x168,BX) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_47 \ + CALC_F3_PRE(0x16c,CX) \ + VPXOR Y9, Y0, Y7 \ + VPADDD 0x20(R8), Y7, Y0 \ + VMOVDQU Y0, 0xa0(R14) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_48 \ + CALC_F3_PRE(0x180,DX) \ + PRECALC_16(Y13,Y12,Y7,Y5) \ + CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_49 \ + CALC_F3_PRE(0x184,AX) \ + PRECALC_17(Y13,Y8,Y5) \ + CALC_F3_POST(DX,BX,SI,AX,CX) + +#define CALC_50 \ + CALC_F3_PRE(0x188,DI) \ + PRECALC_18(Y5) \ + CALC_F3_POST(AX,CX,BX,DI,DX) + +#define CALC_51 \ + CALC_F3_PRE(0x18c,SI) \ + PRECALC_19(Y5) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_52 \ + CALC_F3_PRE(0x1a0,BX) \ + PRECALC_20(Y5) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_53 \ + CALC_F3_PRE(0x1a4,CX) \ + PRECALC_21(Y5) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_54 \ + CALC_F3_PRE(0x1a8,DX) \ + CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_55 \ + CALC_F3_PRE(0x1ac,AX) \ + PRECALC_23(Y5,0x20,0xc0) \ + CALC_F3_POST(DX,BX,SI,AX,CX) + +#define CALC_56 \ + CALC_F3_PRE(0x1c0,DI) \ + PRECALC_16(Y12,Y8,Y5,Y3) \ + CALC_F3_POST(AX,CX,BX,DI,DX) + +#define CALC_57 \ + CALC_F3_PRE(0x1c4,SI) \ + PRECALC_17(Y12,Y7,Y3) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_58 \ + CALC_F3_PRE(0x1c8,BX) \ + PRECALC_18(Y3) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_59 \ + CALC_F2_PRE(0x1cc,BX,SI,CX) \ + PRECALC_19(Y3) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_60 \ + CALC_F2_PRE(0x1e0,CX,BX,DX) \ + PRECALC_20(Y3) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_61 \ + CALC_F2_PRE(0x1e4,DX,CX,AX) \ + PRECALC_21(Y3) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_62 \ + CALC_F2_PRE(0x1e8,AX,DX,DI) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_63 \ + CALC_F2_PRE(0x1ec,DI,AX,SI) \ + PRECALC_23(Y3,0x20,0xe0) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_64 \ + CALC_F2_PRE(0x200,SI,DI,BX) \ + PRECALC_32(Y5,Y3) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_65 \ + CALC_F2_PRE(0x204,BX,SI,CX) \ + PRECALC_33(Y14,Y15) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_66 \ + CALC_F2_PRE(0x208,CX,BX,DX) \ + PRECALC_34(Y8) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_67 \ + CALC_F2_PRE(0x20c,DX,CX,AX) \ + PRECALC_35(Y15) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_68 \ + CALC_F2_PRE(0x220,AX,DX,DI) \ + PRECALC_36(Y15) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_69 \ + CALC_F2_PRE(0x224,DI,AX,SI) \ + PRECALC_37(Y15) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_70 \ + CALC_F2_PRE(0x228,SI,DI,BX) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_71 \ + CALC_F2_PRE(0x22c,BX,SI,CX) \ + PRECALC_39(Y15,0x20,0x100) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_72 \ + CALC_F2_PRE(0x240,CX,BX,DX) \ + PRECALC_32(Y3,Y15) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_73 \ + CALC_F2_PRE(0x244,DX,CX,AX) \ + PRECALC_33(Y13,Y14) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_74 \ + CALC_F2_PRE(0x248,AX,DX,DI) \ + PRECALC_34(Y7) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_75 \ + CALC_F2_PRE(0x24c,DI,AX,SI) \ + PRECALC_35(Y14) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_76 \ + CALC_F2_PRE(0x260,SI,DI,BX) \ + PRECALC_36(Y14) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_77 \ + CALC_F2_PRE(0x264,BX,SI,CX) \ + PRECALC_37(Y14) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_78 \ + CALC_F2_PRE(0x268,CX,BX,DX) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_79 \ + ADDL 0x26c(R15), AX \ + LEAL (AX)(CX*1), AX \ + RORXL $0x1b, DX, R12 \ + PRECALC_39(Y14,0x20,0x120) \ + ADDL R12, AX + +// Similar to CALC_0 +#define CALC_80 \ + MOVL CX, DX \ + RORXL $2, CX, CX \ + ANDNL SI, DX, BP \ + ANDL BX, DX \ + XORL BP, DX \ + CALC_F1_PRE(0x10,AX,DX,BX,DI) \ + PRECALC_32(Y15,Y14) \ + CALC_F1_POST(AX,CX,DI) + +#define CALC_81 \ + CALC_F1_PRE(0x14,DI,AX,CX,SI) \ + PRECALC_33(Y12,Y13) \ + CALC_F1_POST(DI,DX,SI) + +#define CALC_82 \ + CALC_F1_PRE(0x18,SI,DI,DX,BX) \ + PRECALC_34(Y5) \ + CALC_F1_POST(SI,AX,BX) + +#define CALC_83 \ + CALC_F1_PRE(0x1c,BX,SI,AX,CX) \ + PRECALC_35(Y13) \ + CALC_F1_POST(BX,DI,CX) + +#define CALC_84 \ + CALC_F1_PRE(0x30,CX,BX,DI,DX) \ + PRECALC_36(Y13) \ + CALC_F1_POST(CX,SI,DX) + +#define CALC_85 \ + CALC_F1_PRE(0x34,DX,CX,SI,AX) \ + PRECALC_37(Y13) \ + CALC_F1_POST(DX,BX,AX) + +#define CALC_86 \ + CALC_F1_PRE(0x38,AX,DX,BX,DI) \ + CALC_F1_POST(AX,CX,DI) + +#define CALC_87 \ + CALC_F1_PRE(0x3c,DI,AX,CX,SI) \ + PRECALC_39(Y13,0x40,0x140) \ + CALC_F1_POST(DI,DX,SI) + +#define CALC_88 \ + CALC_F1_PRE(0x50,SI,DI,DX,BX) \ + PRECALC_32(Y14,Y13) \ + CALC_F1_POST(SI,AX,BX) + +#define CALC_89 \ + CALC_F1_PRE(0x54,BX,SI,AX,CX) \ + PRECALC_33(Y8,Y12) \ + CALC_F1_POST(BX,DI,CX) + +#define CALC_90 \ + CALC_F1_PRE(0x58,CX,BX,DI,DX) \ + PRECALC_34(Y3) \ + CALC_F1_POST(CX,SI,DX) + +#define CALC_91 \ + CALC_F1_PRE(0x5c,DX,CX,SI,AX) \ + PRECALC_35(Y12) \ + CALC_F1_POST(DX,BX,AX) + +#define CALC_92 \ + CALC_F1_PRE(0x70,AX,DX,BX,DI) \ + PRECALC_36(Y12) \ + CALC_F1_POST(AX,CX,DI) + +#define CALC_93 \ + CALC_F1_PRE(0x74,DI,AX,CX,SI) \ + PRECALC_37(Y12) \ + CALC_F1_POST(DI,DX,SI) + +#define CALC_94 \ + CALC_F1_PRE(0x78,SI,DI,DX,BX) \ + CALC_F1_POST(SI,AX,BX) + +#define CALC_95 \ + CALC_F1_PRE(0x7c,BX,SI,AX,CX) \ + PRECALC_39(Y12,0x40,0x160) \ + CALC_F1_POST(BX,DI,CX) + +#define CALC_96 \ + CALC_F1_PRE(0x90,CX,BX,DI,DX) \ + PRECALC_32(Y13,Y12) \ + CALC_F1_POST(CX,SI,DX) + +#define CALC_97 \ + CALC_F1_PRE(0x94,DX,CX,SI,AX) \ + PRECALC_33(Y7,Y8) \ + CALC_F1_POST(DX,BX,AX) + +#define CALC_98 \ + CALC_F1_PRE(0x98,AX,DX,BX,DI) \ + PRECALC_34(Y15) \ + CALC_F1_POST(AX,CX,DI) + +#define CALC_99 \ + CALC_F2_PRE(0x9c,DI,AX,SI) \ + PRECALC_35(Y8) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_100 \ + CALC_F2_PRE(0xb0,SI,DI,BX) \ + PRECALC_36(Y8) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_101 \ + CALC_F2_PRE(0xb4,BX,SI,CX) \ + PRECALC_37(Y8) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_102 \ + CALC_F2_PRE(0xb8,CX,BX,DX) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_103 \ + CALC_F2_PRE(0xbc,DX,CX,AX) \ + PRECALC_39(Y8,0x40,0x180) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_104 \ + CALC_F2_PRE(0xd0,AX,DX,DI) \ + PRECALC_32(Y12,Y8) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_105 \ + CALC_F2_PRE(0xd4,DI,AX,SI) \ + PRECALC_33(Y5,Y7) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_106 \ + CALC_F2_PRE(0xd8,SI,DI,BX) \ + PRECALC_34(Y14) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_107 \ + CALC_F2_PRE(0xdc,BX,SI,CX) \ + PRECALC_35(Y7) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_108 \ + CALC_F2_PRE(0xf0,CX,BX,DX) \ + PRECALC_36(Y7) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_109 \ + CALC_F2_PRE(0xf4,DX,CX,AX) \ + PRECALC_37(Y7) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_110 \ + CALC_F2_PRE(0xf8,AX,DX,DI) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_111 \ + CALC_F2_PRE(0xfc,DI,AX,SI) \ + PRECALC_39(Y7,0x40,0x1a0) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_112 \ + CALC_F2_PRE(0x110,SI,DI,BX) \ + PRECALC_32(Y8,Y7) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_113 \ + CALC_F2_PRE(0x114,BX,SI,CX) \ + PRECALC_33(Y3,Y5) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_114 \ + CALC_F2_PRE(0x118,CX,BX,DX) \ + PRECALC_34(Y13) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_115 \ + CALC_F2_PRE(0x11c,DX,CX,AX) \ + PRECALC_35(Y5) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_116 \ + CALC_F2_PRE(0x130,AX,DX,DI) \ + PRECALC_36(Y5) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_117 \ + CALC_F2_PRE(0x134,DI,AX,SI) \ + PRECALC_37(Y5) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_118 \ + CALC_F2_PRE(0x138,SI,DI,BX) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_119 \ + CALC_F3_PRE(0x13c,CX) \ + PRECALC_39(Y5,0x40,0x1c0) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_120 \ + CALC_F3_PRE(0x150,DX) \ + PRECALC_32(Y7,Y5) \ + CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_121 \ + CALC_F3_PRE(0x154,AX) \ + PRECALC_33(Y15,Y3) \ + CALC_F3_POST(DX,BX,SI,AX,CX) + +#define CALC_122 \ + CALC_F3_PRE(0x158,DI) \ + PRECALC_34(Y12) \ + CALC_F3_POST(AX,CX,BX,DI,DX) + +#define CALC_123 \ + CALC_F3_PRE(0x15c,SI) \ + PRECALC_35(Y3) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_124 \ + CALC_F3_PRE(0x170,BX) \ + PRECALC_36(Y3) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_125 \ + CALC_F3_PRE(0x174,CX) \ + PRECALC_37(Y3) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_126 \ + CALC_F3_PRE(0x178,DX) \ + CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_127 \ + CALC_F3_PRE(0x17c,AX) \ + PRECALC_39(Y3,0x60,0x1e0) \ + CALC_F3_POST(DX,BX,SI,AX,CX) + +#define CALC_128 \ + CALC_F3_PRE(0x190,DI) \ + PRECALC_32(Y5,Y3) \ + CALC_F3_POST(AX,CX,BX,DI,DX) + +#define CALC_129 \ + CALC_F3_PRE(0x194,SI) \ + PRECALC_33(Y14,Y15) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_130 \ + CALC_F3_PRE(0x198,BX) \ + PRECALC_34(Y8) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_131 \ + CALC_F3_PRE(0x19c,CX) \ + PRECALC_35(Y15) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_132 \ + CALC_F3_PRE(0x1b0,DX) \ + PRECALC_36(Y15) \ + CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_133 \ + CALC_F3_PRE(0x1b4,AX) \ + PRECALC_37(Y15) \ + CALC_F3_POST(DX,BX,SI,AX,CX) + +#define CALC_134 \ + CALC_F3_PRE(0x1b8,DI) \ + CALC_F3_POST(AX,CX,BX,DI,DX) + +#define CALC_135 \ + CALC_F3_PRE(0x1bc,SI) \ + PRECALC_39(Y15,0x60,0x200) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_136 \ + CALC_F3_PRE(0x1d0,BX) \ + PRECALC_32(Y3,Y15) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_137 \ + CALC_F3_PRE(0x1d4,CX) \ + PRECALC_33(Y13,Y14) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_138 \ + CALC_F3_PRE(0x1d8,DX) \ + PRECALC_34(Y7) \ + CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_139 \ + CALC_F2_PRE(0x1dc,DX,CX,AX) \ + PRECALC_35(Y14) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_140 \ + CALC_F2_PRE(0x1f0,AX,DX,DI) \ + PRECALC_36(Y14) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_141 \ + CALC_F2_PRE(0x1f4,DI,AX,SI) \ + PRECALC_37(Y14) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_142 \ + CALC_F2_PRE(0x1f8,SI,DI,BX) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_143 \ + CALC_F2_PRE(0x1fc,BX,SI,CX) \ + PRECALC_39(Y14,0x60,0x220) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_144 \ + CALC_F2_PRE(0x210,CX,BX,DX) \ + PRECALC_32(Y15,Y14) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_145 \ + CALC_F2_PRE(0x214,DX,CX,AX) \ + PRECALC_33(Y12,Y13) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_146 \ + CALC_F2_PRE(0x218,AX,DX,DI) \ + PRECALC_34(Y5) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_147 \ + CALC_F2_PRE(0x21c,DI,AX,SI) \ + PRECALC_35(Y13) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_148 \ + CALC_F2_PRE(0x230,SI,DI,BX) \ + PRECALC_36(Y13) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_149 \ + CALC_F2_PRE(0x234,BX,SI,CX) \ + PRECALC_37(Y13) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_150 \ + CALC_F2_PRE(0x238,CX,BX,DX) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_151 \ + CALC_F2_PRE(0x23c,DX,CX,AX) \ + PRECALC_39(Y13,0x60,0x240) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_152 \ + CALC_F2_PRE(0x250,AX,DX,DI) \ + PRECALC_32(Y14,Y13) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_153 \ + CALC_F2_PRE(0x254,DI,AX,SI) \ + PRECALC_33(Y8,Y12) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_154 \ + CALC_F2_PRE(0x258,SI,DI,BX) \ + PRECALC_34(Y3) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_155 \ + CALC_F2_PRE(0x25c,BX,SI,CX) \ + PRECALC_35(Y12) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_156 \ + CALC_F2_PRE(0x270,CX,BX,DX) \ + PRECALC_36(Y12) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_157 \ + CALC_F2_PRE(0x274,DX,CX,AX) \ + PRECALC_37(Y12) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_158 \ + CALC_F2_PRE(0x278,AX,DX,DI) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_159 \ + ADDL 0x27c(R15),SI \ + LEAL (SI)(AX*1), SI \ + RORXL $0x1b, DI, R12 \ + PRECALC_39(Y12,0x60,0x260) \ + ADDL R12, SI + + + +#define CALC \ + MOVL (R9), CX \ + MOVL 4(R9), SI \ + MOVL 8(R9), DI \ + MOVL 12(R9), AX \ + MOVL 16(R9), DX \ + MOVQ SP, R14 \ + LEAQ (2*4*80+32)(SP), R15 \ + PRECALC \ // Precalc WK for first 2 blocks + XCHGQ R15, R14 \ +loop: \ // this loops is unrolled + CMPQ R10, R8 \ // we use R8 value (set below) as a signal of a last block + JNE begin \ + VZEROUPPER \ + RET \ +begin: \ + CALC_0 \ + CALC_1 \ + CALC_2 \ + CALC_3 \ + CALC_4 \ + CALC_5 \ + CALC_6 \ + CALC_7 \ + CALC_8 \ + CALC_9 \ + CALC_10 \ + CALC_11 \ + CALC_12 \ + CALC_13 \ + CALC_14 \ + CALC_15 \ + CALC_16 \ + CALC_17 \ + CALC_18 \ + CALC_19 \ + CALC_20 \ + CALC_21 \ + CALC_22 \ + CALC_23 \ + CALC_24 \ + CALC_25 \ + CALC_26 \ + CALC_27 \ + CALC_28 \ + CALC_29 \ + CALC_30 \ + CALC_31 \ + CALC_32 \ + CALC_33 \ + CALC_34 \ + CALC_35 \ + CALC_36 \ + CALC_37 \ + CALC_38 \ + CALC_39 \ + CALC_40 \ + CALC_41 \ + CALC_42 \ + CALC_43 \ + CALC_44 \ + CALC_45 \ + CALC_46 \ + CALC_47 \ + CALC_48 \ + CALC_49 \ + CALC_50 \ + CALC_51 \ + CALC_52 \ + CALC_53 \ + CALC_54 \ + CALC_55 \ + CALC_56 \ + CALC_57 \ + CALC_58 \ + CALC_59 \ + ADDQ $128, R10 \ // move to next even-64-byte block + CMPQ R10, R11 \ // is current block the last one? + CMOVQCC R8, R10 \ // signal the last iteration smartly + CALC_60 \ + CALC_61 \ + CALC_62 \ + CALC_63 \ + CALC_64 \ + CALC_65 \ + CALC_66 \ + CALC_67 \ + CALC_68 \ + CALC_69 \ + CALC_70 \ + CALC_71 \ + CALC_72 \ + CALC_73 \ + CALC_74 \ + CALC_75 \ + CALC_76 \ + CALC_77 \ + CALC_78 \ + CALC_79 \ + UPDATE_HASH(AX,DX,BX,SI,DI) \ + CMPQ R10, R8 \ // is current block the last one? + JE loop\ + MOVL DX, CX \ + CALC_80 \ + CALC_81 \ + CALC_82 \ + CALC_83 \ + CALC_84 \ + CALC_85 \ + CALC_86 \ + CALC_87 \ + CALC_88 \ + CALC_89 \ + CALC_90 \ + CALC_91 \ + CALC_92 \ + CALC_93 \ + CALC_94 \ + CALC_95 \ + CALC_96 \ + CALC_97 \ + CALC_98 \ + CALC_99 \ + CALC_100 \ + CALC_101 \ + CALC_102 \ + CALC_103 \ + CALC_104 \ + CALC_105 \ + CALC_106 \ + CALC_107 \ + CALC_108 \ + CALC_109 \ + CALC_110 \ + CALC_111 \ + CALC_112 \ + CALC_113 \ + CALC_114 \ + CALC_115 \ + CALC_116 \ + CALC_117 \ + CALC_118 \ + CALC_119 \ + CALC_120 \ + CALC_121 \ + CALC_122 \ + CALC_123 \ + CALC_124 \ + CALC_125 \ + CALC_126 \ + CALC_127 \ + CALC_128 \ + CALC_129 \ + CALC_130 \ + CALC_131 \ + CALC_132 \ + CALC_133 \ + CALC_134 \ + CALC_135 \ + CALC_136 \ + CALC_137 \ + CALC_138 \ + CALC_139 \ + ADDQ $128, R13 \ //move to next even-64-byte block + CMPQ R13, R11 \ //is current block the last one? + CMOVQCC R8, R10 \ + CALC_140 \ + CALC_141 \ + CALC_142 \ + CALC_143 \ + CALC_144 \ + CALC_145 \ + CALC_146 \ + CALC_147 \ + CALC_148 \ + CALC_149 \ + CALC_150 \ + CALC_151 \ + CALC_152 \ + CALC_153 \ + CALC_154 \ + CALC_155 \ + CALC_156 \ + CALC_157 \ + CALC_158 \ + CALC_159 \ + UPDATE_HASH(SI,DI,DX,CX,BX) \ + MOVL SI, R12 \ //Reset state for AVX2 reg permutation + MOVL DI, SI \ + MOVL DX, DI \ + MOVL BX, DX \ + MOVL CX, AX \ + MOVL R12, CX \ + XCHGQ R15, R14 \ + JMP loop + + + +TEXT ·blockAVX2(SB),$1408-32 + + MOVQ dig+0(FP), DI + MOVQ p_base+8(FP), SI + MOVQ p_len+16(FP), DX + SHRQ $6, DX + SHLQ $6, DX + + MOVQ $K_XMM_AR<>(SB), R8 + + MOVQ DI, R9 + MOVQ SI, R10 + LEAQ 64(SI), R13 + + ADDQ SI, DX + ADDQ $64, DX + MOVQ DX, R11 + + CMPQ R13, R11 + CMOVQCC R8, R13 + + MOVQ $BSWAP_SHUFB_CTL<>(SB), R8 + VMOVDQU (R8), Y10 + MOVQ $K_XMM_AR<>(SB), R8 //restore R8 + + CALC // RET is inside macros + + +// func checkAVX2() bool +// returns whether AVX2 is supported +TEXT ·checkAVX2(SB),NOSPLIT,$0 + CMPB runtime·support_avx2(SB), $1 + JE has + MOVB $0, ret+0(FP) + RET +has: + MOVB $1, ret+0(FP) + RET + + +DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6 +GLOBL K_XMM_AR<>(SB),RODATA,$128 + +DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203 +DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607 +DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b +DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f +DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203 +DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607 +DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b +DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f +GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32 diff --git a/src/crypto/sha1/sha1block_decl.go b/src/crypto/sha1/sha1block_decl.go index a85b74b878..6d2d073d13 100644 --- a/src/crypto/sha1/sha1block_decl.go +++ b/src/crypto/sha1/sha1block_decl.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build amd64 amd64p32 arm 386 s390x +// +build amd64p32 arm 386 s390x package sha1