--- /dev/null
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ . "github.com/mmcloughlin/avo/build"
+ . "github.com/mmcloughlin/avo/operand"
+ . "github.com/mmcloughlin/avo/reg"
+)
+
+//go:generate go run . -out ../sha1block_amd64.s -pkg sha1
+
+// AVX2 version by Intel, same algorithm as code in Linux kernel:
+// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+// Authors:
+// Ilya Albrekht <ilya.albrekht@intel.com>
+// Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+// Ronen Zohar <ronen.zohar@intel.com>
+// Chandramouli Narayanan <mouli@linux.intel.com>
+
+func main() {
+ Package("crypto/sha1")
+ ConstraintExpr("!purego")
+ blockAMD64()
+ blockAVX2()
+ Generate()
+}
+
+func LOAD(index int) {
+ MOVL(Mem{Base: SI}.Offset(index*4), R10L)
+ BSWAPL(R10L)
+ MOVL(R10L, Mem{Base: SP}.Offset(index*4))
+}
+
+func SHUFFLE(index int) {
+ MOVL(Mem{Base: SP}.Offset(((index)&0xf)*4), R10L)
+ XORL(Mem{Base: SP}.Offset(((index-3)&0xf)*4), R10L)
+ XORL(Mem{Base: SP}.Offset(((index-8)&0xf)*4), R10L)
+ XORL(Mem{Base: SP}.Offset(((index-14)&0xf)*4), R10L)
+ ROLL(Imm(1), R10L)
+ MOVL(R10L, Mem{Base: SP}.Offset(((index)&0xf)*4))
+}
+
+func FUNC1(a, b, c, d, e GPPhysical) {
+ MOVL(d, R9L)
+ XORL(c, R9L)
+ ANDL(b, R9L)
+ XORL(d, R9L)
+}
+
+func FUNC2(a, b, c, d, e GPPhysical) {
+ MOVL(b, R9L)
+ XORL(c, R9L)
+ XORL(d, R9L)
+}
+
+func FUNC3(a, b, c, d, e GPPhysical) {
+ MOVL(b, R8L)
+ ORL(c, R8L)
+ ANDL(d, R8L)
+ MOVL(b, R9L)
+ ANDL(c, R9L)
+ ORL(R8L, R9L)
+}
+
+func FUNC4(a, b, c, d, e GPPhysical) {
+ FUNC2(a, b, c, d, e)
+}
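+
+// As a reference sketch (illustrative only, not used by the generator), the
+// FUNC1-FUNC4 macros compute the standard SHA-1 round functions; FUNC1 builds
+// Ch via the xor form ((c^d)&b)^d, and FUNC3 builds Maj as (b&c)|((b|c)&d).
+func f1Sketch(b, c, d uint32) uint32 { return (b & c) | (^b & d) } // Ch, rounds 0-19
+func f2Sketch(b, c, d uint32) uint32 { return b ^ c ^ d } // Parity, rounds 20-39
+func f3Sketch(b, c, d uint32) uint32 { return (b & c) | ((b | c) & d) } // Maj, rounds 40-59
+func f4Sketch(b, c, d uint32) uint32 { return f2Sketch(b, c, d) } // Parity again, rounds 60-79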
+
+func MIX(a, b, c, d, e GPPhysical, konst int) {
+ ROLL(Imm(30), b)
+ ADDL(R9L, e)
+ MOVL(a, R8L)
+ ROLL(Imm(5), R8L)
+ LEAL(Mem{Base: e, Index: R10L, Scale: 1}.Offset(konst), e)
+ ADDL(R8L, e)
+}
+
+func ROUND1(a, b, c, d, e GPPhysical, index int) {
+ LOAD(index)
+ FUNC1(a, b, c, d, e)
+ MIX(a, b, c, d, e, 0x5A827999)
+}
+
+func ROUND1x(a, b, c, d, e GPPhysical, index int) {
+ SHUFFLE(index)
+ FUNC1(a, b, c, d, e)
+ MIX(a, b, c, d, e, 0x5A827999)
+}
+
+func ROUND2(a, b, c, d, e GPPhysical, index int) {
+ SHUFFLE(index)
+ FUNC2(a, b, c, d, e)
+ MIX(a, b, c, d, e, 0x6ED9EBA1)
+}
+
+func ROUND3(a, b, c, d, e GPPhysical, index int) {
+ SHUFFLE(index)
+ FUNC3(a, b, c, d, e)
+ MIX(a, b, c, d, e, 0x8F1BBCDC)
+}
+
+func ROUND4(a, b, c, d, e GPPhysical, index int) {
+ SHUFFLE(index)
+ FUNC4(a, b, c, d, e)
+ MIX(a, b, c, d, e, 0xCA62C1D6)
+}
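+
+// A minimal sketch (illustrative only) of the scalar round that FUNCx and MIX
+// together implement: e += rol5(a) + f(b, c, d) + w + k and b = rol30(b),
+// with the caller rotating the roles of a..e between rounds instead of moving
+// registers.
+func roundSketch(a, b, c, d, e, w, k uint32, f func(b, c, d uint32) uint32) (uint32, uint32) {
+ e += a<<5 | a>>27 // rol 5
+ e += f(b, c, d) + w + k
+ b = b<<30 | b>>2 // rol 30
+ return e, b
+}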
+
+func blockAMD64() {
+ Implement("blockAMD64")
+ Attributes(NOSPLIT)
+ AllocLocal(64)
+
+ Load(Param("dig"), RBP)
+ Load(Param("p").Base(), RSI)
+ Load(Param("p").Len(), RDX)
+ SHRQ(Imm(6), RDX)
+ SHLQ(Imm(6), RDX)
+
+ LEAQ(Mem{Base: SI, Index: DX, Scale: 1}, RDI)
+ MOVL(Mem{Base: BP}.Offset(0*4), EAX)
+ MOVL(Mem{Base: BP}.Offset(1*4), EBX)
+ MOVL(Mem{Base: BP}.Offset(2*4), ECX)
+ MOVL(Mem{Base: BP}.Offset(3*4), EDX)
+ MOVL(Mem{Base: BP}.Offset(4*4), EBP)
+
+ CMPQ(RSI, RDI)
+ JEQ(LabelRef("end"))
+
+ loop_amd64()
+ end()
+}
+
+func loop_amd64() {
+ Label("loop")
+ MOVL(EAX, R11L)
+ MOVL(EBX, R12L)
+ MOVL(ECX, R13L)
+ MOVL(EDX, R14L)
+ MOVL(EBP, R15L)
+
+ ROUND1(EAX, EBX, ECX, EDX, EBP, 0)
+ ROUND1(EBP, EAX, EBX, ECX, EDX, 1)
+ ROUND1(EDX, EBP, EAX, EBX, ECX, 2)
+ ROUND1(ECX, EDX, EBP, EAX, EBX, 3)
+ ROUND1(EBX, ECX, EDX, EBP, EAX, 4)
+ ROUND1(EAX, EBX, ECX, EDX, EBP, 5)
+ ROUND1(EBP, EAX, EBX, ECX, EDX, 6)
+ ROUND1(EDX, EBP, EAX, EBX, ECX, 7)
+ ROUND1(ECX, EDX, EBP, EAX, EBX, 8)
+ ROUND1(EBX, ECX, EDX, EBP, EAX, 9)
+ ROUND1(EAX, EBX, ECX, EDX, EBP, 10)
+ ROUND1(EBP, EAX, EBX, ECX, EDX, 11)
+ ROUND1(EDX, EBP, EAX, EBX, ECX, 12)
+ ROUND1(ECX, EDX, EBP, EAX, EBX, 13)
+ ROUND1(EBX, ECX, EDX, EBP, EAX, 14)
+ ROUND1(EAX, EBX, ECX, EDX, EBP, 15)
+
+ ROUND1x(EBP, EAX, EBX, ECX, EDX, 16)
+ ROUND1x(EDX, EBP, EAX, EBX, ECX, 17)
+ ROUND1x(ECX, EDX, EBP, EAX, EBX, 18)
+ ROUND1x(EBX, ECX, EDX, EBP, EAX, 19)
+
+ ROUND2(EAX, EBX, ECX, EDX, EBP, 20)
+ ROUND2(EBP, EAX, EBX, ECX, EDX, 21)
+ ROUND2(EDX, EBP, EAX, EBX, ECX, 22)
+ ROUND2(ECX, EDX, EBP, EAX, EBX, 23)
+ ROUND2(EBX, ECX, EDX, EBP, EAX, 24)
+ ROUND2(EAX, EBX, ECX, EDX, EBP, 25)
+ ROUND2(EBP, EAX, EBX, ECX, EDX, 26)
+ ROUND2(EDX, EBP, EAX, EBX, ECX, 27)
+ ROUND2(ECX, EDX, EBP, EAX, EBX, 28)
+ ROUND2(EBX, ECX, EDX, EBP, EAX, 29)
+ ROUND2(EAX, EBX, ECX, EDX, EBP, 30)
+ ROUND2(EBP, EAX, EBX, ECX, EDX, 31)
+ ROUND2(EDX, EBP, EAX, EBX, ECX, 32)
+ ROUND2(ECX, EDX, EBP, EAX, EBX, 33)
+ ROUND2(EBX, ECX, EDX, EBP, EAX, 34)
+ ROUND2(EAX, EBX, ECX, EDX, EBP, 35)
+ ROUND2(EBP, EAX, EBX, ECX, EDX, 36)
+ ROUND2(EDX, EBP, EAX, EBX, ECX, 37)
+ ROUND2(ECX, EDX, EBP, EAX, EBX, 38)
+ ROUND2(EBX, ECX, EDX, EBP, EAX, 39)
+
+ ROUND3(EAX, EBX, ECX, EDX, EBP, 40)
+ ROUND3(EBP, EAX, EBX, ECX, EDX, 41)
+ ROUND3(EDX, EBP, EAX, EBX, ECX, 42)
+ ROUND3(ECX, EDX, EBP, EAX, EBX, 43)
+ ROUND3(EBX, ECX, EDX, EBP, EAX, 44)
+ ROUND3(EAX, EBX, ECX, EDX, EBP, 45)
+ ROUND3(EBP, EAX, EBX, ECX, EDX, 46)
+ ROUND3(EDX, EBP, EAX, EBX, ECX, 47)
+ ROUND3(ECX, EDX, EBP, EAX, EBX, 48)
+ ROUND3(EBX, ECX, EDX, EBP, EAX, 49)
+ ROUND3(EAX, EBX, ECX, EDX, EBP, 50)
+ ROUND3(EBP, EAX, EBX, ECX, EDX, 51)
+ ROUND3(EDX, EBP, EAX, EBX, ECX, 52)
+ ROUND3(ECX, EDX, EBP, EAX, EBX, 53)
+ ROUND3(EBX, ECX, EDX, EBP, EAX, 54)
+ ROUND3(EAX, EBX, ECX, EDX, EBP, 55)
+ ROUND3(EBP, EAX, EBX, ECX, EDX, 56)
+ ROUND3(EDX, EBP, EAX, EBX, ECX, 57)
+ ROUND3(ECX, EDX, EBP, EAX, EBX, 58)
+ ROUND3(EBX, ECX, EDX, EBP, EAX, 59)
+
+ ROUND4(EAX, EBX, ECX, EDX, EBP, 60)
+ ROUND4(EBP, EAX, EBX, ECX, EDX, 61)
+ ROUND4(EDX, EBP, EAX, EBX, ECX, 62)
+ ROUND4(ECX, EDX, EBP, EAX, EBX, 63)
+ ROUND4(EBX, ECX, EDX, EBP, EAX, 64)
+ ROUND4(EAX, EBX, ECX, EDX, EBP, 65)
+ ROUND4(EBP, EAX, EBX, ECX, EDX, 66)
+ ROUND4(EDX, EBP, EAX, EBX, ECX, 67)
+ ROUND4(ECX, EDX, EBP, EAX, EBX, 68)
+ ROUND4(EBX, ECX, EDX, EBP, EAX, 69)
+ ROUND4(EAX, EBX, ECX, EDX, EBP, 70)
+ ROUND4(EBP, EAX, EBX, ECX, EDX, 71)
+ ROUND4(EDX, EBP, EAX, EBX, ECX, 72)
+ ROUND4(ECX, EDX, EBP, EAX, EBX, 73)
+ ROUND4(EBX, ECX, EDX, EBP, EAX, 74)
+ ROUND4(EAX, EBX, ECX, EDX, EBP, 75)
+ ROUND4(EBP, EAX, EBX, ECX, EDX, 76)
+ ROUND4(EDX, EBP, EAX, EBX, ECX, 77)
+ ROUND4(ECX, EDX, EBP, EAX, EBX, 78)
+ ROUND4(EBX, ECX, EDX, EBP, EAX, 79)
+
+ ADDL(R11L, EAX)
+ ADDL(R12L, EBX)
+ ADDL(R13L, ECX)
+ ADDL(R14L, EDX)
+ ADDL(R15L, EBP)
+
+ ADDQ(Imm(64), RSI)
+ CMPQ(RSI, RDI)
+ JB(LabelRef("loop"))
+}
+
+func end() {
+ Label("end")
+ Load(Param("dig"), RDI)
+ MOVL(EAX, Mem{Base: DI}.Offset(0*4))
+ MOVL(EBX, Mem{Base: DI}.Offset(1*4))
+ MOVL(ECX, Mem{Base: DI}.Offset(2*4))
+ MOVL(EDX, Mem{Base: DI}.Offset(3*4))
+ MOVL(EBP, Mem{Base: DI}.Offset(4*4))
+ RET()
+}
+
+// This is the implementation using AVX2, BMI1 and BMI2. It is based on:
+// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
+// From http://software.intel.com/en-us/articles
+// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
+// This implementation is 2x unrolled, and interleaves the vector instructions
+// that precompute W with the scalar computation of the current round,
+// for better instruction scheduling.
+
+// Trivial helper macros.
+
+func UPDATE_HASH(A, TB, C, D, E GPPhysical) {
+ ADDL(Mem{Base: R9}, A)
+ MOVL(A, Mem{Base: R9})
+ ADDL(Mem{Base: R9}.Offset(4), TB)
+ MOVL(TB, Mem{Base: R9}.Offset(4))
+ ADDL(Mem{Base: R9}.Offset(8), C)
+ MOVL(C, Mem{Base: R9}.Offset(8))
+ ADDL(Mem{Base: R9}.Offset(12), D)
+ MOVL(D, Mem{Base: R9}.Offset(12))
+ ADDL(Mem{Base: R9}.Offset(16), E)
+ MOVL(E, Mem{Base: R9}.Offset(16))
+}
+
+// Helper macros for PRECALC, which does precomputations
+
+func PRECALC_0(OFFSET int) {
+ VMOVDQU(Mem{Base: R10}.Offset(OFFSET), X0)
+}
+
+func PRECALC_1(OFFSET int) {
+ VINSERTI128(Imm(1), Mem{Base: R13}.Offset(OFFSET), Y0, Y0)
+}
+
+func PRECALC_2(YREG VecPhysical) {
+ VPSHUFB(Y10, Y0, YREG)
+}
+
+func PRECALC_4(YREG VecPhysical, K_OFFSET int) {
+ VPADDD(Mem{Base: R8}.Offset(K_OFFSET), YREG, Y0)
+}
+
+func PRECALC_7(OFFSET int) {
+ VMOVDQU(Y0, Mem{Base: R14}.Offset(OFFSET*2))
+}
+
+// Message scheduling pre-compute for rounds 0-15
+//
+// - R13 is a pointer to even 64-byte block
+// - R10 is a pointer to odd 64-byte block
+// - R14 is a pointer to temp buffer
+// - X0 is used as temp register
+// - YREG is clobbered as part of computation
+// - OFFSET chooses a 16-byte chunk within a block
+// - R8 is a pointer to the constants block
+// - K_OFFSET chooses the K constants relevant to this round
+// - Y10 holds the byte-swap shuffle mask
+func PRECALC_00_15(OFFSET int, YREG VecPhysical) {
+ PRECALC_0(OFFSET)
+ PRECALC_1(OFFSET)
+ PRECALC_2(YREG)
+ PRECALC_4(YREG, 0x0)
+ PRECALC_7(OFFSET)
+}
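+
+// Scalar model (illustrative only) of what PRECALC_00_15 does per 32-bit word:
+// the big-endian message word, as loaded on a little-endian machine, is
+// byte-swapped and K is added, so the round code can consume W+K with a single
+// ADD from the temp buffer. The vector version handles 4 words of the even
+// block and 4 words of the odd block in one pass.
+func precalc0015Sketch(loadedWord, k uint32) uint32 {
+ w := loadedWord>>24 | loadedWord>>8&0xff00 | loadedWord<<8&0xff0000 | loadedWord<<24
+ return w + k
+}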
+
+// Helper macros for PRECALC_16_31
+
+func PRECALC_16(REG_SUB_16, REG_SUB_12, REG_SUB_4, REG VecPhysical) {
+ VPALIGNR(Imm(8), REG_SUB_16, REG_SUB_12, REG) // w[i-14]
+ VPSRLDQ(Imm(4), REG_SUB_4, Y0) // w[i-3]
+}
+
+func PRECALC_17(REG_SUB_16, REG_SUB_8, REG VecPhysical) {
+ VPXOR(REG_SUB_8, REG, REG)
+ VPXOR(REG_SUB_16, Y0, Y0)
+}
+
+func PRECALC_18(REG VecPhysical) {
+ VPXOR(Y0, REG, REG)
+ VPSLLDQ(Imm(12), REG, Y9)
+}
+
+func PRECALC_19(REG VecPhysical) {
+ VPSLLD(Imm(1), REG, Y0)
+ VPSRLD(Imm(31), REG, REG)
+}
+
+func PRECALC_20(REG VecPhysical) {
+ VPOR(REG, Y0, Y0)
+ VPSLLD(Imm(2), Y9, REG)
+}
+
+func PRECALC_21(REG VecPhysical) {
+ VPSRLD(Imm(30), Y9, Y9)
+ VPXOR(REG, Y0, Y0)
+}
+
+func PRECALC_23(REG VecPhysical, K_OFFSET, OFFSET int) {
+ VPXOR(Y9, Y0, REG)
+ VPADDD(Mem{Base: R8}.Offset(K_OFFSET), REG, Y0)
+ VMOVDQU(Y0, Mem{Base: R14}.Offset(OFFSET))
+}
+
+// Message scheduling pre-compute for rounds 16-31
+// - calculating last 32 w[i] values in 8 XMM registers
+// - pre-calculate K+w[i] values and store to mem
+// - for later load by ALU add instruction.
+// - "brute force" vectorization for rounds 16-31 only
+// - due to w[i]->w[i-3] dependency.
+// - clobbers 5 input ymm registers REG_SUB*
+// - uses Y0 and Y9 as temp registers
+// - As always, R8 is a pointer to constants block
+// - and R14 is a pointer to temp buffer
+func PRECALC_16_31(REG, REG_SUB_4, REG_SUB_8, REG_SUB_12, REG_SUB_16 VecPhysical, K_OFFSET, OFFSET int) {
+ PRECALC_16(REG_SUB_16, REG_SUB_12, REG_SUB_4, REG)
+ PRECALC_17(REG_SUB_16, REG_SUB_8, REG)
+ PRECALC_18(REG)
+ PRECALC_19(REG)
+ PRECALC_20(REG)
+ PRECALC_21(REG)
+ PRECALC_23(REG, K_OFFSET, OFFSET)
+}
+
+// Helper macros for PRECALC_32_79
+
+func PRECALC_32(REG_SUB_8, REG_SUB_4 VecPhysical) {
+ VPALIGNR(Imm(8), REG_SUB_8, REG_SUB_4, Y0)
+}
+
+func PRECALC_33(REG_SUB_28, REG VecPhysical) {
+ VPXOR(REG_SUB_28, REG, REG)
+}
+
+func PRECALC_34(REG_SUB_16 VecPhysical) {
+ VPXOR(REG_SUB_16, Y0, Y0)
+}
+
+func PRECALC_35(REG VecPhysical) {
+ VPXOR(Y0, REG, REG)
+}
+
+func PRECALC_36(REG VecPhysical) {
+ VPSLLD(Imm(2), REG, Y0)
+}
+
+func PRECALC_37(REG VecPhysical) {
+ VPSRLD(Imm(30), REG, REG)
+ VPOR(REG, Y0, REG)
+}
+
+func PRECALC_39(REG VecPhysical, K_OFFSET, OFFSET int) {
+ VPADDD(Mem{Base: R8}.Offset(K_OFFSET), REG, Y0)
+ VMOVDQU(Y0, Mem{Base: R14}.Offset(OFFSET))
+}
+
+// Message scheduling pre-compute for rounds 32-79
+// In SHA-1 specification we have:
+// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+// Which is the same as:
+// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+// This allows for more efficient vectorization,
+// since w[i]->w[i-3] dependency is broken
+
+func PRECALC_32_79(REG, REG_SUB_4, REG_SUB_8, REG_SUB_16, REG_SUB_28 VecPhysical, K_OFFSET, OFFSET int) {
+ PRECALC_32(REG_SUB_8, REG_SUB_4)
+ PRECALC_33(REG_SUB_28, REG)
+ PRECALC_34(REG_SUB_16)
+ PRECALC_35(REG)
+ PRECALC_36(REG)
+ PRECALC_37(REG)
+ PRECALC_39(REG, K_OFFSET, OFFSET)
+}
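+
+// A brute-force check (illustrative only) of the identity quoted above: for
+// i >= 32 the rol-2 form of the recurrence yields the same schedule as the
+// rol-1 form, which is what removes the w[i]->w[i-3] dependency inside a
+// vector. The seed values below are arbitrary.
+func checkRol2IdentitySketch() bool {
+ var w [160]uint32
+ for i := 0; i < 16; i++ {
+ w[i] = uint32(i)*0x9e3779b9 + 1
+ }
+ for i := 16; i < len(w); i++ {
+ x := w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]
+ w[i] = x<<1 | x>>31
+ }
+ for i := 32; i < len(w); i++ {
+ y := w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]
+ if w[i] != y<<2|y>>30 {
+ return false
+ }
+ }
+ return true
+}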
+
+func PRECALC() {
+ PRECALC_00_15(0, Y15)
+ PRECALC_00_15(0x10, Y14)
+ PRECALC_00_15(0x20, Y13)
+ PRECALC_00_15(0x30, Y12)
+ PRECALC_16_31(Y8, Y12, Y13, Y14, Y15, 0, 0x80)
+ PRECALC_16_31(Y7, Y8, Y12, Y13, Y14, 0x20, 0xa0)
+ PRECALC_16_31(Y5, Y7, Y8, Y12, Y13, 0x20, 0xc0)
+ PRECALC_16_31(Y3, Y5, Y7, Y8, Y12, 0x20, 0xe0)
+ PRECALC_32_79(Y15, Y3, Y5, Y8, Y14, 0x20, 0x100)
+ PRECALC_32_79(Y14, Y15, Y3, Y7, Y13, 0x20, 0x120)
+ PRECALC_32_79(Y13, Y14, Y15, Y5, Y12, 0x40, 0x140)
+ PRECALC_32_79(Y12, Y13, Y14, Y3, Y8, 0x40, 0x160)
+ PRECALC_32_79(Y8, Y12, Y13, Y15, Y7, 0x40, 0x180)
+ PRECALC_32_79(Y7, Y8, Y12, Y14, Y5, 0x40, 0x1a0)
+ PRECALC_32_79(Y5, Y7, Y8, Y13, Y3, 0x40, 0x1c0)
+ PRECALC_32_79(Y3, Y5, Y7, Y12, Y15, 0x60, 0x1e0)
+ PRECALC_32_79(Y15, Y3, Y5, Y8, Y14, 0x60, 0x200)
+ PRECALC_32_79(Y14, Y15, Y3, Y7, Y13, 0x60, 0x220)
+ PRECALC_32_79(Y13, Y14, Y15, Y5, Y12, 0x60, 0x240)
+ PRECALC_32_79(Y12, Y13, Y14, Y3, Y8, 0x60, 0x260)
+}
+
+// Macros calculating individual rounds have general form
+// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
+// CALC_ROUND_{PRE,POST} macros follow
+
+func CALC_F1_PRE(OFFSET int, REG_A, REG_B, REG_C, REG_E GPPhysical) {
+ ADDL(Mem{Base: R15}.Offset(OFFSET), REG_E)
+ ANDNL(REG_C, REG_A, EBP)
+ LEAL(Mem{Base: REG_E, Index: REG_B, Scale: 1}, REG_E) // Add F from the previous round
+ RORXL(Imm(0x1b), REG_A, R12L)
+ RORXL(Imm(2), REG_A, REG_B) // for next round
+}
+
+func CALC_F1_POST(REG_A, REG_B, REG_E GPPhysical) {
+ ANDL(REG_B, REG_A) // b&c
+ XORL(EBP, REG_A) // F1 = (b&c) ^ (~b&d)
+ LEAL(Mem{Base: REG_E, Index: R12, Scale: 1}, REG_E) // E += A >>> 5
+}
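+
+// Reference sketch (illustrative only) of the F1 value built above: ANDNL
+// produces the ~b&d term in a single BMI1 instruction, and the two RORXL
+// results are the rol5 value added in CALC_F1_POST and the rol30 value kept
+// for the next round.
+func chViaAndnSketch(b, c, d uint32) uint32 {
+ t := ^b & d // the ANDNL result kept in BP
+ return (b & c) ^ t // F1 = (b&c) ^ (~b&d), as noted in CALC_F1_POST
+}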
+
+// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
+
+func CALC_0() {
+ MOVL(ESI, EBX) // Precalculating first round
+ RORXL(Imm(2), ESI, ESI)
+ ANDNL(EAX, EBX, EBP)
+ ANDL(EDI, EBX)
+ XORL(EBP, EBX)
+ CALC_F1_PRE(0x0, ECX, EBX, EDI, EDX)
+ PRECALC_0(0x80)
+ CALC_F1_POST(ECX, ESI, EDX)
+}
+
+func CALC_1() {
+ CALC_F1_PRE(0x4, EDX, ECX, ESI, EAX)
+ PRECALC_1(0x80)
+ CALC_F1_POST(EDX, EBX, EAX)
+}
+
+func CALC_2() {
+ CALC_F1_PRE(0x8, EAX, EDX, EBX, EDI)
+ PRECALC_2(Y15)
+ CALC_F1_POST(EAX, ECX, EDI)
+}
+
+func CALC_3() {
+ CALC_F1_PRE(0xc, EDI, EAX, ECX, ESI)
+ CALC_F1_POST(EDI, EDX, ESI)
+}
+
+func CALC_4() {
+ CALC_F1_PRE(0x20, ESI, EDI, EDX, EBX)
+ PRECALC_4(Y15, 0x0)
+ CALC_F1_POST(ESI, EAX, EBX)
+}
+
+func CALC_5() {
+ CALC_F1_PRE(0x24, EBX, ESI, EAX, ECX)
+ CALC_F1_POST(EBX, EDI, ECX)
+}
+
+func CALC_6() {
+ CALC_F1_PRE(0x28, ECX, EBX, EDI, EDX)
+ CALC_F1_POST(ECX, ESI, EDX)
+}
+
+func CALC_7() {
+ CALC_F1_PRE(0x2c, EDX, ECX, ESI, EAX)
+ PRECALC_7(0x0)
+ CALC_F1_POST(EDX, EBX, EAX)
+}
+
+func CALC_8() {
+ CALC_F1_PRE(0x40, EAX, EDX, EBX, EDI)
+ PRECALC_0(0x90)
+ CALC_F1_POST(EAX, ECX, EDI)
+}
+
+func CALC_9() {
+ CALC_F1_PRE(0x44, EDI, EAX, ECX, ESI)
+ PRECALC_1(0x90)
+ CALC_F1_POST(EDI, EDX, ESI)
+}
+
+func CALC_10() {
+ CALC_F1_PRE(0x48, ESI, EDI, EDX, EBX)
+ PRECALC_2(Y14)
+ CALC_F1_POST(ESI, EAX, EBX)
+}
+
+func CALC_11() {
+ CALC_F1_PRE(0x4c, EBX, ESI, EAX, ECX)
+ CALC_F1_POST(EBX, EDI, ECX)
+}
+
+func CALC_12() {
+ CALC_F1_PRE(0x60, ECX, EBX, EDI, EDX)
+ PRECALC_4(Y14, 0x0)
+ CALC_F1_POST(ECX, ESI, EDX)
+}
+
+func CALC_13() {
+ CALC_F1_PRE(0x64, EDX, ECX, ESI, EAX)
+ CALC_F1_POST(EDX, EBX, EAX)
+}
+
+func CALC_14() {
+ CALC_F1_PRE(0x68, EAX, EDX, EBX, EDI)
+ CALC_F1_POST(EAX, ECX, EDI)
+}
+
+func CALC_15() {
+ CALC_F1_PRE(0x6c, EDI, EAX, ECX, ESI)
+ PRECALC_7(0x10)
+ CALC_F1_POST(EDI, EDX, ESI)
+}
+
+func CALC_16() {
+ CALC_F1_PRE(0x80, ESI, EDI, EDX, EBX)
+ PRECALC_0(0xa0)
+ CALC_F1_POST(ESI, EAX, EBX)
+}
+
+func CALC_17() {
+ CALC_F1_PRE(0x84, EBX, ESI, EAX, ECX)
+ PRECALC_1(0xa0)
+ CALC_F1_POST(EBX, EDI, ECX)
+}
+
+func CALC_18() {
+ CALC_F1_PRE(0x88, ECX, EBX, EDI, EDX)
+ PRECALC_2(Y13)
+ CALC_F1_POST(ECX, ESI, EDX)
+}
+
+func CALC_F2_PRE(OFFSET int, REG_A, REG_B, REG_E GPPhysical) {
+ ADDL(Mem{Base: R15}.Offset(OFFSET), REG_E)
+ LEAL(Mem{Base: REG_E, Index: REG_B, Scale: 1}, REG_E) // Add F from the previous round
+ RORXL(Imm(0x1b), REG_A, R12L)
+ RORXL(Imm(2), REG_A, REG_B) // for next round
+}
+
+func CALC_F2_POST(REG_A, REG_B, REG_C, REG_E GPPhysical) {
+ XORL(REG_B, REG_A)
+ ADDL(R12L, REG_E)
+ XORL(REG_C, REG_A)
+}
+
+func CALC_19() {
+ CALC_F2_PRE(0x8c, EDX, ECX, EAX)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_20() {
+ CALC_F2_PRE(0xa0, EAX, EDX, EDI)
+ PRECALC_4(Y13, 0x0)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_21() {
+ CALC_F2_PRE(0xa4, EDI, EAX, ESI)
+ CALC_F2_POST(EDI, EDX, ECX, ESI)
+}
+
+func CALC_22() {
+ CALC_F2_PRE(0xa8, ESI, EDI, EBX)
+ CALC_F2_POST(ESI, EAX, EDX, EBX)
+}
+
+func CALC_23() {
+ CALC_F2_PRE(0xac, EBX, ESI, ECX)
+ PRECALC_7(0x20)
+ CALC_F2_POST(EBX, EDI, EAX, ECX)
+}
+
+func CALC_24() {
+ CALC_F2_PRE(0xc0, ECX, EBX, EDX)
+ PRECALC_0(0xb0)
+ CALC_F2_POST(ECX, ESI, EDI, EDX)
+}
+
+func CALC_25() {
+ CALC_F2_PRE(0xc4, EDX, ECX, EAX)
+ PRECALC_1(0xb0)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_26() {
+ CALC_F2_PRE(0xc8, EAX, EDX, EDI)
+ PRECALC_2(Y12)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_27() {
+ CALC_F2_PRE(0xcc, EDI, EAX, ESI)
+ CALC_F2_POST(EDI, EDX, ECX, ESI)
+}
+
+func CALC_28() {
+ CALC_F2_PRE(0xe0, ESI, EDI, EBX)
+ PRECALC_4(Y12, 0x0)
+ CALC_F2_POST(ESI, EAX, EDX, EBX)
+}
+
+func CALC_29() {
+ CALC_F2_PRE(0xe4, EBX, ESI, ECX)
+ CALC_F2_POST(EBX, EDI, EAX, ECX)
+}
+
+func CALC_30() {
+ CALC_F2_PRE(0xe8, ECX, EBX, EDX)
+ CALC_F2_POST(ECX, ESI, EDI, EDX)
+}
+
+func CALC_31() {
+ CALC_F2_PRE(0xec, EDX, ECX, EAX)
+ PRECALC_7(0x30)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_32() {
+ CALC_F2_PRE(0x100, EAX, EDX, EDI)
+ PRECALC_16(Y15, Y14, Y12, Y8)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_33() {
+ CALC_F2_PRE(0x104, EDI, EAX, ESI)
+ PRECALC_17(Y15, Y13, Y8)
+ CALC_F2_POST(EDI, EDX, ECX, ESI)
+}
+
+func CALC_34() {
+ CALC_F2_PRE(0x108, ESI, EDI, EBX)
+ PRECALC_18(Y8)
+ CALC_F2_POST(ESI, EAX, EDX, EBX)
+}
+
+func CALC_35() {
+ CALC_F2_PRE(0x10c, EBX, ESI, ECX)
+ PRECALC_19(Y8)
+ CALC_F2_POST(EBX, EDI, EAX, ECX)
+}
+
+func CALC_36() {
+ CALC_F2_PRE(0x120, ECX, EBX, EDX)
+ PRECALC_20(Y8)
+ CALC_F2_POST(ECX, ESI, EDI, EDX)
+}
+
+func CALC_37() {
+ CALC_F2_PRE(0x124, EDX, ECX, EAX)
+ PRECALC_21(Y8)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_38() {
+ CALC_F2_PRE(0x128, EAX, EDX, EDI)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_F3_PRE(OFFSET int, REG_E GPPhysical) {
+ ADDL(Mem{Base: R15}.Offset(OFFSET), REG_E)
+}
+
+func CALC_F3_POST(REG_A, REG_B, REG_C, REG_E, REG_TB GPPhysical) {
+ LEAL(Mem{Base: REG_E, Index: REG_TB, Scale: 1}, REG_E) // Add F from the previous round
+ MOVL(REG_B, EBP)
+ ORL(REG_A, EBP)
+ RORXL(Imm(0x1b), REG_A, R12L)
+ RORXL(Imm(2), REG_A, REG_TB)
+ ANDL(REG_C, EBP)
+ ANDL(REG_B, REG_A)
+ ORL(EBP, REG_A)
+ ADDL(R12L, REG_E)
+}
+
+func CALC_39() {
+ CALC_F3_PRE(0x12c, ESI)
+ PRECALC_23(Y8, 0x0, 0x80)
+ CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
+}
+
+func CALC_40() {
+ CALC_F3_PRE(0x140, EBX)
+ PRECALC_16(Y14, Y13, Y8, Y7)
+ CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
+}
+
+func CALC_41() {
+ CALC_F3_PRE(0x144, ECX)
+ PRECALC_17(Y14, Y12, Y7)
+ CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
+}
+
+func CALC_42() {
+ CALC_F3_PRE(0x148, EDX)
+ PRECALC_18(Y7)
+ CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
+}
+
+func CALC_43() {
+ CALC_F3_PRE(0x14c, EAX)
+ PRECALC_19(Y7)
+ CALC_F3_POST(EDX, EBX, ESI, EAX, ECX)
+}
+
+func CALC_44() {
+ CALC_F3_PRE(0x160, EDI)
+ PRECALC_20(Y7)
+ CALC_F3_POST(EAX, ECX, EBX, EDI, EDX)
+}
+
+func CALC_45() {
+ CALC_F3_PRE(0x164, ESI)
+ PRECALC_21(Y7)
+ CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
+}
+
+func CALC_46() {
+ CALC_F3_PRE(0x168, EBX)
+ CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
+}
+
+func CALC_47() {
+ CALC_F3_PRE(0x16c, ECX)
+ VPXOR(Y9, Y0, Y7)
+ VPADDD(Mem{Base: R8}.Offset(0x20), Y7, Y0)
+ VMOVDQU(Y0, Mem{Base: R14}.Offset(0xa0))
+ CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
+}
+
+func CALC_48() {
+ CALC_F3_PRE(0x180, EDX)
+ PRECALC_16(Y13, Y12, Y7, Y5)
+ CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
+}
+
+func CALC_49() {
+ CALC_F3_PRE(0x184, EAX)
+ PRECALC_17(Y13, Y8, Y5)
+ CALC_F3_POST(EDX, EBX, ESI, EAX, ECX)
+}
+
+func CALC_50() {
+ CALC_F3_PRE(0x188, EDI)
+ PRECALC_18(Y5)
+ CALC_F3_POST(EAX, ECX, EBX, EDI, EDX)
+}
+
+func CALC_51() {
+ CALC_F3_PRE(0x18c, ESI)
+ PRECALC_19(Y5)
+ CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
+}
+
+func CALC_52() {
+ CALC_F3_PRE(0x1a0, EBX)
+ PRECALC_20(Y5)
+ CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
+}
+
+func CALC_53() {
+ CALC_F3_PRE(0x1a4, ECX)
+ PRECALC_21(Y5)
+ CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
+}
+
+func CALC_54() {
+ CALC_F3_PRE(0x1a8, EDX)
+ CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
+}
+
+func CALC_55() {
+ CALC_F3_PRE(0x1ac, EAX)
+ PRECALC_23(Y5, 0x20, 0xc0)
+ CALC_F3_POST(EDX, EBX, ESI, EAX, ECX)
+}
+
+func CALC_56() {
+ CALC_F3_PRE(0x1c0, EDI)
+ PRECALC_16(Y12, Y8, Y5, Y3)
+ CALC_F3_POST(EAX, ECX, EBX, EDI, EDX)
+}
+
+func CALC_57() {
+ CALC_F3_PRE(0x1c4, ESI)
+ PRECALC_17(Y12, Y7, Y3)
+ CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
+}
+
+func CALC_58() {
+ CALC_F3_PRE(0x1c8, EBX)
+ PRECALC_18(Y3)
+ CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
+}
+
+func CALC_59() {
+ CALC_F2_PRE(0x1cc, EBX, ESI, ECX)
+ PRECALC_19(Y3)
+ CALC_F2_POST(EBX, EDI, EAX, ECX)
+}
+
+func CALC_60() {
+ CALC_F2_PRE(0x1e0, ECX, EBX, EDX)
+ PRECALC_20(Y3)
+ CALC_F2_POST(ECX, ESI, EDI, EDX)
+}
+
+func CALC_61() {
+ CALC_F2_PRE(0x1e4, EDX, ECX, EAX)
+ PRECALC_21(Y3)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_62() {
+ CALC_F2_PRE(0x1e8, EAX, EDX, EDI)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_63() {
+ CALC_F2_PRE(0x1ec, EDI, EAX, ESI)
+ PRECALC_23(Y3, 0x20, 0xe0)
+ CALC_F2_POST(EDI, EDX, ECX, ESI)
+}
+
+func CALC_64() {
+ CALC_F2_PRE(0x200, ESI, EDI, EBX)
+ PRECALC_32(Y5, Y3)
+ CALC_F2_POST(ESI, EAX, EDX, EBX)
+}
+
+func CALC_65() {
+ CALC_F2_PRE(0x204, EBX, ESI, ECX)
+ PRECALC_33(Y14, Y15)
+ CALC_F2_POST(EBX, EDI, EAX, ECX)
+}
+
+func CALC_66() {
+ CALC_F2_PRE(0x208, ECX, EBX, EDX)
+ PRECALC_34(Y8)
+ CALC_F2_POST(ECX, ESI, EDI, EDX)
+}
+
+func CALC_67() {
+ CALC_F2_PRE(0x20c, EDX, ECX, EAX)
+ PRECALC_35(Y15)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_68() {
+ CALC_F2_PRE(0x220, EAX, EDX, EDI)
+ PRECALC_36(Y15)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_69() {
+ CALC_F2_PRE(0x224, EDI, EAX, ESI)
+ PRECALC_37(Y15)
+ CALC_F2_POST(EDI, EDX, ECX, ESI)
+}
+
+func CALC_70() {
+ CALC_F2_PRE(0x228, ESI, EDI, EBX)
+ CALC_F2_POST(ESI, EAX, EDX, EBX)
+}
+
+func CALC_71() {
+ CALC_F2_PRE(0x22c, EBX, ESI, ECX)
+ PRECALC_39(Y15, 0x20, 0x100)
+ CALC_F2_POST(EBX, EDI, EAX, ECX)
+}
+
+func CALC_72() {
+ CALC_F2_PRE(0x240, ECX, EBX, EDX)
+ PRECALC_32(Y3, Y15)
+ CALC_F2_POST(ECX, ESI, EDI, EDX)
+}
+
+func CALC_73() {
+ CALC_F2_PRE(0x244, EDX, ECX, EAX)
+ PRECALC_33(Y13, Y14)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_74() {
+ CALC_F2_PRE(0x248, EAX, EDX, EDI)
+ PRECALC_34(Y7)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_75() {
+ CALC_F2_PRE(0x24c, EDI, EAX, ESI)
+ PRECALC_35(Y14)
+ CALC_F2_POST(EDI, EDX, ECX, ESI)
+}
+
+func CALC_76() {
+ CALC_F2_PRE(0x260, ESI, EDI, EBX)
+ PRECALC_36(Y14)
+ CALC_F2_POST(ESI, EAX, EDX, EBX)
+}
+
+func CALC_77() {
+ CALC_F2_PRE(0x264, EBX, ESI, ECX)
+ PRECALC_37(Y14)
+ CALC_F2_POST(EBX, EDI, EAX, ECX)
+}
+
+func CALC_78() {
+ CALC_F2_PRE(0x268, ECX, EBX, EDX)
+ CALC_F2_POST(ECX, ESI, EDI, EDX)
+}
+
+func CALC_79() {
+ ADDL(Mem{Base: R15}.Offset(0x26c), EAX)
+ LEAL(Mem{Base: AX, Index: CX, Scale: 1}, EAX)
+ RORXL(Imm(0x1b), EDX, R12L)
+ PRECALC_39(Y14, 0x20, 0x120)
+ ADDL(R12L, EAX)
+}
+
+// Similar to CALC_0
+func CALC_80() {
+ MOVL(ECX, EDX)
+ RORXL(Imm(2), ECX, ECX)
+ ANDNL(ESI, EDX, EBP)
+ ANDL(EBX, EDX)
+ XORL(EBP, EDX)
+ CALC_F1_PRE(0x10, EAX, EDX, EBX, EDI)
+ PRECALC_32(Y15, Y14)
+ CALC_F1_POST(EAX, ECX, EDI)
+}
+
+func CALC_81() {
+ CALC_F1_PRE(0x14, EDI, EAX, ECX, ESI)
+ PRECALC_33(Y12, Y13)
+ CALC_F1_POST(EDI, EDX, ESI)
+}
+
+func CALC_82() {
+ CALC_F1_PRE(0x18, ESI, EDI, EDX, EBX)
+ PRECALC_34(Y5)
+ CALC_F1_POST(ESI, EAX, EBX)
+}
+
+func CALC_83() {
+ CALC_F1_PRE(0x1c, EBX, ESI, EAX, ECX)
+ PRECALC_35(Y13)
+ CALC_F1_POST(EBX, EDI, ECX)
+}
+
+func CALC_84() {
+ CALC_F1_PRE(0x30, ECX, EBX, EDI, EDX)
+ PRECALC_36(Y13)
+ CALC_F1_POST(ECX, ESI, EDX)
+}
+
+func CALC_85() {
+ CALC_F1_PRE(0x34, EDX, ECX, ESI, EAX)
+ PRECALC_37(Y13)
+ CALC_F1_POST(EDX, EBX, EAX)
+}
+
+func CALC_86() {
+ CALC_F1_PRE(0x38, EAX, EDX, EBX, EDI)
+ CALC_F1_POST(EAX, ECX, EDI)
+}
+
+func CALC_87() {
+ CALC_F1_PRE(0x3c, EDI, EAX, ECX, ESI)
+ PRECALC_39(Y13, 0x40, 0x140)
+ CALC_F1_POST(EDI, EDX, ESI)
+}
+
+func CALC_88() {
+ CALC_F1_PRE(0x50, ESI, EDI, EDX, EBX)
+ PRECALC_32(Y14, Y13)
+ CALC_F1_POST(ESI, EAX, EBX)
+}
+
+func CALC_89() {
+ CALC_F1_PRE(0x54, EBX, ESI, EAX, ECX)
+ PRECALC_33(Y8, Y12)
+ CALC_F1_POST(EBX, EDI, ECX)
+}
+
+func CALC_90() {
+ CALC_F1_PRE(0x58, ECX, EBX, EDI, EDX)
+ PRECALC_34(Y3)
+ CALC_F1_POST(ECX, ESI, EDX)
+}
+
+func CALC_91() {
+ CALC_F1_PRE(0x5c, EDX, ECX, ESI, EAX)
+ PRECALC_35(Y12)
+ CALC_F1_POST(EDX, EBX, EAX)
+}
+
+func CALC_92() {
+ CALC_F1_PRE(0x70, EAX, EDX, EBX, EDI)
+ PRECALC_36(Y12)
+ CALC_F1_POST(EAX, ECX, EDI)
+}
+
+func CALC_93() {
+ CALC_F1_PRE(0x74, EDI, EAX, ECX, ESI)
+ PRECALC_37(Y12)
+ CALC_F1_POST(EDI, EDX, ESI)
+}
+
+func CALC_94() {
+ CALC_F1_PRE(0x78, ESI, EDI, EDX, EBX)
+ CALC_F1_POST(ESI, EAX, EBX)
+}
+
+func CALC_95() {
+ CALC_F1_PRE(0x7c, EBX, ESI, EAX, ECX)
+ PRECALC_39(Y12, 0x40, 0x160)
+ CALC_F1_POST(EBX, EDI, ECX)
+}
+
+func CALC_96() {
+ CALC_F1_PRE(0x90, ECX, EBX, EDI, EDX)
+ PRECALC_32(Y13, Y12)
+ CALC_F1_POST(ECX, ESI, EDX)
+}
+
+func CALC_97() {
+ CALC_F1_PRE(0x94, EDX, ECX, ESI, EAX)
+ PRECALC_33(Y7, Y8)
+ CALC_F1_POST(EDX, EBX, EAX)
+}
+
+func CALC_98() {
+ CALC_F1_PRE(0x98, EAX, EDX, EBX, EDI)
+ PRECALC_34(Y15)
+ CALC_F1_POST(EAX, ECX, EDI)
+}
+
+func CALC_99() {
+ CALC_F2_PRE(0x9c, EDI, EAX, ESI)
+ PRECALC_35(Y8)
+ CALC_F2_POST(EDI, EDX, ECX, ESI)
+}
+
+func CALC_100() {
+ CALC_F2_PRE(0xb0, ESI, EDI, EBX)
+ PRECALC_36(Y8)
+ CALC_F2_POST(ESI, EAX, EDX, EBX)
+}
+
+func CALC_101() {
+ CALC_F2_PRE(0xb4, EBX, ESI, ECX)
+ PRECALC_37(Y8)
+ CALC_F2_POST(EBX, EDI, EAX, ECX)
+}
+
+func CALC_102() {
+ CALC_F2_PRE(0xb8, ECX, EBX, EDX)
+ CALC_F2_POST(ECX, ESI, EDI, EDX)
+}
+
+func CALC_103() {
+ CALC_F2_PRE(0xbc, EDX, ECX, EAX)
+ PRECALC_39(Y8, 0x40, 0x180)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_104() {
+ CALC_F2_PRE(0xd0, EAX, EDX, EDI)
+ PRECALC_32(Y12, Y8)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_105() {
+ CALC_F2_PRE(0xd4, EDI, EAX, ESI)
+ PRECALC_33(Y5, Y7)
+ CALC_F2_POST(EDI, EDX, ECX, ESI)
+}
+
+func CALC_106() {
+ CALC_F2_PRE(0xd8, ESI, EDI, EBX)
+ PRECALC_34(Y14)
+ CALC_F2_POST(ESI, EAX, EDX, EBX)
+}
+
+func CALC_107() {
+ CALC_F2_PRE(0xdc, EBX, ESI, ECX)
+ PRECALC_35(Y7)
+ CALC_F2_POST(EBX, EDI, EAX, ECX)
+}
+
+func CALC_108() {
+ CALC_F2_PRE(0xf0, ECX, EBX, EDX)
+ PRECALC_36(Y7)
+ CALC_F2_POST(ECX, ESI, EDI, EDX)
+}
+
+func CALC_109() {
+ CALC_F2_PRE(0xf4, EDX, ECX, EAX)
+ PRECALC_37(Y7)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_110() {
+ CALC_F2_PRE(0xf8, EAX, EDX, EDI)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_111() {
+ CALC_F2_PRE(0xfc, EDI, EAX, ESI)
+ PRECALC_39(Y7, 0x40, 0x1a0)
+ CALC_F2_POST(EDI, EDX, ECX, ESI)
+}
+
+func CALC_112() {
+ CALC_F2_PRE(0x110, ESI, EDI, EBX)
+ PRECALC_32(Y8, Y7)
+ CALC_F2_POST(ESI, EAX, EDX, EBX)
+}
+
+func CALC_113() {
+ CALC_F2_PRE(0x114, EBX, ESI, ECX)
+ PRECALC_33(Y3, Y5)
+ CALC_F2_POST(EBX, EDI, EAX, ECX)
+}
+
+func CALC_114() {
+ CALC_F2_PRE(0x118, ECX, EBX, EDX)
+ PRECALC_34(Y13)
+ CALC_F2_POST(ECX, ESI, EDI, EDX)
+}
+
+func CALC_115() {
+ CALC_F2_PRE(0x11c, EDX, ECX, EAX)
+ PRECALC_35(Y5)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_116() {
+ CALC_F2_PRE(0x130, EAX, EDX, EDI)
+ PRECALC_36(Y5)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_117() {
+ CALC_F2_PRE(0x134, EDI, EAX, ESI)
+ PRECALC_37(Y5)
+ CALC_F2_POST(EDI, EDX, ECX, ESI)
+}
+
+func CALC_118() {
+ CALC_F2_PRE(0x138, ESI, EDI, EBX)
+ CALC_F2_POST(ESI, EAX, EDX, EBX)
+}
+
+func CALC_119() {
+ CALC_F3_PRE(0x13c, ECX)
+ PRECALC_39(Y5, 0x40, 0x1c0)
+ CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
+}
+
+func CALC_120() {
+ CALC_F3_PRE(0x150, EDX)
+ PRECALC_32(Y7, Y5)
+ CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
+}
+
+func CALC_121() {
+ CALC_F3_PRE(0x154, EAX)
+ PRECALC_33(Y15, Y3)
+ CALC_F3_POST(EDX, EBX, ESI, EAX, ECX)
+}
+
+func CALC_122() {
+ CALC_F3_PRE(0x158, EDI)
+ PRECALC_34(Y12)
+ CALC_F3_POST(EAX, ECX, EBX, EDI, EDX)
+}
+
+func CALC_123() {
+ CALC_F3_PRE(0x15c, ESI)
+ PRECALC_35(Y3)
+ CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
+}
+
+func CALC_124() {
+ CALC_F3_PRE(0x170, EBX)
+ PRECALC_36(Y3)
+ CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
+}
+
+func CALC_125() {
+ CALC_F3_PRE(0x174, ECX)
+ PRECALC_37(Y3)
+ CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
+}
+
+func CALC_126() {
+ CALC_F3_PRE(0x178, EDX)
+ CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
+}
+
+func CALC_127() {
+ CALC_F3_PRE(0x17c, EAX)
+ PRECALC_39(Y3, 0x60, 0x1e0)
+ CALC_F3_POST(EDX, EBX, ESI, EAX, ECX)
+}
+
+func CALC_128() {
+ CALC_F3_PRE(0x190, EDI)
+ PRECALC_32(Y5, Y3)
+ CALC_F3_POST(EAX, ECX, EBX, EDI, EDX)
+}
+
+func CALC_129() {
+ CALC_F3_PRE(0x194, ESI)
+ PRECALC_33(Y14, Y15)
+ CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
+}
+
+func CALC_130() {
+ CALC_F3_PRE(0x198, EBX)
+ PRECALC_34(Y8)
+ CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
+}
+
+func CALC_131() {
+ CALC_F3_PRE(0x19c, ECX)
+ PRECALC_35(Y15)
+ CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
+}
+
+func CALC_132() {
+ CALC_F3_PRE(0x1b0, EDX)
+ PRECALC_36(Y15)
+ CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
+}
+
+func CALC_133() {
+ CALC_F3_PRE(0x1b4, EAX)
+ PRECALC_37(Y15)
+ CALC_F3_POST(EDX, EBX, ESI, EAX, ECX)
+}
+
+func CALC_134() {
+ CALC_F3_PRE(0x1b8, EDI)
+ CALC_F3_POST(EAX, ECX, EBX, EDI, EDX)
+}
+
+func CALC_135() {
+ CALC_F3_PRE(0x1bc, ESI)
+ PRECALC_39(Y15, 0x60, 0x200)
+ CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
+}
+
+func CALC_136() {
+ CALC_F3_PRE(0x1d0, EBX)
+ PRECALC_32(Y3, Y15)
+ CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
+}
+
+func CALC_137() {
+ CALC_F3_PRE(0x1d4, ECX)
+ PRECALC_33(Y13, Y14)
+ CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
+}
+
+func CALC_138() {
+ CALC_F3_PRE(0x1d8, EDX)
+ PRECALC_34(Y7)
+ CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
+}
+
+func CALC_139() {
+ CALC_F2_PRE(0x1dc, EDX, ECX, EAX)
+ PRECALC_35(Y14)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_140() {
+ CALC_F2_PRE(0x1f0, EAX, EDX, EDI)
+ PRECALC_36(Y14)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_141() {
+ CALC_F2_PRE(0x1f4, EDI, EAX, ESI)
+ PRECALC_37(Y14)
+ CALC_F2_POST(EDI, EDX, ECX, ESI)
+}
+
+func CALC_142() {
+ CALC_F2_PRE(0x1f8, ESI, EDI, EBX)
+ CALC_F2_POST(ESI, EAX, EDX, EBX)
+}
+
+func CALC_143() {
+ CALC_F2_PRE(0x1fc, EBX, ESI, ECX)
+ PRECALC_39(Y14, 0x60, 0x220)
+ CALC_F2_POST(EBX, EDI, EAX, ECX)
+}
+
+func CALC_144() {
+ CALC_F2_PRE(0x210, ECX, EBX, EDX)
+ PRECALC_32(Y15, Y14)
+ CALC_F2_POST(ECX, ESI, EDI, EDX)
+}
+
+func CALC_145() {
+ CALC_F2_PRE(0x214, EDX, ECX, EAX)
+ PRECALC_33(Y12, Y13)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_146() {
+ CALC_F2_PRE(0x218, EAX, EDX, EDI)
+ PRECALC_34(Y5)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_147() {
+ CALC_F2_PRE(0x21c, EDI, EAX, ESI)
+ PRECALC_35(Y13)
+ CALC_F2_POST(EDI, EDX, ECX, ESI)
+}
+
+func CALC_148() {
+ CALC_F2_PRE(0x230, ESI, EDI, EBX)
+ PRECALC_36(Y13)
+ CALC_F2_POST(ESI, EAX, EDX, EBX)
+}
+
+func CALC_149() {
+ CALC_F2_PRE(0x234, EBX, ESI, ECX)
+ PRECALC_37(Y13)
+ CALC_F2_POST(EBX, EDI, EAX, ECX)
+}
+
+func CALC_150() {
+ CALC_F2_PRE(0x238, ECX, EBX, EDX)
+ CALC_F2_POST(ECX, ESI, EDI, EDX)
+}
+
+func CALC_151() {
+ CALC_F2_PRE(0x23c, EDX, ECX, EAX)
+ PRECALC_39(Y13, 0x60, 0x240)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_152() {
+ CALC_F2_PRE(0x250, EAX, EDX, EDI)
+ PRECALC_32(Y14, Y13)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_153() {
+ CALC_F2_PRE(0x254, EDI, EAX, ESI)
+ PRECALC_33(Y8, Y12)
+ CALC_F2_POST(EDI, EDX, ECX, ESI)
+}
+
+func CALC_154() {
+ CALC_F2_PRE(0x258, ESI, EDI, EBX)
+ PRECALC_34(Y3)
+ CALC_F2_POST(ESI, EAX, EDX, EBX)
+}
+
+func CALC_155() {
+ CALC_F2_PRE(0x25c, EBX, ESI, ECX)
+ PRECALC_35(Y12)
+ CALC_F2_POST(EBX, EDI, EAX, ECX)
+}
+
+func CALC_156() {
+ CALC_F2_PRE(0x270, ECX, EBX, EDX)
+ PRECALC_36(Y12)
+ CALC_F2_POST(ECX, ESI, EDI, EDX)
+}
+
+func CALC_157() {
+ CALC_F2_PRE(0x274, EDX, ECX, EAX)
+ PRECALC_37(Y12)
+ CALC_F2_POST(EDX, EBX, ESI, EAX)
+}
+
+func CALC_158() {
+ CALC_F2_PRE(0x278, EAX, EDX, EDI)
+ CALC_F2_POST(EAX, ECX, EBX, EDI)
+}
+
+func CALC_159() {
+ ADDL(Mem{Base: R15}.Offset(0x27c), ESI)
+ LEAL(Mem{Base: SI, Index: AX, Scale: 1}, ESI)
+ RORXL(Imm(0x1b), EDI, R12L)
+ PRECALC_39(Y12, 0x60, 0x260)
+ ADDL(R12L, ESI)
+}
+
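+// CALC drives the two-block main loop. R9 holds the digest pointer, and two
+// W+K scratch areas live on the stack: the PRECALC_* helpers store through
+// R14 while the CALC_* rounds read through R15 (CALC_Fx_PRE adds from
+// R15+offset). The XCHGQ(R15, R14) here and the one at the end of begin swap
+// the two roles, so the schedule for the next pair of blocks is precomputed
+// while the current pair is being hashed.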
+func CALC() {
+ MOVL(Mem{Base: R9}, ECX)
+ MOVL(Mem{Base: R9}.Offset(4), ESI)
+ MOVL(Mem{Base: R9}.Offset(8), EDI)
+ MOVL(Mem{Base: R9}.Offset(12), EAX)
+ MOVL(Mem{Base: R9}.Offset(16), EDX)
+ MOVQ(RSP, R14)
+ LEAQ(Mem{Base: SP}.Offset(2*4*80+32), R15)
+ PRECALC() // Precalc WK for first 2 blocks
+ XCHGQ(R15, R14)
+ loop_avx2()
+ begin()
+}
+
+// This loop is fully unrolled: only the label and the exit check live here,
+// while the loop body is emitted by begin below.
+func loop_avx2() {
+ Label("loop")
+ CMPQ(R10, R8) // the R8 value (set in blockAVX2) is used as a signal of the last block
+ JNE(LabelRef("begin"))
+ VZEROUPPER()
+ RET()
+}
+
+func begin() {
+ Label("begin")
+ CALC_0()
+ CALC_1()
+ CALC_2()
+ CALC_3()
+ CALC_4()
+ CALC_5()
+ CALC_6()
+ CALC_7()
+ CALC_8()
+ CALC_9()
+ CALC_10()
+ CALC_11()
+ CALC_12()
+ CALC_13()
+ CALC_14()
+ CALC_15()
+ CALC_16()
+ CALC_17()
+ CALC_18()
+ CALC_19()
+ CALC_20()
+ CALC_21()
+ CALC_22()
+ CALC_23()
+ CALC_24()
+ CALC_25()
+ CALC_26()
+ CALC_27()
+ CALC_28()
+ CALC_29()
+ CALC_30()
+ CALC_31()
+ CALC_32()
+ CALC_33()
+ CALC_34()
+ CALC_35()
+ CALC_36()
+ CALC_37()
+ CALC_38()
+ CALC_39()
+ CALC_40()
+ CALC_41()
+ CALC_42()
+ CALC_43()
+ CALC_44()
+ CALC_45()
+ CALC_46()
+ CALC_47()
+ CALC_48()
+ CALC_49()
+ CALC_50()
+ CALC_51()
+ CALC_52()
+ CALC_53()
+ CALC_54()
+ CALC_55()
+ CALC_56()
+ CALC_57()
+ CALC_58()
+ CALC_59()
+ ADDQ(Imm(128), R10) // move to next even-64-byte block
+ CMPQ(R10, R11) // is current block the last one?
+ CMOVQCC(R8, R10) // signal the last iteration smartly
+ CALC_60()
+ CALC_61()
+ CALC_62()
+ CALC_63()
+ CALC_64()
+ CALC_65()
+ CALC_66()
+ CALC_67()
+ CALC_68()
+ CALC_69()
+ CALC_70()
+ CALC_71()
+ CALC_72()
+ CALC_73()
+ CALC_74()
+ CALC_75()
+ CALC_76()
+ CALC_77()
+ CALC_78()
+ CALC_79()
+ UPDATE_HASH(EAX, EDX, EBX, ESI, EDI)
+ CMPQ(R10, R8) // is current block the last one?
+ JE(LabelRef("loop"))
+ MOVL(EDX, ECX)
+ CALC_80()
+ CALC_81()
+ CALC_82()
+ CALC_83()
+ CALC_84()
+ CALC_85()
+ CALC_86()
+ CALC_87()
+ CALC_88()
+ CALC_89()
+ CALC_90()
+ CALC_91()
+ CALC_92()
+ CALC_93()
+ CALC_94()
+ CALC_95()
+ CALC_96()
+ CALC_97()
+ CALC_98()
+ CALC_99()
+ CALC_100()
+ CALC_101()
+ CALC_102()
+ CALC_103()
+ CALC_104()
+ CALC_105()
+ CALC_106()
+ CALC_107()
+ CALC_108()
+ CALC_109()
+ CALC_110()
+ CALC_111()
+ CALC_112()
+ CALC_113()
+ CALC_114()
+ CALC_115()
+ CALC_116()
+ CALC_117()
+ CALC_118()
+ CALC_119()
+ CALC_120()
+ CALC_121()
+ CALC_122()
+ CALC_123()
+ CALC_124()
+ CALC_125()
+ CALC_126()
+ CALC_127()
+ CALC_128()
+ CALC_129()
+ CALC_130()
+ CALC_131()
+ CALC_132()
+ CALC_133()
+ CALC_134()
+ CALC_135()
+ CALC_136()
+ CALC_137()
+ CALC_138()
+ CALC_139()
+ ADDQ(Imm(128), R13) // move to next even-64-byte block
+ CMPQ(R13, R11) // is current block the last one?
+ CMOVQCC(R8, R10)
+ CALC_140()
+ CALC_141()
+ CALC_142()
+ CALC_143()
+ CALC_144()
+ CALC_145()
+ CALC_146()
+ CALC_147()
+ CALC_148()
+ CALC_149()
+ CALC_150()
+ CALC_151()
+ CALC_152()
+ CALC_153()
+ CALC_154()
+ CALC_155()
+ CALC_156()
+ CALC_157()
+ CALC_158()
+ CALC_159()
+ UPDATE_HASH(ESI, EDI, EDX, ECX, EBX)
+ MOVL(ESI, R12L)
+ MOVL(EDI, ESI)
+ MOVL(EDX, EDI)
+ MOVL(EBX, EDX)
+ MOVL(ECX, EAX)
+ MOVL(R12L, ECX)
+ XCHGQ(R15, R14)
+ JMP(LabelRef("loop"))
+}
+
+func blockAVX2() {
+ Implement("blockAVX2")
+ AllocLocal(1408)
+
+ Load(Param("dig"), RDI)
+ Load(Param("p").Base(), RSI)
+ Load(Param("p").Len(), RDX)
+ SHRQ(Imm(6), RDX)
+ SHLQ(Imm(6), RDX)
+
+ K_XMM_AR := K_XMM_AR_DATA()
+ LEAQ(K_XMM_AR, R8)
+
+ MOVQ(RDI, R9)
+ MOVQ(RSI, R10)
+ LEAQ(Mem{Base: SI}.Offset(64), R13)
+
+ ADDQ(RSI, RDX)
+ ADDQ(Imm(64), RDX)
+ MOVQ(RDX, R11)
+
+ CMPQ(R13, R11)
+ CMOVQCC(R8, R13)
+
+ BSWAP_SHUFB_CTL := BSWAP_SHUFB_CTL_DATA()
+ VMOVDQU(BSWAP_SHUFB_CTL, Y10)
+ CALC()
+}
+
+// ##~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
+
+// Pointers for memoizing Data section symbols
+var (
+ K_XMM_AR_ptr, BSWAP_SHUFB_CTL_ptr *Mem
+)
+
+// Round constants used to build the K_XMM_AR data section.
+
+var _K = []uint32{
+ 0x5A827999,
+ 0x6ED9EBA1,
+ 0x8F1BBCDC,
+ 0xCA62C1D6,
+}
+
+func K_XMM_AR_DATA() Mem {
+ if K_XMM_AR_ptr != nil {
+ return *K_XMM_AR_ptr
+ }
+
+ K_XMM_AR := GLOBL("K_XMM_AR", RODATA)
+ K_XMM_AR_ptr = &K_XMM_AR
+
+ offset_idx := 0
+ for _, v := range _K {
+ DATA((offset_idx+0)*4, U32(v))
+ DATA((offset_idx+1)*4, U32(v))
+ DATA((offset_idx+2)*4, U32(v))
+ DATA((offset_idx+3)*4, U32(v))
+ DATA((offset_idx+4)*4, U32(v))
+ DATA((offset_idx+5)*4, U32(v))
+ DATA((offset_idx+6)*4, U32(v))
+ DATA((offset_idx+7)*4, U32(v))
+ offset_idx += 8
+ }
+ return K_XMM_AR
+}
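+
+// As a sketch (illustrative only), the loop above lays the table out as each
+// round constant repeated eight times, one full 256-bit row per constant, so
+// a single VPADDD can add a constant to eight W values at once (this is why
+// the K_OFFSET arguments step by 0x20).
+func kTableSketch() []uint32 {
+ t := make([]uint32, 0, 8*len(_K))
+ for _, k := range _K {
+ for j := 0; j < 8; j++ {
+ t = append(t, k)
+ }
+ }
+ return t
+}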
+
+var BSWAP_SHUFB_CTL_CONSTANTS = [8]uint32{
+ 0x00010203,
+ 0x04050607,
+ 0x08090a0b,
+ 0x0c0d0e0f,
+ 0x00010203,
+ 0x04050607,
+ 0x08090a0b,
+ 0x0c0d0e0f,
+}
+
+func BSWAP_SHUFB_CTL_DATA() Mem {
+ if BSWAP_SHUFB_CTL_ptr != nil {
+ return *BSWAP_SHUFB_CTL_ptr
+ }
+
+ BSWAP_SHUFB_CTL := GLOBL("BSWAP_SHUFB_CTL", RODATA)
+ BSWAP_SHUFB_CTL_ptr = &BSWAP_SHUFB_CTL
+ for i, v := range BSWAP_SHUFB_CTL_CONSTANTS {
+ DATA(i*4, U32(v))
+ }
+ return BSWAP_SHUFB_CTL
+}
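+
+// Scalar model (illustrative only) of what VPSHUFB does with the mask above on
+// one 128-bit lane: since the U32 values are stored little-endian, the mask
+// bytes in memory are 3,2,1,0, 7,6,5,4, ..., which reverses the bytes of each
+// 32-bit word, i.e. the byte swap of the big-endian message words.
+func bswapLaneSketch(src [16]byte) [16]byte {
+ mask := [16]byte{3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}
+ var dst [16]byte
+ for i, m := range mask {
+ dst[i] = src[m] // PSHUFB: dst[i] = src[mask[i]&0x0f] when the high bit of mask[i] is clear
+ }
+ return dst
+}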
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// AVX2 version by Intel, same algorithm as code in Linux kernel:
-// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
-// Authors:
-// Ilya Albrekht <ilya.albrekht@intel.com>
-// Maxim Locktyukhin <maxim.locktyukhin@intel.com>
-// Ronen Zohar <ronen.zohar@intel.com>
-// Chandramouli Narayanan <mouli@linux.intel.com>
+// Code generated by command: go run sha1block_amd64_asm.go -out ../sha1block_amd64.s -pkg sha1. DO NOT EDIT.
//go:build !purego
#include "textflag.h"
-// SHA-1 block routine. See sha1block.go for Go equivalent.
-//
-// There are 80 rounds of 4 types:
-// - rounds 0-15 are type 1 and load data (ROUND1 macro).
-// - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
-// - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
-// - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
-// - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
-//
-// Each round loads or shuffles the data, then computes a per-round
-// function of b, c, d, and then mixes the result into and rotates the
-// five registers a, b, c, d, e holding the intermediate results.
-//
-// The register rotation is implemented by rotating the arguments to
-// the round macros instead of by explicit move instructions.
-
-#define LOAD(index) \
- MOVL (index*4)(SI), R10; \
- BSWAPL R10; \
- MOVL R10, (index*4)(SP)
-
-#define SHUFFLE(index) \
- MOVL (((index)&0xf)*4)(SP), R10; \
- XORL (((index-3)&0xf)*4)(SP), R10; \
- XORL (((index-8)&0xf)*4)(SP), R10; \
- XORL (((index-14)&0xf)*4)(SP), R10; \
- ROLL $1, R10; \
- MOVL R10, (((index)&0xf)*4)(SP)
-
-#define FUNC1(a, b, c, d, e) \
- MOVL d, R9; \
- XORL c, R9; \
- ANDL b, R9; \
- XORL d, R9
-
-#define FUNC2(a, b, c, d, e) \
- MOVL b, R9; \
- XORL c, R9; \
- XORL d, R9
-
-#define FUNC3(a, b, c, d, e) \
- MOVL b, R8; \
- ORL c, R8; \
- ANDL d, R8; \
- MOVL b, R9; \
- ANDL c, R9; \
- ORL R8, R9
-
-#define FUNC4 FUNC2
-
-#define MIX(a, b, c, d, e, const) \
- ROLL $30, b; \
- ADDL R9, e; \
- MOVL a, R8; \
- ROLL $5, R8; \
- LEAL const(e)(R10*1), e; \
- ADDL R8, e
-
-#define ROUND1(a, b, c, d, e, index) \
- LOAD(index); \
- FUNC1(a, b, c, d, e); \
- MIX(a, b, c, d, e, 0x5A827999)
-
-#define ROUND1x(a, b, c, d, e, index) \
- SHUFFLE(index); \
- FUNC1(a, b, c, d, e); \
- MIX(a, b, c, d, e, 0x5A827999)
-
-#define ROUND2(a, b, c, d, e, index) \
- SHUFFLE(index); \
- FUNC2(a, b, c, d, e); \
- MIX(a, b, c, d, e, 0x6ED9EBA1)
-
-#define ROUND3(a, b, c, d, e, index) \
- SHUFFLE(index); \
- FUNC3(a, b, c, d, e); \
- MIX(a, b, c, d, e, 0x8F1BBCDC)
-
-#define ROUND4(a, b, c, d, e, index) \
- SHUFFLE(index); \
- FUNC4(a, b, c, d, e); \
- MIX(a, b, c, d, e, 0xCA62C1D6)
-
-TEXT ·blockAMD64(SB),NOSPLIT,$64-32
- MOVQ dig+0(FP), BP
- MOVQ p_base+8(FP), SI
- MOVQ p_len+16(FP), DX
- SHRQ $6, DX
- SHLQ $6, DX
-
- LEAQ (SI)(DX*1), DI
- MOVL (0*4)(BP), AX
- MOVL (1*4)(BP), BX
- MOVL (2*4)(BP), CX
- MOVL (3*4)(BP), DX
- MOVL (4*4)(BP), BP
-
- CMPQ SI, DI
- JEQ end
+// func blockAMD64(dig *digest, p []byte)
+TEXT ·blockAMD64(SB), NOSPLIT, $64-32
+ MOVQ dig+0(FP), BP
+ MOVQ p_base+8(FP), SI
+ MOVQ p_len+16(FP), DX
+ SHRQ $0x06, DX
+ SHLQ $0x06, DX
+ LEAQ (SI)(DX*1), DI
+ MOVL (BP), AX
+ MOVL 4(BP), BX
+ MOVL 8(BP), CX
+ MOVL 12(BP), DX
+ MOVL 16(BP), BP
+ CMPQ SI, DI
+ JEQ end
loop:
- MOVL AX, R11
- MOVL BX, R12
- MOVL CX, R13
- MOVL DX, R14
- MOVL BP, R15
-
- ROUND1(AX, BX, CX, DX, BP, 0)
- ROUND1(BP, AX, BX, CX, DX, 1)
- ROUND1(DX, BP, AX, BX, CX, 2)
- ROUND1(CX, DX, BP, AX, BX, 3)
- ROUND1(BX, CX, DX, BP, AX, 4)
- ROUND1(AX, BX, CX, DX, BP, 5)
- ROUND1(BP, AX, BX, CX, DX, 6)
- ROUND1(DX, BP, AX, BX, CX, 7)
- ROUND1(CX, DX, BP, AX, BX, 8)
- ROUND1(BX, CX, DX, BP, AX, 9)
- ROUND1(AX, BX, CX, DX, BP, 10)
- ROUND1(BP, AX, BX, CX, DX, 11)
- ROUND1(DX, BP, AX, BX, CX, 12)
- ROUND1(CX, DX, BP, AX, BX, 13)
- ROUND1(BX, CX, DX, BP, AX, 14)
- ROUND1(AX, BX, CX, DX, BP, 15)
-
- ROUND1x(BP, AX, BX, CX, DX, 16)
- ROUND1x(DX, BP, AX, BX, CX, 17)
- ROUND1x(CX, DX, BP, AX, BX, 18)
- ROUND1x(BX, CX, DX, BP, AX, 19)
-
- ROUND2(AX, BX, CX, DX, BP, 20)
- ROUND2(BP, AX, BX, CX, DX, 21)
- ROUND2(DX, BP, AX, BX, CX, 22)
- ROUND2(CX, DX, BP, AX, BX, 23)
- ROUND2(BX, CX, DX, BP, AX, 24)
- ROUND2(AX, BX, CX, DX, BP, 25)
- ROUND2(BP, AX, BX, CX, DX, 26)
- ROUND2(DX, BP, AX, BX, CX, 27)
- ROUND2(CX, DX, BP, AX, BX, 28)
- ROUND2(BX, CX, DX, BP, AX, 29)
- ROUND2(AX, BX, CX, DX, BP, 30)
- ROUND2(BP, AX, BX, CX, DX, 31)
- ROUND2(DX, BP, AX, BX, CX, 32)
- ROUND2(CX, DX, BP, AX, BX, 33)
- ROUND2(BX, CX, DX, BP, AX, 34)
- ROUND2(AX, BX, CX, DX, BP, 35)
- ROUND2(BP, AX, BX, CX, DX, 36)
- ROUND2(DX, BP, AX, BX, CX, 37)
- ROUND2(CX, DX, BP, AX, BX, 38)
- ROUND2(BX, CX, DX, BP, AX, 39)
-
- ROUND3(AX, BX, CX, DX, BP, 40)
- ROUND3(BP, AX, BX, CX, DX, 41)
- ROUND3(DX, BP, AX, BX, CX, 42)
- ROUND3(CX, DX, BP, AX, BX, 43)
- ROUND3(BX, CX, DX, BP, AX, 44)
- ROUND3(AX, BX, CX, DX, BP, 45)
- ROUND3(BP, AX, BX, CX, DX, 46)
- ROUND3(DX, BP, AX, BX, CX, 47)
- ROUND3(CX, DX, BP, AX, BX, 48)
- ROUND3(BX, CX, DX, BP, AX, 49)
- ROUND3(AX, BX, CX, DX, BP, 50)
- ROUND3(BP, AX, BX, CX, DX, 51)
- ROUND3(DX, BP, AX, BX, CX, 52)
- ROUND3(CX, DX, BP, AX, BX, 53)
- ROUND3(BX, CX, DX, BP, AX, 54)
- ROUND3(AX, BX, CX, DX, BP, 55)
- ROUND3(BP, AX, BX, CX, DX, 56)
- ROUND3(DX, BP, AX, BX, CX, 57)
- ROUND3(CX, DX, BP, AX, BX, 58)
- ROUND3(BX, CX, DX, BP, AX, 59)
-
- ROUND4(AX, BX, CX, DX, BP, 60)
- ROUND4(BP, AX, BX, CX, DX, 61)
- ROUND4(DX, BP, AX, BX, CX, 62)
- ROUND4(CX, DX, BP, AX, BX, 63)
- ROUND4(BX, CX, DX, BP, AX, 64)
- ROUND4(AX, BX, CX, DX, BP, 65)
- ROUND4(BP, AX, BX, CX, DX, 66)
- ROUND4(DX, BP, AX, BX, CX, 67)
- ROUND4(CX, DX, BP, AX, BX, 68)
- ROUND4(BX, CX, DX, BP, AX, 69)
- ROUND4(AX, BX, CX, DX, BP, 70)
- ROUND4(BP, AX, BX, CX, DX, 71)
- ROUND4(DX, BP, AX, BX, CX, 72)
- ROUND4(CX, DX, BP, AX, BX, 73)
- ROUND4(BX, CX, DX, BP, AX, 74)
- ROUND4(AX, BX, CX, DX, BP, 75)
- ROUND4(BP, AX, BX, CX, DX, 76)
- ROUND4(DX, BP, AX, BX, CX, 77)
- ROUND4(CX, DX, BP, AX, BX, 78)
- ROUND4(BX, CX, DX, BP, AX, 79)
-
- ADDL R11, AX
- ADDL R12, BX
- ADDL R13, CX
- ADDL R14, DX
- ADDL R15, BP
-
- ADDQ $64, SI
- CMPQ SI, DI
- JB loop
+ MOVL AX, R11
+ MOVL BX, R12
+ MOVL CX, R13
+ MOVL DX, R14
+ MOVL BP, R15
+ MOVL (SI), R10
+ BSWAPL R10
+ MOVL R10, (SP)
+ MOVL DX, R9
+ XORL CX, R9
+ ANDL BX, R9
+ XORL DX, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 4(SI), R10
+ BSWAPL R10
+ MOVL R10, 4(SP)
+ MOVL CX, R9
+ XORL BX, R9
+ ANDL AX, R9
+ XORL CX, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 8(SI), R10
+ BSWAPL R10
+ MOVL R10, 8(SP)
+ MOVL BX, R9
+ XORL AX, R9
+ ANDL BP, R9
+ XORL BX, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 12(SI), R10
+ BSWAPL R10
+ MOVL R10, 12(SP)
+ MOVL AX, R9
+ XORL BP, R9
+ ANDL DX, R9
+ XORL AX, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 16(SI), R10
+ BSWAPL R10
+ MOVL R10, 16(SP)
+ MOVL BP, R9
+ XORL DX, R9
+ ANDL CX, R9
+ XORL BP, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 20(SI), R10
+ BSWAPL R10
+ MOVL R10, 20(SP)
+ MOVL DX, R9
+ XORL CX, R9
+ ANDL BX, R9
+ XORL DX, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 24(SI), R10
+ BSWAPL R10
+ MOVL R10, 24(SP)
+ MOVL CX, R9
+ XORL BX, R9
+ ANDL AX, R9
+ XORL CX, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 28(SI), R10
+ BSWAPL R10
+ MOVL R10, 28(SP)
+ MOVL BX, R9
+ XORL AX, R9
+ ANDL BP, R9
+ XORL BX, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 32(SI), R10
+ BSWAPL R10
+ MOVL R10, 32(SP)
+ MOVL AX, R9
+ XORL BP, R9
+ ANDL DX, R9
+ XORL AX, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 36(SI), R10
+ BSWAPL R10
+ MOVL R10, 36(SP)
+ MOVL BP, R9
+ XORL DX, R9
+ ANDL CX, R9
+ XORL BP, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 40(SI), R10
+ BSWAPL R10
+ MOVL R10, 40(SP)
+ MOVL DX, R9
+ XORL CX, R9
+ ANDL BX, R9
+ XORL DX, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 44(SI), R10
+ BSWAPL R10
+ MOVL R10, 44(SP)
+ MOVL CX, R9
+ XORL BX, R9
+ ANDL AX, R9
+ XORL CX, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 48(SI), R10
+ BSWAPL R10
+ MOVL R10, 48(SP)
+ MOVL BX, R9
+ XORL AX, R9
+ ANDL BP, R9
+ XORL BX, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 52(SI), R10
+ BSWAPL R10
+ MOVL R10, 52(SP)
+ MOVL AX, R9
+ XORL BP, R9
+ ANDL DX, R9
+ XORL AX, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 56(SI), R10
+ BSWAPL R10
+ MOVL R10, 56(SP)
+ MOVL BP, R9
+ XORL DX, R9
+ ANDL CX, R9
+ XORL BP, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 60(SI), R10
+ BSWAPL R10
+ MOVL R10, 60(SP)
+ MOVL DX, R9
+ XORL CX, R9
+ ANDL BX, R9
+ XORL DX, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL (SP), R10
+ XORL 52(SP), R10
+ XORL 32(SP), R10
+ XORL 8(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, (SP)
+ MOVL CX, R9
+ XORL BX, R9
+ ANDL AX, R9
+ XORL CX, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 4(SP), R10
+ XORL 56(SP), R10
+ XORL 36(SP), R10
+ XORL 12(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 4(SP)
+ MOVL BX, R9
+ XORL AX, R9
+ ANDL BP, R9
+ XORL BX, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 8(SP), R10
+ XORL 60(SP), R10
+ XORL 40(SP), R10
+ XORL 16(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 8(SP)
+ MOVL AX, R9
+ XORL BP, R9
+ ANDL DX, R9
+ XORL AX, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 12(SP), R10
+ XORL (SP), R10
+ XORL 44(SP), R10
+ XORL 20(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 12(SP)
+ MOVL BP, R9
+ XORL DX, R9
+ ANDL CX, R9
+ XORL BP, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 1518500249(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 16(SP), R10
+ XORL 4(SP), R10
+ XORL 48(SP), R10
+ XORL 24(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 16(SP)
+ MOVL BX, R9
+ XORL CX, R9
+ XORL DX, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 20(SP), R10
+ XORL 8(SP), R10
+ XORL 52(SP), R10
+ XORL 28(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 20(SP)
+ MOVL AX, R9
+ XORL BX, R9
+ XORL CX, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 24(SP), R10
+ XORL 12(SP), R10
+ XORL 56(SP), R10
+ XORL 32(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 24(SP)
+ MOVL BP, R9
+ XORL AX, R9
+ XORL BX, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 28(SP), R10
+ XORL 16(SP), R10
+ XORL 60(SP), R10
+ XORL 36(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 28(SP)
+ MOVL DX, R9
+ XORL BP, R9
+ XORL AX, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 32(SP), R10
+ XORL 20(SP), R10
+ XORL (SP), R10
+ XORL 40(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 32(SP)
+ MOVL CX, R9
+ XORL DX, R9
+ XORL BP, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 36(SP), R10
+ XORL 24(SP), R10
+ XORL 4(SP), R10
+ XORL 44(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 36(SP)
+ MOVL BX, R9
+ XORL CX, R9
+ XORL DX, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 40(SP), R10
+ XORL 28(SP), R10
+ XORL 8(SP), R10
+ XORL 48(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 40(SP)
+ MOVL AX, R9
+ XORL BX, R9
+ XORL CX, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 44(SP), R10
+ XORL 32(SP), R10
+ XORL 12(SP), R10
+ XORL 52(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 44(SP)
+ MOVL BP, R9
+ XORL AX, R9
+ XORL BX, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 48(SP), R10
+ XORL 36(SP), R10
+ XORL 16(SP), R10
+ XORL 56(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 48(SP)
+ MOVL DX, R9
+ XORL BP, R9
+ XORL AX, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 52(SP), R10
+ XORL 40(SP), R10
+ XORL 20(SP), R10
+ XORL 60(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 52(SP)
+ MOVL CX, R9
+ XORL DX, R9
+ XORL BP, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 56(SP), R10
+ XORL 44(SP), R10
+ XORL 24(SP), R10
+ XORL (SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 56(SP)
+ MOVL BX, R9
+ XORL CX, R9
+ XORL DX, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 60(SP), R10
+ XORL 48(SP), R10
+ XORL 28(SP), R10
+ XORL 4(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 60(SP)
+ MOVL AX, R9
+ XORL BX, R9
+ XORL CX, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL (SP), R10
+ XORL 52(SP), R10
+ XORL 32(SP), R10
+ XORL 8(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, (SP)
+ MOVL BP, R9
+ XORL AX, R9
+ XORL BX, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 4(SP), R10
+ XORL 56(SP), R10
+ XORL 36(SP), R10
+ XORL 12(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 4(SP)
+ MOVL DX, R9
+ XORL BP, R9
+ XORL AX, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 8(SP), R10
+ XORL 60(SP), R10
+ XORL 40(SP), R10
+ XORL 16(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 8(SP)
+ MOVL CX, R9
+ XORL DX, R9
+ XORL BP, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 12(SP), R10
+ XORL (SP), R10
+ XORL 44(SP), R10
+ XORL 20(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 12(SP)
+ MOVL BX, R9
+ XORL CX, R9
+ XORL DX, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 16(SP), R10
+ XORL 4(SP), R10
+ XORL 48(SP), R10
+ XORL 24(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 16(SP)
+ MOVL AX, R9
+ XORL BX, R9
+ XORL CX, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 20(SP), R10
+ XORL 8(SP), R10
+ XORL 52(SP), R10
+ XORL 28(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 20(SP)
+ MOVL BP, R9
+ XORL AX, R9
+ XORL BX, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 24(SP), R10
+ XORL 12(SP), R10
+ XORL 56(SP), R10
+ XORL 32(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 24(SP)
+ MOVL DX, R9
+ XORL BP, R9
+ XORL AX, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 28(SP), R10
+ XORL 16(SP), R10
+ XORL 60(SP), R10
+ XORL 36(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 28(SP)
+ MOVL CX, R9
+ XORL DX, R9
+ XORL BP, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 1859775393(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 32(SP), R10
+ XORL 20(SP), R10
+ XORL (SP), R10
+ XORL 40(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 32(SP)
+ MOVL BX, R8
+ ORL CX, R8
+ ANDL DX, R8
+ MOVL BX, R9
+ ANDL CX, R9
+ ORL R8, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 36(SP), R10
+ XORL 24(SP), R10
+ XORL 4(SP), R10
+ XORL 44(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 36(SP)
+ MOVL AX, R8
+ ORL BX, R8
+ ANDL CX, R8
+ MOVL AX, R9
+ ANDL BX, R9
+ ORL R8, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 40(SP), R10
+ XORL 28(SP), R10
+ XORL 8(SP), R10
+ XORL 48(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 40(SP)
+ MOVL BP, R8
+ ORL AX, R8
+ ANDL BX, R8
+ MOVL BP, R9
+ ANDL AX, R9
+ ORL R8, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 44(SP), R10
+ XORL 32(SP), R10
+ XORL 12(SP), R10
+ XORL 52(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 44(SP)
+ MOVL DX, R8
+ ORL BP, R8
+ ANDL AX, R8
+ MOVL DX, R9
+ ANDL BP, R9
+ ORL R8, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 48(SP), R10
+ XORL 36(SP), R10
+ XORL 16(SP), R10
+ XORL 56(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 48(SP)
+ MOVL CX, R8
+ ORL DX, R8
+ ANDL BP, R8
+ MOVL CX, R9
+ ANDL DX, R9
+ ORL R8, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 52(SP), R10
+ XORL 40(SP), R10
+ XORL 20(SP), R10
+ XORL 60(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 52(SP)
+ MOVL BX, R8
+ ORL CX, R8
+ ANDL DX, R8
+ MOVL BX, R9
+ ANDL CX, R9
+ ORL R8, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 56(SP), R10
+ XORL 44(SP), R10
+ XORL 24(SP), R10
+ XORL (SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 56(SP)
+ MOVL AX, R8
+ ORL BX, R8
+ ANDL CX, R8
+ MOVL AX, R9
+ ANDL BX, R9
+ ORL R8, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 60(SP), R10
+ XORL 48(SP), R10
+ XORL 28(SP), R10
+ XORL 4(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 60(SP)
+ MOVL BP, R8
+ ORL AX, R8
+ ANDL BX, R8
+ MOVL BP, R9
+ ANDL AX, R9
+ ORL R8, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL (SP), R10
+ XORL 52(SP), R10
+ XORL 32(SP), R10
+ XORL 8(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, (SP)
+ MOVL DX, R8
+ ORL BP, R8
+ ANDL AX, R8
+ MOVL DX, R9
+ ANDL BP, R9
+ ORL R8, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 4(SP), R10
+ XORL 56(SP), R10
+ XORL 36(SP), R10
+ XORL 12(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 4(SP)
+ MOVL CX, R8
+ ORL DX, R8
+ ANDL BP, R8
+ MOVL CX, R9
+ ANDL DX, R9
+ ORL R8, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 8(SP), R10
+ XORL 60(SP), R10
+ XORL 40(SP), R10
+ XORL 16(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 8(SP)
+ MOVL BX, R8
+ ORL CX, R8
+ ANDL DX, R8
+ MOVL BX, R9
+ ANDL CX, R9
+ ORL R8, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 12(SP), R10
+ XORL (SP), R10
+ XORL 44(SP), R10
+ XORL 20(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 12(SP)
+ MOVL AX, R8
+ ORL BX, R8
+ ANDL CX, R8
+ MOVL AX, R9
+ ANDL BX, R9
+ ORL R8, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 16(SP), R10
+ XORL 4(SP), R10
+ XORL 48(SP), R10
+ XORL 24(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 16(SP)
+ MOVL BP, R8
+ ORL AX, R8
+ ANDL BX, R8
+ MOVL BP, R9
+ ANDL AX, R9
+ ORL R8, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 20(SP), R10
+ XORL 8(SP), R10
+ XORL 52(SP), R10
+ XORL 28(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 20(SP)
+ MOVL DX, R8
+ ORL BP, R8
+ ANDL AX, R8
+ MOVL DX, R9
+ ANDL BP, R9
+ ORL R8, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 24(SP), R10
+ XORL 12(SP), R10
+ XORL 56(SP), R10
+ XORL 32(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 24(SP)
+ MOVL CX, R8
+ ORL DX, R8
+ ANDL BP, R8
+ MOVL CX, R9
+ ANDL DX, R9
+ ORL R8, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 28(SP), R10
+ XORL 16(SP), R10
+ XORL 60(SP), R10
+ XORL 36(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 28(SP)
+ MOVL BX, R8
+ ORL CX, R8
+ ANDL DX, R8
+ MOVL BX, R9
+ ANDL CX, R9
+ ORL R8, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 32(SP), R10
+ XORL 20(SP), R10
+ XORL (SP), R10
+ XORL 40(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 32(SP)
+ MOVL AX, R8
+ ORL BX, R8
+ ANDL CX, R8
+ MOVL AX, R9
+ ANDL BX, R9
+ ORL R8, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 36(SP), R10
+ XORL 24(SP), R10
+ XORL 4(SP), R10
+ XORL 44(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 36(SP)
+ MOVL BP, R8
+ ORL AX, R8
+ ANDL BX, R8
+ MOVL BP, R9
+ ANDL AX, R9
+ ORL R8, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 40(SP), R10
+ XORL 28(SP), R10
+ XORL 8(SP), R10
+ XORL 48(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 40(SP)
+ MOVL DX, R8
+ ORL BP, R8
+ ANDL AX, R8
+ MOVL DX, R9
+ ANDL BP, R9
+ ORL R8, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 44(SP), R10
+ XORL 32(SP), R10
+ XORL 12(SP), R10
+ XORL 52(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 44(SP)
+ MOVL CX, R8
+ ORL DX, R8
+ ANDL BP, R8
+ MOVL CX, R9
+ ANDL DX, R9
+ ORL R8, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 2400959708(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 48(SP), R10
+ XORL 36(SP), R10
+ XORL 16(SP), R10
+ XORL 56(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 48(SP)
+ MOVL BX, R9
+ XORL CX, R9
+ XORL DX, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 52(SP), R10
+ XORL 40(SP), R10
+ XORL 20(SP), R10
+ XORL 60(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 52(SP)
+ MOVL AX, R9
+ XORL BX, R9
+ XORL CX, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 56(SP), R10
+ XORL 44(SP), R10
+ XORL 24(SP), R10
+ XORL (SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 56(SP)
+ MOVL BP, R9
+ XORL AX, R9
+ XORL BX, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 60(SP), R10
+ XORL 48(SP), R10
+ XORL 28(SP), R10
+ XORL 4(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 60(SP)
+ MOVL DX, R9
+ XORL BP, R9
+ XORL AX, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL (SP), R10
+ XORL 52(SP), R10
+ XORL 32(SP), R10
+ XORL 8(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, (SP)
+ MOVL CX, R9
+ XORL DX, R9
+ XORL BP, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 4(SP), R10
+ XORL 56(SP), R10
+ XORL 36(SP), R10
+ XORL 12(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 4(SP)
+ MOVL BX, R9
+ XORL CX, R9
+ XORL DX, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 8(SP), R10
+ XORL 60(SP), R10
+ XORL 40(SP), R10
+ XORL 16(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 8(SP)
+ MOVL AX, R9
+ XORL BX, R9
+ XORL CX, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 12(SP), R10
+ XORL (SP), R10
+ XORL 44(SP), R10
+ XORL 20(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 12(SP)
+ MOVL BP, R9
+ XORL AX, R9
+ XORL BX, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 16(SP), R10
+ XORL 4(SP), R10
+ XORL 48(SP), R10
+ XORL 24(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 16(SP)
+ MOVL DX, R9
+ XORL BP, R9
+ XORL AX, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 20(SP), R10
+ XORL 8(SP), R10
+ XORL 52(SP), R10
+ XORL 28(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 20(SP)
+ MOVL CX, R9
+ XORL DX, R9
+ XORL BP, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 24(SP), R10
+ XORL 12(SP), R10
+ XORL 56(SP), R10
+ XORL 32(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 24(SP)
+ MOVL BX, R9
+ XORL CX, R9
+ XORL DX, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 28(SP), R10
+ XORL 16(SP), R10
+ XORL 60(SP), R10
+ XORL 36(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 28(SP)
+ MOVL AX, R9
+ XORL BX, R9
+ XORL CX, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 32(SP), R10
+ XORL 20(SP), R10
+ XORL (SP), R10
+ XORL 40(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 32(SP)
+ MOVL BP, R9
+ XORL AX, R9
+ XORL BX, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 36(SP), R10
+ XORL 24(SP), R10
+ XORL 4(SP), R10
+ XORL 44(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 36(SP)
+ MOVL DX, R9
+ XORL BP, R9
+ XORL AX, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 40(SP), R10
+ XORL 28(SP), R10
+ XORL 8(SP), R10
+ XORL 48(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 40(SP)
+ MOVL CX, R9
+ XORL DX, R9
+ XORL BP, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(AX)(R10*1), AX
+ ADDL R8, AX
+ MOVL 44(SP), R10
+ XORL 32(SP), R10
+ XORL 12(SP), R10
+ XORL 52(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 44(SP)
+ MOVL BX, R9
+ XORL CX, R9
+ XORL DX, R9
+ ROLL $0x1e, BX
+ ADDL R9, BP
+ MOVL AX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(BP)(R10*1), BP
+ ADDL R8, BP
+ MOVL 48(SP), R10
+ XORL 36(SP), R10
+ XORL 16(SP), R10
+ XORL 56(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 48(SP)
+ MOVL AX, R9
+ XORL BX, R9
+ XORL CX, R9
+ ROLL $0x1e, AX
+ ADDL R9, DX
+ MOVL BP, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(DX)(R10*1), DX
+ ADDL R8, DX
+ MOVL 52(SP), R10
+ XORL 40(SP), R10
+ XORL 20(SP), R10
+ XORL 60(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 52(SP)
+ MOVL BP, R9
+ XORL AX, R9
+ XORL BX, R9
+ ROLL $0x1e, BP
+ ADDL R9, CX
+ MOVL DX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(CX)(R10*1), CX
+ ADDL R8, CX
+ MOVL 56(SP), R10
+ XORL 44(SP), R10
+ XORL 24(SP), R10
+ XORL (SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 56(SP)
+ MOVL DX, R9
+ XORL BP, R9
+ XORL AX, R9
+ ROLL $0x1e, DX
+ ADDL R9, BX
+ MOVL CX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(BX)(R10*1), BX
+ ADDL R8, BX
+ MOVL 60(SP), R10
+ XORL 48(SP), R10
+ XORL 28(SP), R10
+ XORL 4(SP), R10
+ ROLL $0x01, R10
+ MOVL R10, 60(SP)
+ MOVL CX, R9
+ XORL DX, R9
+ XORL BP, R9
+ ROLL $0x1e, CX
+ ADDL R9, AX
+ MOVL BX, R8
+ ROLL $0x05, R8
+ LEAL 3395469782(AX)(R10*1), AX
+ ADDL R8, AX
+ ADDL R11, AX
+ ADDL R12, BX
+ ADDL R13, CX
+ ADDL R14, DX
+ ADDL R15, BP
+ ADDQ $0x40, SI
+ CMPQ SI, DI
+ JB loop
end:
- MOVQ dig+0(FP), DI
- MOVL AX, (0*4)(DI)
- MOVL BX, (1*4)(DI)
- MOVL CX, (2*4)(DI)
- MOVL DX, (3*4)(DI)
- MOVL BP, (4*4)(DI)
+ MOVQ dig+0(FP), DI
+ MOVL AX, (DI)
+ MOVL BX, 4(DI)
+ MOVL CX, 8(DI)
+ MOVL DX, 12(DI)
+ MOVL BP, 16(DI)
RET
+// func blockAVX2(dig *digest, p []byte)
+// Requires: AVX, AVX2, BMI, BMI2, CMOV
+TEXT ·blockAVX2(SB), $1408-32
+ MOVQ dig+0(FP), DI
+ MOVQ p_base+8(FP), SI
+ MOVQ p_len+16(FP), DX
+ SHRQ $0x06, DX
+ SHLQ $0x06, DX
+ LEAQ K_XMM_AR<>+0(SB), R8
+ MOVQ DI, R9
+ MOVQ SI, R10
+ LEAQ 64(SI), R13
+ ADDQ SI, DX
+ ADDQ $0x40, DX
+ MOVQ DX, R11
+ CMPQ R13, R11
+ CMOVQCC R8, R13
+ VMOVDQU BSWAP_SHUFB_CTL<>+0(SB), Y10
+ MOVL (R9), CX
+ MOVL 4(R9), SI
+ MOVL 8(R9), DI
+ MOVL 12(R9), AX
+ MOVL 16(R9), DX
+ MOVQ SP, R14
+ LEAQ 672(SP), R15
+ VMOVDQU (R10), X0
+ VINSERTI128 $0x01, (R13), Y0, Y0
+ VPSHUFB Y10, Y0, Y15
+ VPADDD (R8), Y15, Y0
+ VMOVDQU Y0, (R14)
+ VMOVDQU 16(R10), X0
+ VINSERTI128 $0x01, 16(R13), Y0, Y0
+ VPSHUFB Y10, Y0, Y14
+ VPADDD (R8), Y14, Y0
+ VMOVDQU Y0, 32(R14)
+ VMOVDQU 32(R10), X0
+ VINSERTI128 $0x01, 32(R13), Y0, Y0
+ VPSHUFB Y10, Y0, Y13
+ VPADDD (R8), Y13, Y0
+ VMOVDQU Y0, 64(R14)
+ VMOVDQU 48(R10), X0
+ VINSERTI128 $0x01, 48(R13), Y0, Y0
+ VPSHUFB Y10, Y0, Y12
+ VPADDD (R8), Y12, Y0
+ VMOVDQU Y0, 96(R14)
+ VPALIGNR $0x08, Y15, Y14, Y8
+ VPSRLDQ $0x04, Y12, Y0
+ VPXOR Y13, Y8, Y8
+ VPXOR Y15, Y0, Y0
+ VPXOR Y0, Y8, Y8
+ VPSLLDQ $0x0c, Y8, Y9
+ VPSLLD $0x01, Y8, Y0
+ VPSRLD $0x1f, Y8, Y8
+ VPOR Y8, Y0, Y0
+ VPSLLD $0x02, Y9, Y8
+ VPSRLD $0x1e, Y9, Y9
+ VPXOR Y8, Y0, Y0
+ VPXOR Y9, Y0, Y8
+ VPADDD (R8), Y8, Y0
+ VMOVDQU Y0, 128(R14)
+ VPALIGNR $0x08, Y14, Y13, Y7
+ VPSRLDQ $0x04, Y8, Y0
+ VPXOR Y12, Y7, Y7
+ VPXOR Y14, Y0, Y0
+ VPXOR Y0, Y7, Y7
+ VPSLLDQ $0x0c, Y7, Y9
+ VPSLLD $0x01, Y7, Y0
+ VPSRLD $0x1f, Y7, Y7
+ VPOR Y7, Y0, Y0
+ VPSLLD $0x02, Y9, Y7
+ VPSRLD $0x1e, Y9, Y9
+ VPXOR Y7, Y0, Y0
+ VPXOR Y9, Y0, Y7
+ VPADDD 32(R8), Y7, Y0
+ VMOVDQU Y0, 160(R14)
+ VPALIGNR $0x08, Y13, Y12, Y5
+ VPSRLDQ $0x04, Y7, Y0
+ VPXOR Y8, Y5, Y5
+ VPXOR Y13, Y0, Y0
+ VPXOR Y0, Y5, Y5
+ VPSLLDQ $0x0c, Y5, Y9
+ VPSLLD $0x01, Y5, Y0
+ VPSRLD $0x1f, Y5, Y5
+ VPOR Y5, Y0, Y0
+ VPSLLD $0x02, Y9, Y5
+ VPSRLD $0x1e, Y9, Y9
+ VPXOR Y5, Y0, Y0
+ VPXOR Y9, Y0, Y5
+ VPADDD 32(R8), Y5, Y0
+ VMOVDQU Y0, 192(R14)
+ VPALIGNR $0x08, Y12, Y8, Y3
+ VPSRLDQ $0x04, Y5, Y0
+ VPXOR Y7, Y3, Y3
+ VPXOR Y12, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSLLDQ $0x0c, Y3, Y9
+ VPSLLD $0x01, Y3, Y0
+ VPSRLD $0x1f, Y3, Y3
+ VPOR Y3, Y0, Y0
+ VPSLLD $0x02, Y9, Y3
+ VPSRLD $0x1e, Y9, Y9
+ VPXOR Y3, Y0, Y0
+ VPXOR Y9, Y0, Y3
+ VPADDD 32(R8), Y3, Y0
+ VMOVDQU Y0, 224(R14)
+ VPALIGNR $0x08, Y5, Y3, Y0
+ VPXOR Y14, Y15, Y15
+ VPXOR Y8, Y0, Y0
+ VPXOR Y0, Y15, Y15
+ VPSLLD $0x02, Y15, Y0
+ VPSRLD $0x1e, Y15, Y15
+ VPOR Y15, Y0, Y15
+ VPADDD 32(R8), Y15, Y0
+ VMOVDQU Y0, 256(R14)
+ VPALIGNR $0x08, Y3, Y15, Y0
+ VPXOR Y13, Y14, Y14
+ VPXOR Y7, Y0, Y0
+ VPXOR Y0, Y14, Y14
+ VPSLLD $0x02, Y14, Y0
+ VPSRLD $0x1e, Y14, Y14
+ VPOR Y14, Y0, Y14
+ VPADDD 32(R8), Y14, Y0
+ VMOVDQU Y0, 288(R14)
+ VPALIGNR $0x08, Y15, Y14, Y0
+ VPXOR Y12, Y13, Y13
+ VPXOR Y5, Y0, Y0
+ VPXOR Y0, Y13, Y13
+ VPSLLD $0x02, Y13, Y0
+ VPSRLD $0x1e, Y13, Y13
+ VPOR Y13, Y0, Y13
+ VPADDD 64(R8), Y13, Y0
+ VMOVDQU Y0, 320(R14)
+ VPALIGNR $0x08, Y14, Y13, Y0
+ VPXOR Y8, Y12, Y12
+ VPXOR Y3, Y0, Y0
+ VPXOR Y0, Y12, Y12
+ VPSLLD $0x02, Y12, Y0
+ VPSRLD $0x1e, Y12, Y12
+ VPOR Y12, Y0, Y12
+ VPADDD 64(R8), Y12, Y0
+ VMOVDQU Y0, 352(R14)
+ VPALIGNR $0x08, Y13, Y12, Y0
+ VPXOR Y7, Y8, Y8
+ VPXOR Y15, Y0, Y0
+ VPXOR Y0, Y8, Y8
+ VPSLLD $0x02, Y8, Y0
+ VPSRLD $0x1e, Y8, Y8
+ VPOR Y8, Y0, Y8
+ VPADDD 64(R8), Y8, Y0
+ VMOVDQU Y0, 384(R14)
+ VPALIGNR $0x08, Y12, Y8, Y0
+ VPXOR Y5, Y7, Y7
+ VPXOR Y14, Y0, Y0
+ VPXOR Y0, Y7, Y7
+ VPSLLD $0x02, Y7, Y0
+ VPSRLD $0x1e, Y7, Y7
+ VPOR Y7, Y0, Y7
+ VPADDD 64(R8), Y7, Y0
+ VMOVDQU Y0, 416(R14)
+ VPALIGNR $0x08, Y8, Y7, Y0
+ VPXOR Y3, Y5, Y5
+ VPXOR Y13, Y0, Y0
+ VPXOR Y0, Y5, Y5
+ VPSLLD $0x02, Y5, Y0
+ VPSRLD $0x1e, Y5, Y5
+ VPOR Y5, Y0, Y5
+ VPADDD 64(R8), Y5, Y0
+ VMOVDQU Y0, 448(R14)
+ VPALIGNR $0x08, Y7, Y5, Y0
+ VPXOR Y15, Y3, Y3
+ VPXOR Y12, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSLLD $0x02, Y3, Y0
+ VPSRLD $0x1e, Y3, Y3
+ VPOR Y3, Y0, Y3
+ VPADDD 96(R8), Y3, Y0
+ VMOVDQU Y0, 480(R14)
+ VPALIGNR $0x08, Y5, Y3, Y0
+ VPXOR Y14, Y15, Y15
+ VPXOR Y8, Y0, Y0
+ VPXOR Y0, Y15, Y15
+ VPSLLD $0x02, Y15, Y0
+ VPSRLD $0x1e, Y15, Y15
+ VPOR Y15, Y0, Y15
+ VPADDD 96(R8), Y15, Y0
+ VMOVDQU Y0, 512(R14)
+ VPALIGNR $0x08, Y3, Y15, Y0
+ VPXOR Y13, Y14, Y14
+ VPXOR Y7, Y0, Y0
+ VPXOR Y0, Y14, Y14
+ VPSLLD $0x02, Y14, Y0
+ VPSRLD $0x1e, Y14, Y14
+ VPOR Y14, Y0, Y14
+ VPADDD 96(R8), Y14, Y0
+ VMOVDQU Y0, 544(R14)
+ VPALIGNR $0x08, Y15, Y14, Y0
+ VPXOR Y12, Y13, Y13
+ VPXOR Y5, Y0, Y0
+ VPXOR Y0, Y13, Y13
+ VPSLLD $0x02, Y13, Y0
+ VPSRLD $0x1e, Y13, Y13
+ VPOR Y13, Y0, Y13
+ VPADDD 96(R8), Y13, Y0
+ VMOVDQU Y0, 576(R14)
+ VPALIGNR $0x08, Y14, Y13, Y0
+ VPXOR Y8, Y12, Y12
+ VPXOR Y3, Y0, Y0
+ VPXOR Y0, Y12, Y12
+ VPSLLD $0x02, Y12, Y0
+ VPSRLD $0x1e, Y12, Y12
+ VPOR Y12, Y0, Y12
+ VPADDD 96(R8), Y12, Y0
+ VMOVDQU Y0, 608(R14)
+ XCHGQ R15, R14
-// This is the implementation using AVX2, BMI1 and BMI2. It is based on:
-// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
-// From http://software.intel.com/en-us/articles
-// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
-// This implementation is 2x unrolled, and interleaves vector instructions,
-// used to precompute W, with scalar computation of current round
-// for optimal scheduling.
-
-// Trivial helper macros.
-#define UPDATE_HASH(A,TB,C,D,E) \
- ADDL (R9), A \
- MOVL A, (R9) \
- ADDL 4(R9), TB \
- MOVL TB, 4(R9) \
- ADDL 8(R9), C \
- MOVL C, 8(R9) \
- ADDL 12(R9), D \
- MOVL D, 12(R9) \
- ADDL 16(R9), E \
- MOVL E, 16(R9)
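UPDATE_HASH is the per-block feed-forward: each working variable is added back into the digest word it started from, through the pointer kept in R9. A minimal Go sketch of the same step (illustrative only; the five-word digest layout matches what R9 points at):

// updateHash folds the five working variables back into the digest,
// which is what UPDATE_HASH does through the pointer in R9.
func updateHash(dig *[5]uint32, a, b, c, d, e uint32) {
	dig[0] += a
	dig[1] += b
	dig[2] += c
	dig[3] += d
	dig[4] += e
}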
-
-
-
-// Helper macros for PRECALC, which does precomputations
-#define PRECALC_0(OFFSET) \
- VMOVDQU OFFSET(R10),X0
-
-#define PRECALC_1(OFFSET) \
- VINSERTI128 $1, OFFSET(R13), Y0, Y0
-
-#define PRECALC_2(YREG) \
- VPSHUFB Y10, Y0, YREG
-
-#define PRECALC_4(YREG,K_OFFSET) \
- VPADDD K_OFFSET(R8), YREG, Y0
-
-#define PRECALC_7(OFFSET) \
- VMOVDQU Y0, (OFFSET*2)(R14)
-
-
-// Message scheduling pre-compute for rounds 0-15
-// R13 is a pointer to even 64-byte block
-// R10 is a pointer to odd 64-byte block
-// R14 is a pointer to temp buffer
-// X0 is used as temp register
-// YREG is clobbered as part of computation
-// OFFSET chooses 16 byte chunk within a block
-// R8 is a pointer to constants block
-// K_OFFSET chooses K constants relevant to this round
-// X10 holds swap mask
-#define PRECALC_00_15(OFFSET,YREG) \
- PRECALC_0(OFFSET) \
- PRECALC_1(OFFSET) \
- PRECALC_2(YREG) \
- PRECALC_4(YREG,0x0) \
- PRECALC_7(OFFSET)
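Per 32-bit lane this stage only byte-swaps a message word and adds the rounds 0-15 constant before storing it for the later ALU adds. The same work for a single block could be sketched in Go as below (hypothetical helper, assumes encoding/binary; the assembly handles two blocks at once, one per 128-bit half of the YMM register):

// precalc00to15 computes W[i]+K for the first 16 rounds of a single
// 64-byte block; the vector code stores the analogous values for two
// blocks side by side in the temp buffer addressed by R14.
func precalc00to15(block []byte, wk *[16]uint32) {
	const k0 = 0x5A827999
	for i := 0; i < 16; i++ {
		wk[i] = binary.BigEndian.Uint32(block[4*i:]) + k0
	}
}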
-
-
-// Helper macros for PRECALC_16_31
-#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
- VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \ // w[i-14]
- VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]
-
-#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
- VPXOR REG_SUB_8, REG, REG \
- VPXOR REG_SUB_16, Y0, Y0
-
-#define PRECALC_18(REG) \
- VPXOR Y0, REG, REG \
- VPSLLDQ $12, REG, Y9
-
-#define PRECALC_19(REG) \
- VPSLLD $1, REG, Y0 \
- VPSRLD $31, REG, REG
-
-#define PRECALC_20(REG) \
- VPOR REG, Y0, Y0 \
- VPSLLD $2, Y9, REG
-
-#define PRECALC_21(REG) \
- VPSRLD $30, Y9, Y9 \
- VPXOR REG, Y0, Y0
-
-#define PRECALC_23(REG,K_OFFSET,OFFSET) \
- VPXOR Y9, Y0, REG \
- VPADDD K_OFFSET(R8), REG, Y0 \
- VMOVDQU Y0, (OFFSET)(R14)
-
-// Message scheduling pre-compute for rounds 16-31
-// calculating last 32 w[i] values in 8 XMM registers
-// pre-calculate K+w[i] values and store to mem
-// for later load by ALU add instruction.
-// "brute force" vectorization for rounds 16-31 only
-// due to w[i]->w[i-3] dependency.
-// clobbers 5 input ymm registers REG_SUB*
-// uses X0 and X9 as temp registers
-// As always, R8 is a pointer to constants block
-// and R14 is a pointer to temp buffer
-#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
- PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
- PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
- PRECALC_18(REG) \
- PRECALC_19(REG) \
- PRECALC_20(REG) \
- PRECALC_21(REG) \
- PRECALC_23(REG,K_OFFSET,OFFSET)
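The recurrence vectorized here is the standard SHA-1 message schedule, four new words per block per vector step. A scalar Go sketch (assumes math/bits) shows why the extra lane shuffling above is needed: each group of four already contains a w[i] to w[i-3] dependency.

// scheduleW16to31 is the scalar form of the recurrence handled above:
// w[i] = rol1(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]). The w[i-3] term
// falls inside the current group of four words, which is what forces
// the "brute force" handling in the vector version. The assembly also
// adds the per-round K constant before storing the result.
func scheduleW16to31(w *[80]uint32) {
	for i := 16; i < 32; i++ {
		w[i] = bits.RotateLeft32(w[i-3]^w[i-8]^w[i-14]^w[i-16], 1)
	}
}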
-
-
-// Helper macros for PRECALC_32_79
-#define PRECALC_32(REG_SUB_8,REG_SUB_4) \
- VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0
-
-#define PRECALC_33(REG_SUB_28,REG) \
- VPXOR REG_SUB_28, REG, REG
-
-#define PRECALC_34(REG_SUB_16) \
- VPXOR REG_SUB_16, Y0, Y0
-
-#define PRECALC_35(REG) \
- VPXOR Y0, REG, REG
-
-#define PRECALC_36(REG) \
- VPSLLD $2, REG, Y0
-
-#define PRECALC_37(REG) \
- VPSRLD $30, REG, REG \
- VPOR REG, Y0, REG
-
-#define PRECALC_39(REG,K_OFFSET,OFFSET) \
- VPADDD K_OFFSET(R8), REG, Y0 \
- VMOVDQU Y0, (OFFSET)(R14)
-
-// Message scheduling pre-compute for rounds 32-79
-// In SHA-1 specification we have:
-// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
-// Which is the same as:
-// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
-// This allows for more efficient vectorization,
-// since w[i]->w[i-3] dependency is broken
-#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
- PRECALC_32(REG_SUB_8,REG_SUB_4) \
- PRECALC_33(REG_SUB_28,REG) \
- PRECALC_34(REG_SUB_16) \
- PRECALC_35(REG) \
- PRECALC_36(REG) \
- PRECALC_37(REG) \
- PRECALC_39(REG,K_OFFSET,OFFSET)
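The rewritten recurrence in the comment above follows from applying the standard rol-1 recurrence once to each of its own four terms: the intermediate words cancel in pairs and the two single-bit rotates combine into a rotate by two. A scalar Go sketch (assumes math/bits) of what rounds 32-79 compute:

// scheduleW32to79 uses the equivalent form
//   w[i] = rol2(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32])   for i >= 32,
// whose nearest dependency is w[i-6], so a whole group of four new
// words per block (eight YMM lanes) can be produced in one vector step
// without intra-group fix-ups.
func scheduleW32to79(w *[80]uint32) {
	for i := 32; i < 80; i++ {
		w[i] = bits.RotateLeft32(w[i-6]^w[i-16]^w[i-28]^w[i-32], 2)
	}
}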
-
-#define PRECALC \
- PRECALC_00_15(0,Y15) \
- PRECALC_00_15(0x10,Y14) \
- PRECALC_00_15(0x20,Y13) \
- PRECALC_00_15(0x30,Y12) \
- PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
- PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
- PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
- PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
- PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
- PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
- PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
- PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
- PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
- PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
- PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
- PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
- PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
- PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
- PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
- PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)
-
-// Macros calculating individual rounds have general form
-// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
-// CALC_ROUND_{PRE,POST} macros follow
-
-#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
- ADDL OFFSET(R15),REG_E \
- ANDNL REG_C,REG_A,BP \
- LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
- RORXL $0x1b, REG_A, R12 \
- RORXL $2, REG_A, REG_B // for next round
-
-// Calculate F for the next round
-#define CALC_F1_POST(REG_A,REG_B,REG_E) \
- ANDL REG_B,REG_A \ // b&c
- XORL BP, REG_A \ // F1 = (b&c) ^ (~b&d)
- LEAL (REG_E)(R12*1), REG_E // E += A rol 5
-
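CALC_F1_PRE/POST compute the rounds 0-19 boolean function with BMI1's ANDN, keeping the ^b & d half in BP and combining it with the b & c half. A minimal Go sketch of the function itself (illustrative names):

// f1 is the Ch ("choose") function used for rounds 0-19. Its two terms
// are disjoint bit-wise, so combining them with XOR, as the
// ANDN/AND/XOR sequence above does, gives the same result as the
// textbook OR.
func f1(b, c, d uint32) uint32 {
	return (b & c) ^ (^b & d)
}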
-
-// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
-#define CALC_0 \
- MOVL SI, BX \ // Precalculating first round
- RORXL $2, SI, SI \
- ANDNL AX, BX, BP \
- ANDL DI, BX \
- XORL BP, BX \
- CALC_F1_PRE(0x0,CX,BX,DI,DX) \
- PRECALC_0(0x80) \
- CALC_F1_POST(CX,SI,DX)
-
-#define CALC_1 \
- CALC_F1_PRE(0x4,DX,CX,SI,AX) \
- PRECALC_1(0x80) \
- CALC_F1_POST(DX,BX,AX)
-
-#define CALC_2 \
- CALC_F1_PRE(0x8,AX,DX,BX,DI) \
- PRECALC_2(Y15) \
- CALC_F1_POST(AX,CX,DI)
-
-#define CALC_3 \
- CALC_F1_PRE(0xc,DI,AX,CX,SI) \
- CALC_F1_POST(DI,DX,SI)
-
-#define CALC_4 \
- CALC_F1_PRE(0x20,SI,DI,DX,BX) \
- PRECALC_4(Y15,0x0) \
- CALC_F1_POST(SI,AX,BX)
-
-#define CALC_5 \
- CALC_F1_PRE(0x24,BX,SI,AX,CX) \
- CALC_F1_POST(BX,DI,CX)
-
-#define CALC_6 \
- CALC_F1_PRE(0x28,CX,BX,DI,DX) \
- CALC_F1_POST(CX,SI,DX)
-
-#define CALC_7 \
- CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
- PRECALC_7(0x0) \
- CALC_F1_POST(DX,BX,AX)
-
-#define CALC_8 \
- CALC_F1_PRE(0x40,AX,DX,BX,DI) \
- PRECALC_0(0x90) \
- CALC_F1_POST(AX,CX,DI)
-
-#define CALC_9 \
- CALC_F1_PRE(0x44,DI,AX,CX,SI) \
- PRECALC_1(0x90) \
- CALC_F1_POST(DI,DX,SI)
-
-#define CALC_10 \
- CALC_F1_PRE(0x48,SI,DI,DX,BX) \
- PRECALC_2(Y14) \
- CALC_F1_POST(SI,AX,BX)
-
-#define CALC_11 \
- CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
- CALC_F1_POST(BX,DI,CX)
-
-#define CALC_12 \
- CALC_F1_PRE(0x60,CX,BX,DI,DX) \
- PRECALC_4(Y14,0x0) \
- CALC_F1_POST(CX,SI,DX)
-
-#define CALC_13 \
- CALC_F1_PRE(0x64,DX,CX,SI,AX) \
- CALC_F1_POST(DX,BX,AX)
-
-#define CALC_14 \
- CALC_F1_PRE(0x68,AX,DX,BX,DI) \
- CALC_F1_POST(AX,CX,DI)
-
-#define CALC_15 \
- CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
- PRECALC_7(0x10) \
- CALC_F1_POST(DI,DX,SI)
-
-#define CALC_16 \
- CALC_F1_PRE(0x80,SI,DI,DX,BX) \
- PRECALC_0(0xa0) \
- CALC_F1_POST(SI,AX,BX)
-
-#define CALC_17 \
- CALC_F1_PRE(0x84,BX,SI,AX,CX) \
- PRECALC_1(0xa0) \
- CALC_F1_POST(BX,DI,CX)
-
-#define CALC_18 \
- CALC_F1_PRE(0x88,CX,BX,DI,DX) \
- PRECALC_2(Y13) \
- CALC_F1_POST(CX,SI,DX)
-
-
-#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
- ADDL OFFSET(R15),REG_E \
- LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
- RORXL $0x1b, REG_A, R12 \
- RORXL $2, REG_A, REG_B // for next round
-
-#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
- XORL REG_B, REG_A \
- ADDL R12, REG_E \
- XORL REG_C, REG_A
-
-#define CALC_19 \
- CALC_F2_PRE(0x8c,DX,CX,AX) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_20 \
- CALC_F2_PRE(0xa0,AX,DX,DI) \
- PRECALC_4(Y13,0x0) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-#define CALC_21 \
- CALC_F2_PRE(0xa4,DI,AX,SI) \
- CALC_F2_POST(DI,DX,CX,SI)
-
-#define CALC_22 \
- CALC_F2_PRE(0xa8,SI,DI,BX) \
- CALC_F2_POST(SI,AX,DX,BX)
-
-#define CALC_23 \
- CALC_F2_PRE(0xac,BX,SI,CX) \
- PRECALC_7(0x20) \
- CALC_F2_POST(BX,DI,AX,CX)
-
-#define CALC_24 \
- CALC_F2_PRE(0xc0,CX,BX,DX) \
- PRECALC_0(0xb0) \
- CALC_F2_POST(CX,SI,DI,DX)
-
-#define CALC_25 \
- CALC_F2_PRE(0xc4,DX,CX,AX) \
- PRECALC_1(0xb0) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_26 \
- CALC_F2_PRE(0xc8,AX,DX,DI) \
- PRECALC_2(Y12) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-#define CALC_27 \
- CALC_F2_PRE(0xcc,DI,AX,SI) \
- CALC_F2_POST(DI,DX,CX,SI)
-
-#define CALC_28 \
- CALC_F2_PRE(0xe0,SI,DI,BX) \
- PRECALC_4(Y12,0x0) \
- CALC_F2_POST(SI,AX,DX,BX)
-
-#define CALC_29 \
- CALC_F2_PRE(0xe4,BX,SI,CX) \
- CALC_F2_POST(BX,DI,AX,CX)
-
-#define CALC_30 \
- CALC_F2_PRE(0xe8,CX,BX,DX) \
- CALC_F2_POST(CX,SI,DI,DX)
-
-#define CALC_31 \
- CALC_F2_PRE(0xec,DX,CX,AX) \
- PRECALC_7(0x30) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_32 \
- CALC_F2_PRE(0x100,AX,DX,DI) \
- PRECALC_16(Y15,Y14,Y12,Y8) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-#define CALC_33 \
- CALC_F2_PRE(0x104,DI,AX,SI) \
- PRECALC_17(Y15,Y13,Y8) \
- CALC_F2_POST(DI,DX,CX,SI)
-
-#define CALC_34 \
- CALC_F2_PRE(0x108,SI,DI,BX) \
- PRECALC_18(Y8) \
- CALC_F2_POST(SI,AX,DX,BX)
-
-#define CALC_35 \
- CALC_F2_PRE(0x10c,BX,SI,CX) \
- PRECALC_19(Y8) \
- CALC_F2_POST(BX,DI,AX,CX)
-
-#define CALC_36 \
- CALC_F2_PRE(0x120,CX,BX,DX) \
- PRECALC_20(Y8) \
- CALC_F2_POST(CX,SI,DI,DX)
-
-#define CALC_37 \
- CALC_F2_PRE(0x124,DX,CX,AX) \
- PRECALC_21(Y8) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_38 \
- CALC_F2_PRE(0x128,AX,DX,DI) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-
-#define CALC_F3_PRE(OFFSET,REG_E) \
- ADDL OFFSET(R15),REG_E
-
-#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
- LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
- MOVL REG_B, BP \
- ORL REG_A, BP \
- RORXL $0x1b, REG_A, R12 \
- RORXL $2, REG_A, REG_TB \
- ANDL REG_C, BP \ // Calculate F for the next round
- ANDL REG_B, REG_A \
- ORL BP, REG_A \
- ADDL R12, REG_E
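CALC_F3_POST builds the rounds 40-59 majority function as (a&b) | ((a|b)&c), which uses one fewer AND than the textbook three-term form while selecting exactly the same bits. A small Go sketch of the identity (illustrative names):

// f3 is the Maj ("majority") function used for rounds 40-59; each
// result bit is 1 exactly when at least two of x, y, z have that bit
// set, so (x&y) | ((x|y)&z) and (x&y) | (x&z) | (y&z) are
// interchangeable.
func f3(x, y, z uint32) uint32 {
	return (x & y) | ((x | y) & z)
}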
-
-#define CALC_39 \
- CALC_F3_PRE(0x12c,SI) \
- PRECALC_23(Y8,0x0,0x80) \
- CALC_F3_POST(DI,DX,CX,SI,AX)
-
-#define CALC_40 \
- CALC_F3_PRE(0x140,BX) \
- PRECALC_16(Y14,Y13,Y8,Y7) \
- CALC_F3_POST(SI,AX,DX,BX,DI)
-
-#define CALC_41 \
- CALC_F3_PRE(0x144,CX) \
- PRECALC_17(Y14,Y12,Y7) \
- CALC_F3_POST(BX,DI,AX,CX,SI)
-
-#define CALC_42 \
- CALC_F3_PRE(0x148,DX) \
- PRECALC_18(Y7) \
- CALC_F3_POST(CX,SI,DI,DX,BX)
-
-#define CALC_43 \
- CALC_F3_PRE(0x14c,AX) \
- PRECALC_19(Y7) \
- CALC_F3_POST(DX,BX,SI,AX,CX)
-
-#define CALC_44 \
- CALC_F3_PRE(0x160,DI) \
- PRECALC_20(Y7) \
- CALC_F3_POST(AX,CX,BX,DI,DX)
-
-#define CALC_45 \
- CALC_F3_PRE(0x164,SI) \
- PRECALC_21(Y7) \
- CALC_F3_POST(DI,DX,CX,SI,AX)
-
-#define CALC_46 \
- CALC_F3_PRE(0x168,BX) \
- CALC_F3_POST(SI,AX,DX,BX,DI)
-
-#define CALC_47 \
- CALC_F3_PRE(0x16c,CX) \
- VPXOR Y9, Y0, Y7 \
- VPADDD 0x20(R8), Y7, Y0 \
- VMOVDQU Y0, 0xa0(R14) \
- CALC_F3_POST(BX,DI,AX,CX,SI)
-
-#define CALC_48 \
- CALC_F3_PRE(0x180,DX) \
- PRECALC_16(Y13,Y12,Y7,Y5) \
- CALC_F3_POST(CX,SI,DI,DX,BX)
-
-#define CALC_49 \
- CALC_F3_PRE(0x184,AX) \
- PRECALC_17(Y13,Y8,Y5) \
- CALC_F3_POST(DX,BX,SI,AX,CX)
-
-#define CALC_50 \
- CALC_F3_PRE(0x188,DI) \
- PRECALC_18(Y5) \
- CALC_F3_POST(AX,CX,BX,DI,DX)
-
-#define CALC_51 \
- CALC_F3_PRE(0x18c,SI) \
- PRECALC_19(Y5) \
- CALC_F3_POST(DI,DX,CX,SI,AX)
-
-#define CALC_52 \
- CALC_F3_PRE(0x1a0,BX) \
- PRECALC_20(Y5) \
- CALC_F3_POST(SI,AX,DX,BX,DI)
-
-#define CALC_53 \
- CALC_F3_PRE(0x1a4,CX) \
- PRECALC_21(Y5) \
- CALC_F3_POST(BX,DI,AX,CX,SI)
-
-#define CALC_54 \
- CALC_F3_PRE(0x1a8,DX) \
- CALC_F3_POST(CX,SI,DI,DX,BX)
-
-#define CALC_55 \
- CALC_F3_PRE(0x1ac,AX) \
- PRECALC_23(Y5,0x20,0xc0) \
- CALC_F3_POST(DX,BX,SI,AX,CX)
-
-#define CALC_56 \
- CALC_F3_PRE(0x1c0,DI) \
- PRECALC_16(Y12,Y8,Y5,Y3) \
- CALC_F3_POST(AX,CX,BX,DI,DX)
-
-#define CALC_57 \
- CALC_F3_PRE(0x1c4,SI) \
- PRECALC_17(Y12,Y7,Y3) \
- CALC_F3_POST(DI,DX,CX,SI,AX)
-
-#define CALC_58 \
- CALC_F3_PRE(0x1c8,BX) \
- PRECALC_18(Y3) \
- CALC_F3_POST(SI,AX,DX,BX,DI)
-
-#define CALC_59 \
- CALC_F2_PRE(0x1cc,BX,SI,CX) \
- PRECALC_19(Y3) \
- CALC_F2_POST(BX,DI,AX,CX)
-
-#define CALC_60 \
- CALC_F2_PRE(0x1e0,CX,BX,DX) \
- PRECALC_20(Y3) \
- CALC_F2_POST(CX,SI,DI,DX)
-
-#define CALC_61 \
- CALC_F2_PRE(0x1e4,DX,CX,AX) \
- PRECALC_21(Y3) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_62 \
- CALC_F2_PRE(0x1e8,AX,DX,DI) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-#define CALC_63 \
- CALC_F2_PRE(0x1ec,DI,AX,SI) \
- PRECALC_23(Y3,0x20,0xe0) \
- CALC_F2_POST(DI,DX,CX,SI)
-
-#define CALC_64 \
- CALC_F2_PRE(0x200,SI,DI,BX) \
- PRECALC_32(Y5,Y3) \
- CALC_F2_POST(SI,AX,DX,BX)
-
-#define CALC_65 \
- CALC_F2_PRE(0x204,BX,SI,CX) \
- PRECALC_33(Y14,Y15) \
- CALC_F2_POST(BX,DI,AX,CX)
-
-#define CALC_66 \
- CALC_F2_PRE(0x208,CX,BX,DX) \
- PRECALC_34(Y8) \
- CALC_F2_POST(CX,SI,DI,DX)
-
-#define CALC_67 \
- CALC_F2_PRE(0x20c,DX,CX,AX) \
- PRECALC_35(Y15) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_68 \
- CALC_F2_PRE(0x220,AX,DX,DI) \
- PRECALC_36(Y15) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-#define CALC_69 \
- CALC_F2_PRE(0x224,DI,AX,SI) \
- PRECALC_37(Y15) \
- CALC_F2_POST(DI,DX,CX,SI)
-
-#define CALC_70 \
- CALC_F2_PRE(0x228,SI,DI,BX) \
- CALC_F2_POST(SI,AX,DX,BX)
-
-#define CALC_71 \
- CALC_F2_PRE(0x22c,BX,SI,CX) \
- PRECALC_39(Y15,0x20,0x100) \
- CALC_F2_POST(BX,DI,AX,CX)
-
-#define CALC_72 \
- CALC_F2_PRE(0x240,CX,BX,DX) \
- PRECALC_32(Y3,Y15) \
- CALC_F2_POST(CX,SI,DI,DX)
-
-#define CALC_73 \
- CALC_F2_PRE(0x244,DX,CX,AX) \
- PRECALC_33(Y13,Y14) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_74 \
- CALC_F2_PRE(0x248,AX,DX,DI) \
- PRECALC_34(Y7) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-#define CALC_75 \
- CALC_F2_PRE(0x24c,DI,AX,SI) \
- PRECALC_35(Y14) \
- CALC_F2_POST(DI,DX,CX,SI)
-
-#define CALC_76 \
- CALC_F2_PRE(0x260,SI,DI,BX) \
- PRECALC_36(Y14) \
- CALC_F2_POST(SI,AX,DX,BX)
-
-#define CALC_77 \
- CALC_F2_PRE(0x264,BX,SI,CX) \
- PRECALC_37(Y14) \
- CALC_F2_POST(BX,DI,AX,CX)
-
-#define CALC_78 \
- CALC_F2_PRE(0x268,CX,BX,DX) \
- CALC_F2_POST(CX,SI,DI,DX)
-
-#define CALC_79 \
- ADDL 0x26c(R15), AX \
- LEAL (AX)(CX*1), AX \
- RORXL $0x1b, DX, R12 \
- PRECALC_39(Y14,0x20,0x120) \
- ADDL R12, AX
-
-// Similar to CALC_0
-#define CALC_80 \
- MOVL CX, DX \
- RORXL $2, CX, CX \
- ANDNL SI, DX, BP \
- ANDL BX, DX \
- XORL BP, DX \
- CALC_F1_PRE(0x10,AX,DX,BX,DI) \
- PRECALC_32(Y15,Y14) \
- CALC_F1_POST(AX,CX,DI)
-
-#define CALC_81 \
- CALC_F1_PRE(0x14,DI,AX,CX,SI) \
- PRECALC_33(Y12,Y13) \
- CALC_F1_POST(DI,DX,SI)
-
-#define CALC_82 \
- CALC_F1_PRE(0x18,SI,DI,DX,BX) \
- PRECALC_34(Y5) \
- CALC_F1_POST(SI,AX,BX)
-
-#define CALC_83 \
- CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
- PRECALC_35(Y13) \
- CALC_F1_POST(BX,DI,CX)
-
-#define CALC_84 \
- CALC_F1_PRE(0x30,CX,BX,DI,DX) \
- PRECALC_36(Y13) \
- CALC_F1_POST(CX,SI,DX)
-
-#define CALC_85 \
- CALC_F1_PRE(0x34,DX,CX,SI,AX) \
- PRECALC_37(Y13) \
- CALC_F1_POST(DX,BX,AX)
-
-#define CALC_86 \
- CALC_F1_PRE(0x38,AX,DX,BX,DI) \
- CALC_F1_POST(AX,CX,DI)
-
-#define CALC_87 \
- CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
- PRECALC_39(Y13,0x40,0x140) \
- CALC_F1_POST(DI,DX,SI)
-
-#define CALC_88 \
- CALC_F1_PRE(0x50,SI,DI,DX,BX) \
- PRECALC_32(Y14,Y13) \
- CALC_F1_POST(SI,AX,BX)
-
-#define CALC_89 \
- CALC_F1_PRE(0x54,BX,SI,AX,CX) \
- PRECALC_33(Y8,Y12) \
- CALC_F1_POST(BX,DI,CX)
-
-#define CALC_90 \
- CALC_F1_PRE(0x58,CX,BX,DI,DX) \
- PRECALC_34(Y3) \
- CALC_F1_POST(CX,SI,DX)
-
-#define CALC_91 \
- CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
- PRECALC_35(Y12) \
- CALC_F1_POST(DX,BX,AX)
-
-#define CALC_92 \
- CALC_F1_PRE(0x70,AX,DX,BX,DI) \
- PRECALC_36(Y12) \
- CALC_F1_POST(AX,CX,DI)
-
-#define CALC_93 \
- CALC_F1_PRE(0x74,DI,AX,CX,SI) \
- PRECALC_37(Y12) \
- CALC_F1_POST(DI,DX,SI)
-
-#define CALC_94 \
- CALC_F1_PRE(0x78,SI,DI,DX,BX) \
- CALC_F1_POST(SI,AX,BX)
-
-#define CALC_95 \
- CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
- PRECALC_39(Y12,0x40,0x160) \
- CALC_F1_POST(BX,DI,CX)
-
-#define CALC_96 \
- CALC_F1_PRE(0x90,CX,BX,DI,DX) \
- PRECALC_32(Y13,Y12) \
- CALC_F1_POST(CX,SI,DX)
-
-#define CALC_97 \
- CALC_F1_PRE(0x94,DX,CX,SI,AX) \
- PRECALC_33(Y7,Y8) \
- CALC_F1_POST(DX,BX,AX)
-
-#define CALC_98 \
- CALC_F1_PRE(0x98,AX,DX,BX,DI) \
- PRECALC_34(Y15) \
- CALC_F1_POST(AX,CX,DI)
-
-#define CALC_99 \
- CALC_F2_PRE(0x9c,DI,AX,SI) \
- PRECALC_35(Y8) \
- CALC_F2_POST(DI,DX,CX,SI)
-
-#define CALC_100 \
- CALC_F2_PRE(0xb0,SI,DI,BX) \
- PRECALC_36(Y8) \
- CALC_F2_POST(SI,AX,DX,BX)
-
-#define CALC_101 \
- CALC_F2_PRE(0xb4,BX,SI,CX) \
- PRECALC_37(Y8) \
- CALC_F2_POST(BX,DI,AX,CX)
-
-#define CALC_102 \
- CALC_F2_PRE(0xb8,CX,BX,DX) \
- CALC_F2_POST(CX,SI,DI,DX)
-
-#define CALC_103 \
- CALC_F2_PRE(0xbc,DX,CX,AX) \
- PRECALC_39(Y8,0x40,0x180) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_104 \
- CALC_F2_PRE(0xd0,AX,DX,DI) \
- PRECALC_32(Y12,Y8) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-#define CALC_105 \
- CALC_F2_PRE(0xd4,DI,AX,SI) \
- PRECALC_33(Y5,Y7) \
- CALC_F2_POST(DI,DX,CX,SI)
-
-#define CALC_106 \
- CALC_F2_PRE(0xd8,SI,DI,BX) \
- PRECALC_34(Y14) \
- CALC_F2_POST(SI,AX,DX,BX)
-
-#define CALC_107 \
- CALC_F2_PRE(0xdc,BX,SI,CX) \
- PRECALC_35(Y7) \
- CALC_F2_POST(BX,DI,AX,CX)
-
-#define CALC_108 \
- CALC_F2_PRE(0xf0,CX,BX,DX) \
- PRECALC_36(Y7) \
- CALC_F2_POST(CX,SI,DI,DX)
-
-#define CALC_109 \
- CALC_F2_PRE(0xf4,DX,CX,AX) \
- PRECALC_37(Y7) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_110 \
- CALC_F2_PRE(0xf8,AX,DX,DI) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-#define CALC_111 \
- CALC_F2_PRE(0xfc,DI,AX,SI) \
- PRECALC_39(Y7,0x40,0x1a0) \
- CALC_F2_POST(DI,DX,CX,SI)
-
-#define CALC_112 \
- CALC_F2_PRE(0x110,SI,DI,BX) \
- PRECALC_32(Y8,Y7) \
- CALC_F2_POST(SI,AX,DX,BX)
-
-#define CALC_113 \
- CALC_F2_PRE(0x114,BX,SI,CX) \
- PRECALC_33(Y3,Y5) \
- CALC_F2_POST(BX,DI,AX,CX)
-
-#define CALC_114 \
- CALC_F2_PRE(0x118,CX,BX,DX) \
- PRECALC_34(Y13) \
- CALC_F2_POST(CX,SI,DI,DX)
-
-#define CALC_115 \
- CALC_F2_PRE(0x11c,DX,CX,AX) \
- PRECALC_35(Y5) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_116 \
- CALC_F2_PRE(0x130,AX,DX,DI) \
- PRECALC_36(Y5) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-#define CALC_117 \
- CALC_F2_PRE(0x134,DI,AX,SI) \
- PRECALC_37(Y5) \
- CALC_F2_POST(DI,DX,CX,SI)
-
-#define CALC_118 \
- CALC_F2_PRE(0x138,SI,DI,BX) \
- CALC_F2_POST(SI,AX,DX,BX)
-
-#define CALC_119 \
- CALC_F3_PRE(0x13c,CX) \
- PRECALC_39(Y5,0x40,0x1c0) \
- CALC_F3_POST(BX,DI,AX,CX,SI)
-
-#define CALC_120 \
- CALC_F3_PRE(0x150,DX) \
- PRECALC_32(Y7,Y5) \
- CALC_F3_POST(CX,SI,DI,DX,BX)
-
-#define CALC_121 \
- CALC_F3_PRE(0x154,AX) \
- PRECALC_33(Y15,Y3) \
- CALC_F3_POST(DX,BX,SI,AX,CX)
-
-#define CALC_122 \
- CALC_F3_PRE(0x158,DI) \
- PRECALC_34(Y12) \
- CALC_F3_POST(AX,CX,BX,DI,DX)
-
-#define CALC_123 \
- CALC_F3_PRE(0x15c,SI) \
- PRECALC_35(Y3) \
- CALC_F3_POST(DI,DX,CX,SI,AX)
-
-#define CALC_124 \
- CALC_F3_PRE(0x170,BX) \
- PRECALC_36(Y3) \
- CALC_F3_POST(SI,AX,DX,BX,DI)
-
-#define CALC_125 \
- CALC_F3_PRE(0x174,CX) \
- PRECALC_37(Y3) \
- CALC_F3_POST(BX,DI,AX,CX,SI)
-
-#define CALC_126 \
- CALC_F3_PRE(0x178,DX) \
- CALC_F3_POST(CX,SI,DI,DX,BX)
-
-#define CALC_127 \
- CALC_F3_PRE(0x17c,AX) \
- PRECALC_39(Y3,0x60,0x1e0) \
- CALC_F3_POST(DX,BX,SI,AX,CX)
-
-#define CALC_128 \
- CALC_F3_PRE(0x190,DI) \
- PRECALC_32(Y5,Y3) \
- CALC_F3_POST(AX,CX,BX,DI,DX)
-
-#define CALC_129 \
- CALC_F3_PRE(0x194,SI) \
- PRECALC_33(Y14,Y15) \
- CALC_F3_POST(DI,DX,CX,SI,AX)
-
-#define CALC_130 \
- CALC_F3_PRE(0x198,BX) \
- PRECALC_34(Y8) \
- CALC_F3_POST(SI,AX,DX,BX,DI)
-
-#define CALC_131 \
- CALC_F3_PRE(0x19c,CX) \
- PRECALC_35(Y15) \
- CALC_F3_POST(BX,DI,AX,CX,SI)
-
-#define CALC_132 \
- CALC_F3_PRE(0x1b0,DX) \
- PRECALC_36(Y15) \
- CALC_F3_POST(CX,SI,DI,DX,BX)
-
-#define CALC_133 \
- CALC_F3_PRE(0x1b4,AX) \
- PRECALC_37(Y15) \
- CALC_F3_POST(DX,BX,SI,AX,CX)
-
-#define CALC_134 \
- CALC_F3_PRE(0x1b8,DI) \
- CALC_F3_POST(AX,CX,BX,DI,DX)
-
-#define CALC_135 \
- CALC_F3_PRE(0x1bc,SI) \
- PRECALC_39(Y15,0x60,0x200) \
- CALC_F3_POST(DI,DX,CX,SI,AX)
-
-#define CALC_136 \
- CALC_F3_PRE(0x1d0,BX) \
- PRECALC_32(Y3,Y15) \
- CALC_F3_POST(SI,AX,DX,BX,DI)
-
-#define CALC_137 \
- CALC_F3_PRE(0x1d4,CX) \
- PRECALC_33(Y13,Y14) \
- CALC_F3_POST(BX,DI,AX,CX,SI)
-
-#define CALC_138 \
- CALC_F3_PRE(0x1d8,DX) \
- PRECALC_34(Y7) \
- CALC_F3_POST(CX,SI,DI,DX,BX)
-
-#define CALC_139 \
- CALC_F2_PRE(0x1dc,DX,CX,AX) \
- PRECALC_35(Y14) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_140 \
- CALC_F2_PRE(0x1f0,AX,DX,DI) \
- PRECALC_36(Y14) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-#define CALC_141 \
- CALC_F2_PRE(0x1f4,DI,AX,SI) \
- PRECALC_37(Y14) \
- CALC_F2_POST(DI,DX,CX,SI)
-
-#define CALC_142 \
- CALC_F2_PRE(0x1f8,SI,DI,BX) \
- CALC_F2_POST(SI,AX,DX,BX)
-
-#define CALC_143 \
- CALC_F2_PRE(0x1fc,BX,SI,CX) \
- PRECALC_39(Y14,0x60,0x220) \
- CALC_F2_POST(BX,DI,AX,CX)
-
-#define CALC_144 \
- CALC_F2_PRE(0x210,CX,BX,DX) \
- PRECALC_32(Y15,Y14) \
- CALC_F2_POST(CX,SI,DI,DX)
-
-#define CALC_145 \
- CALC_F2_PRE(0x214,DX,CX,AX) \
- PRECALC_33(Y12,Y13) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_146 \
- CALC_F2_PRE(0x218,AX,DX,DI) \
- PRECALC_34(Y5) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-#define CALC_147 \
- CALC_F2_PRE(0x21c,DI,AX,SI) \
- PRECALC_35(Y13) \
- CALC_F2_POST(DI,DX,CX,SI)
-
-#define CALC_148 \
- CALC_F2_PRE(0x230,SI,DI,BX) \
- PRECALC_36(Y13) \
- CALC_F2_POST(SI,AX,DX,BX)
-
-#define CALC_149 \
- CALC_F2_PRE(0x234,BX,SI,CX) \
- PRECALC_37(Y13) \
- CALC_F2_POST(BX,DI,AX,CX)
-
-#define CALC_150 \
- CALC_F2_PRE(0x238,CX,BX,DX) \
- CALC_F2_POST(CX,SI,DI,DX)
-
-#define CALC_151 \
- CALC_F2_PRE(0x23c,DX,CX,AX) \
- PRECALC_39(Y13,0x60,0x240) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_152 \
- CALC_F2_PRE(0x250,AX,DX,DI) \
- PRECALC_32(Y14,Y13) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-#define CALC_153 \
- CALC_F2_PRE(0x254,DI,AX,SI) \
- PRECALC_33(Y8,Y12) \
- CALC_F2_POST(DI,DX,CX,SI)
-
-#define CALC_154 \
- CALC_F2_PRE(0x258,SI,DI,BX) \
- PRECALC_34(Y3) \
- CALC_F2_POST(SI,AX,DX,BX)
-
-#define CALC_155 \
- CALC_F2_PRE(0x25c,BX,SI,CX) \
- PRECALC_35(Y12) \
- CALC_F2_POST(BX,DI,AX,CX)
-
-#define CALC_156 \
- CALC_F2_PRE(0x270,CX,BX,DX) \
- PRECALC_36(Y12) \
- CALC_F2_POST(CX,SI,DI,DX)
-
-#define CALC_157 \
- CALC_F2_PRE(0x274,DX,CX,AX) \
- PRECALC_37(Y12) \
- CALC_F2_POST(DX,BX,SI,AX)
-
-#define CALC_158 \
- CALC_F2_PRE(0x278,AX,DX,DI) \
- CALC_F2_POST(AX,CX,BX,DI)
-
-#define CALC_159 \
- ADDL 0x27c(R15),SI \
- LEAL (SI)(AX*1), SI \
- RORXL $0x1b, DI, R12 \
- PRECALC_39(Y12,0x60,0x260) \
- ADDL R12, SI
-
-
-
-#define CALC \
- MOVL (R9), CX \
- MOVL 4(R9), SI \
- MOVL 8(R9), DI \
- MOVL 12(R9), AX \
- MOVL 16(R9), DX \
- MOVQ SP, R14 \
- LEAQ (2*4*80+32)(SP), R15 \
- PRECALC \ // Precalc WK for first 2 blocks
- XCHGQ R15, R14 \
-loop: \ // this loop is unrolled
- CMPQ R10, R8 \ // we use R8 value (set below) as a signal of the last block
- JNE begin \
- VZEROUPPER \
- RET \
-begin: \
- CALC_0 \
- CALC_1 \
- CALC_2 \
- CALC_3 \
- CALC_4 \
- CALC_5 \
- CALC_6 \
- CALC_7 \
- CALC_8 \
- CALC_9 \
- CALC_10 \
- CALC_11 \
- CALC_12 \
- CALC_13 \
- CALC_14 \
- CALC_15 \
- CALC_16 \
- CALC_17 \
- CALC_18 \
- CALC_19 \
- CALC_20 \
- CALC_21 \
- CALC_22 \
- CALC_23 \
- CALC_24 \
- CALC_25 \
- CALC_26 \
- CALC_27 \
- CALC_28 \
- CALC_29 \
- CALC_30 \
- CALC_31 \
- CALC_32 \
- CALC_33 \
- CALC_34 \
- CALC_35 \
- CALC_36 \
- CALC_37 \
- CALC_38 \
- CALC_39 \
- CALC_40 \
- CALC_41 \
- CALC_42 \
- CALC_43 \
- CALC_44 \
- CALC_45 \
- CALC_46 \
- CALC_47 \
- CALC_48 \
- CALC_49 \
- CALC_50 \
- CALC_51 \
- CALC_52 \
- CALC_53 \
- CALC_54 \
- CALC_55 \
- CALC_56 \
- CALC_57 \
- CALC_58 \
- CALC_59 \
- ADDQ $128, R10 \ // move to next even-64-byte block
- CMPQ R10, R11 \ // is current block the last one?
- CMOVQCC R8, R10 \ // signal the last iteration smartly
- CALC_60 \
- CALC_61 \
- CALC_62 \
- CALC_63 \
- CALC_64 \
- CALC_65 \
- CALC_66 \
- CALC_67 \
- CALC_68 \
- CALC_69 \
- CALC_70 \
- CALC_71 \
- CALC_72 \
- CALC_73 \
- CALC_74 \
- CALC_75 \
- CALC_76 \
- CALC_77 \
- CALC_78 \
- CALC_79 \
- UPDATE_HASH(AX,DX,BX,SI,DI) \
- CMPQ R10, R8 \ // is current block the last one?
- JE loop\
- MOVL DX, CX \
- CALC_80 \
- CALC_81 \
- CALC_82 \
- CALC_83 \
- CALC_84 \
- CALC_85 \
- CALC_86 \
- CALC_87 \
- CALC_88 \
- CALC_89 \
- CALC_90 \
- CALC_91 \
- CALC_92 \
- CALC_93 \
- CALC_94 \
- CALC_95 \
- CALC_96 \
- CALC_97 \
- CALC_98 \
- CALC_99 \
- CALC_100 \
- CALC_101 \
- CALC_102 \
- CALC_103 \
- CALC_104 \
- CALC_105 \
- CALC_106 \
- CALC_107 \
- CALC_108 \
- CALC_109 \
- CALC_110 \
- CALC_111 \
- CALC_112 \
- CALC_113 \
- CALC_114 \
- CALC_115 \
- CALC_116 \
- CALC_117 \
- CALC_118 \
- CALC_119 \
- CALC_120 \
- CALC_121 \
- CALC_122 \
- CALC_123 \
- CALC_124 \
- CALC_125 \
- CALC_126 \
- CALC_127 \
- CALC_128 \
- CALC_129 \
- CALC_130 \
- CALC_131 \
- CALC_132 \
- CALC_133 \
- CALC_134 \
- CALC_135 \
- CALC_136 \
- CALC_137 \
- CALC_138 \
- CALC_139 \
- ADDQ $128, R13 \ //move to next even-64-byte block
- CMPQ R13, R11 \ //is current block the last one?
- CMOVQCC R8, R10 \
- CALC_140 \
- CALC_141 \
- CALC_142 \
- CALC_143 \
- CALC_144 \
- CALC_145 \
- CALC_146 \
- CALC_147 \
- CALC_148 \
- CALC_149 \
- CALC_150 \
- CALC_151 \
- CALC_152 \
- CALC_153 \
- CALC_154 \
- CALC_155 \
- CALC_156 \
- CALC_157 \
- CALC_158 \
- CALC_159 \
- UPDATE_HASH(SI,DI,DX,CX,BX) \
- MOVL SI, R12 \ //Reset state for AVX2 reg permutation
- MOVL DI, SI \
- MOVL DX, DI \
- MOVL BX, DX \
- MOVL CX, AX \
- MOVL R12, CX \
- XCHGQ R15, R14 \
- JMP loop
-
-
-
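Taken together, the CALC macro implements a double-buffered loop that retires two 64-byte blocks per iteration while the interleaved PRECALC fragments fill the spare buffer for the next pair. A rough Go-style outline (illustrative only: the helper names are invented, and it assumes the input is a whole number of 128-byte pairs, whereas the real code lets the R8 sentinel and CMOVQCC handle an odd trailing block):

// twoBlockLoopOutline shows the buffer swap done by XCHGQ R15, R14:
// the scalar rounds always read W+K from cur while the vector code
// writes the next pair's W+K into next.
func twoBlockLoopOutline(p []byte, precalc func(pair []byte, wk []uint32), rounds func(block []byte, wk []uint32)) {
	wkA := make([]uint32, 2*80) // W[i]+K for two blocks, interleaved
	wkB := make([]uint32, 2*80)
	cur, next := wkA, wkB

	precalc(p, cur) // PRECALC before the first iteration
	for len(p) > 0 {
		if len(p) > 128 {
			precalc(p[128:], next) // interleaved with the rounds in the asm
		}
		rounds(p[:64], cur)    // CALC_0 .. CALC_79, then UPDATE_HASH
		rounds(p[64:128], cur) // CALC_80 .. CALC_159, then UPDATE_HASH
		cur, next = next, cur  // XCHGQ R15, R14
		p = p[128:]
	}
}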
-TEXT ·blockAVX2(SB),$1408-32
-
- MOVQ dig+0(FP), DI
- MOVQ p_base+8(FP), SI
- MOVQ p_len+16(FP), DX
- SHRQ $6, DX
- SHLQ $6, DX
-
- MOVQ $K_XMM_AR<>(SB), R8
-
- MOVQ DI, R9
- MOVQ SI, R10
- LEAQ 64(SI), R13
-
- ADDQ SI, DX
- ADDQ $64, DX
- MOVQ DX, R11
-
- CMPQ R13, R11
- CMOVQCC R8, R13
-
- VMOVDQU BSWAP_SHUFB_CTL<>(SB), Y10
-
- CALC // RET is inside macros
-
-DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
-DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
-DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
-DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
-DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
-DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
-DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
-DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
-DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
-DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
-DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
-DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
-DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
-DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
-DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
-DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
-DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
-DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
-DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
-DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
-DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
-DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
-DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
-DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
-DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
-DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
-DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
-DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
-DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
-DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
-DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
-DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
-GLOBL K_XMM_AR<>(SB),RODATA,$128
+loop:
+ CMPQ R10, R8
+ JNE begin
+ VZEROUPPER
+ RET
-DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
-DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
-DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
-DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
-DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
-DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
-DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
-DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
-GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32
+begin:
+ MOVL SI, BX
+ RORXL $0x02, SI, SI
+ ANDNL AX, BX, BP
+ ANDL DI, BX
+ XORL BP, BX
+ ADDL (R15), DX
+ ANDNL DI, CX, BP
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VMOVDQU 128(R10), X0
+ ANDL SI, CX
+ XORL BP, CX
+ LEAL (DX)(R12*1), DX
+ ADDL 4(R15), AX
+ ANDNL SI, DX, BP
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VINSERTI128 $0x01, 128(R13), Y0, Y0
+ ANDL BX, DX
+ XORL BP, DX
+ LEAL (AX)(R12*1), AX
+ ADDL 8(R15), DI
+ ANDNL BX, AX, BP
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPSHUFB Y10, Y0, Y15
+ ANDL CX, AX
+ XORL BP, AX
+ LEAL (DI)(R12*1), DI
+ ADDL 12(R15), SI
+ ANDNL CX, DI, BP
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ ANDL DX, DI
+ XORL BP, DI
+ LEAL (SI)(R12*1), SI
+ ADDL 32(R15), BX
+ ANDNL DX, SI, BP
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VPADDD (R8), Y15, Y0
+ ANDL AX, SI
+ XORL BP, SI
+ LEAL (BX)(R12*1), BX
+ ADDL 36(R15), CX
+ ANDNL AX, BX, BP
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ ANDL DI, BX
+ XORL BP, BX
+ LEAL (CX)(R12*1), CX
+ ADDL 40(R15), DX
+ ANDNL DI, CX, BP
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ ANDL SI, CX
+ XORL BP, CX
+ LEAL (DX)(R12*1), DX
+ ADDL 44(R15), AX
+ ANDNL SI, DX, BP
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VMOVDQU Y0, (R14)
+ ANDL BX, DX
+ XORL BP, DX
+ LEAL (AX)(R12*1), AX
+ ADDL 64(R15), DI
+ ANDNL BX, AX, BP
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VMOVDQU 144(R10), X0
+ ANDL CX, AX
+ XORL BP, AX
+ LEAL (DI)(R12*1), DI
+ ADDL 68(R15), SI
+ ANDNL CX, DI, BP
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VINSERTI128 $0x01, 144(R13), Y0, Y0
+ ANDL DX, DI
+ XORL BP, DI
+ LEAL (SI)(R12*1), SI
+ ADDL 72(R15), BX
+ ANDNL DX, SI, BP
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VPSHUFB Y10, Y0, Y14
+ ANDL AX, SI
+ XORL BP, SI
+ LEAL (BX)(R12*1), BX
+ ADDL 76(R15), CX
+ ANDNL AX, BX, BP
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ ANDL DI, BX
+ XORL BP, BX
+ LEAL (CX)(R12*1), CX
+ ADDL 96(R15), DX
+ ANDNL DI, CX, BP
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VPADDD (R8), Y14, Y0
+ ANDL SI, CX
+ XORL BP, CX
+ LEAL (DX)(R12*1), DX
+ ADDL 100(R15), AX
+ ANDNL SI, DX, BP
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ ANDL BX, DX
+ XORL BP, DX
+ LEAL (AX)(R12*1), AX
+ ADDL 104(R15), DI
+ ANDNL BX, AX, BP
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ ANDL CX, AX
+ XORL BP, AX
+ LEAL (DI)(R12*1), DI
+ ADDL 108(R15), SI
+ ANDNL CX, DI, BP
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VMOVDQU Y0, 32(R14)
+ ANDL DX, DI
+ XORL BP, DI
+ LEAL (SI)(R12*1), SI
+ ADDL 128(R15), BX
+ ANDNL DX, SI, BP
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VMOVDQU 160(R10), X0
+ ANDL AX, SI
+ XORL BP, SI
+ LEAL (BX)(R12*1), BX
+ ADDL 132(R15), CX
+ ANDNL AX, BX, BP
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VINSERTI128 $0x01, 160(R13), Y0, Y0
+ ANDL DI, BX
+ XORL BP, BX
+ LEAL (CX)(R12*1), CX
+ ADDL 136(R15), DX
+ ANDNL DI, CX, BP
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VPSHUFB Y10, Y0, Y13
+ ANDL SI, CX
+ XORL BP, CX
+ LEAL (DX)(R12*1), DX
+ ADDL 140(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
+ ADDL 160(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPADDD (R8), Y13, Y0
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 164(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ XORL DX, DI
+ ADDL R12, SI
+ XORL CX, DI
+ ADDL 168(R15), BX
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ XORL AX, SI
+ ADDL R12, BX
+ XORL DX, SI
+ ADDL 172(R15), CX
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VMOVDQU Y0, 64(R14)
+ XORL DI, BX
+ ADDL R12, CX
+ XORL AX, BX
+ ADDL 192(R15), DX
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VMOVDQU 176(R10), X0
+ XORL SI, CX
+ ADDL R12, DX
+ XORL DI, CX
+ ADDL 196(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VINSERTI128 $0x01, 176(R13), Y0, Y0
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
+ ADDL 200(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPSHUFB Y10, Y0, Y12
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 204(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ XORL DX, DI
+ ADDL R12, SI
+ XORL CX, DI
+ ADDL 224(R15), BX
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VPADDD (R8), Y12, Y0
+ XORL AX, SI
+ ADDL R12, BX
+ XORL DX, SI
+ ADDL 228(R15), CX
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ XORL DI, BX
+ ADDL R12, CX
+ XORL AX, BX
+ ADDL 232(R15), DX
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ XORL SI, CX
+ ADDL R12, DX
+ XORL DI, CX
+ ADDL 236(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VMOVDQU Y0, 96(R14)
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
+ ADDL 256(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPALIGNR $0x08, Y15, Y14, Y8
+ VPSRLDQ $0x04, Y12, Y0
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 260(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPXOR Y13, Y8, Y8
+ VPXOR Y15, Y0, Y0
+ XORL DX, DI
+ ADDL R12, SI
+ XORL CX, DI
+ ADDL 264(R15), BX
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VPXOR Y0, Y8, Y8
+ VPSLLDQ $0x0c, Y8, Y9
+ XORL AX, SI
+ ADDL R12, BX
+ XORL DX, SI
+ ADDL 268(R15), CX
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPSLLD $0x01, Y8, Y0
+ VPSRLD $0x1f, Y8, Y8
+ XORL DI, BX
+ ADDL R12, CX
+ XORL AX, BX
+ ADDL 288(R15), DX
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VPOR Y8, Y0, Y0
+ VPSLLD $0x02, Y9, Y8
+ XORL SI, CX
+ ADDL R12, DX
+ XORL DI, CX
+ ADDL 292(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPSRLD $0x1e, Y9, Y9
+ VPXOR Y8, Y0, Y0
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
+ ADDL 296(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 300(R15), SI
+ VPXOR Y9, Y0, Y8
+ VPADDD (R8), Y8, Y0
+ VMOVDQU Y0, 128(R14)
+ LEAL (SI)(AX*1), SI
+ MOVL DX, BP
+ ORL DI, BP
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ ANDL CX, BP
+ ANDL DX, DI
+ ORL BP, DI
+ ADDL R12, SI
+ ADDL 320(R15), BX
+ VPALIGNR $0x08, Y14, Y13, Y7
+ VPSRLDQ $0x04, Y8, Y0
+ LEAL (BX)(DI*1), BX
+ MOVL AX, BP
+ ORL SI, BP
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ ANDL DX, BP
+ ANDL AX, SI
+ ORL BP, SI
+ ADDL R12, BX
+ ADDL 324(R15), CX
+ VPXOR Y12, Y7, Y7
+ VPXOR Y14, Y0, Y0
+ LEAL (CX)(SI*1), CX
+ MOVL DI, BP
+ ORL BX, BP
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ ANDL AX, BP
+ ANDL DI, BX
+ ORL BP, BX
+ ADDL R12, CX
+ ADDL 328(R15), DX
+ VPXOR Y0, Y7, Y7
+ VPSLLDQ $0x0c, Y7, Y9
+ LEAL (DX)(BX*1), DX
+ MOVL SI, BP
+ ORL CX, BP
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ ANDL DI, BP
+ ANDL SI, CX
+ ORL BP, CX
+ ADDL R12, DX
+ ADDL 332(R15), AX
+ VPSLLD $0x01, Y7, Y0
+ VPSRLD $0x1f, Y7, Y7
+ LEAL (AX)(CX*1), AX
+ MOVL BX, BP
+ ORL DX, BP
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ ANDL SI, BP
+ ANDL BX, DX
+ ORL BP, DX
+ ADDL R12, AX
+ ADDL 352(R15), DI
+ VPOR Y7, Y0, Y0
+ VPSLLD $0x02, Y9, Y7
+ LEAL (DI)(DX*1), DI
+ MOVL CX, BP
+ ORL AX, BP
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ ANDL BX, BP
+ ANDL CX, AX
+ ORL BP, AX
+ ADDL R12, DI
+ ADDL 356(R15), SI
+ VPSRLD $0x1e, Y9, Y9
+ VPXOR Y7, Y0, Y0
+ LEAL (SI)(AX*1), SI
+ MOVL DX, BP
+ ORL DI, BP
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ ANDL CX, BP
+ ANDL DX, DI
+ ORL BP, DI
+ ADDL R12, SI
+ ADDL 360(R15), BX
+ LEAL (BX)(DI*1), BX
+ MOVL AX, BP
+ ORL SI, BP
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ ANDL DX, BP
+ ANDL AX, SI
+ ORL BP, SI
+ ADDL R12, BX
+ ADDL 364(R15), CX
+ VPXOR Y9, Y0, Y7
+ VPADDD 32(R8), Y7, Y0
+ VMOVDQU Y0, 160(R14)
+ LEAL (CX)(SI*1), CX
+ MOVL DI, BP
+ ORL BX, BP
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ ANDL AX, BP
+ ANDL DI, BX
+ ORL BP, BX
+ ADDL R12, CX
+ ADDL 384(R15), DX
+ VPALIGNR $0x08, Y13, Y12, Y5
+ VPSRLDQ $0x04, Y7, Y0
+ LEAL (DX)(BX*1), DX
+ MOVL SI, BP
+ ORL CX, BP
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ ANDL DI, BP
+ ANDL SI, CX
+ ORL BP, CX
+ ADDL R12, DX
+ ADDL 388(R15), AX
+ VPXOR Y8, Y5, Y5
+ VPXOR Y13, Y0, Y0
+ LEAL (AX)(CX*1), AX
+ MOVL BX, BP
+ ORL DX, BP
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ ANDL SI, BP
+ ANDL BX, DX
+ ORL BP, DX
+ ADDL R12, AX
+ ADDL 392(R15), DI
+ VPXOR Y0, Y5, Y5
+ VPSLLDQ $0x0c, Y5, Y9
+ LEAL (DI)(DX*1), DI
+ MOVL CX, BP
+ ORL AX, BP
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ ANDL BX, BP
+ ANDL CX, AX
+ ORL BP, AX
+ ADDL R12, DI
+ ADDL 396(R15), SI
+ VPSLLD $0x01, Y5, Y0
+ VPSRLD $0x1f, Y5, Y5
+ LEAL (SI)(AX*1), SI
+ MOVL DX, BP
+ ORL DI, BP
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ ANDL CX, BP
+ ANDL DX, DI
+ ORL BP, DI
+ ADDL R12, SI
+ ADDL 416(R15), BX
+ VPOR Y5, Y0, Y0
+ VPSLLD $0x02, Y9, Y5
+ LEAL (BX)(DI*1), BX
+ MOVL AX, BP
+ ORL SI, BP
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ ANDL DX, BP
+ ANDL AX, SI
+ ORL BP, SI
+ ADDL R12, BX
+ ADDL 420(R15), CX
+ VPSRLD $0x1e, Y9, Y9
+ VPXOR Y5, Y0, Y0
+ LEAL (CX)(SI*1), CX
+ MOVL DI, BP
+ ORL BX, BP
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ ANDL AX, BP
+ ANDL DI, BX
+ ORL BP, BX
+ ADDL R12, CX
+ ADDL 424(R15), DX
+ LEAL (DX)(BX*1), DX
+ MOVL SI, BP
+ ORL CX, BP
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ ANDL DI, BP
+ ANDL SI, CX
+ ORL BP, CX
+ ADDL R12, DX
+ ADDL 428(R15), AX
+ VPXOR Y9, Y0, Y5
+ VPADDD 32(R8), Y5, Y0
+ VMOVDQU Y0, 192(R14)
+ LEAL (AX)(CX*1), AX
+ MOVL BX, BP
+ ORL DX, BP
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ ANDL SI, BP
+ ANDL BX, DX
+ ORL BP, DX
+ ADDL R12, AX
+ ADDL 448(R15), DI
+ VPALIGNR $0x08, Y12, Y8, Y3
+ VPSRLDQ $0x04, Y5, Y0
+ LEAL (DI)(DX*1), DI
+ MOVL CX, BP
+ ORL AX, BP
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ ANDL BX, BP
+ ANDL CX, AX
+ ORL BP, AX
+ ADDL R12, DI
+ ADDL 452(R15), SI
+ VPXOR Y7, Y3, Y3
+ VPXOR Y12, Y0, Y0
+ LEAL (SI)(AX*1), SI
+ MOVL DX, BP
+ ORL DI, BP
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ ANDL CX, BP
+ ANDL DX, DI
+ ORL BP, DI
+ ADDL R12, SI
+ ADDL 456(R15), BX
+ VPXOR Y0, Y3, Y3
+ VPSLLDQ $0x0c, Y3, Y9
+ LEAL (BX)(DI*1), BX
+ MOVL AX, BP
+ ORL SI, BP
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ ANDL DX, BP
+ ANDL AX, SI
+ ORL BP, SI
+ ADDL R12, BX
+ ADDL 460(R15), CX
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPSLLD $0x01, Y3, Y0
+ VPSRLD $0x1f, Y3, Y3
+ XORL DI, BX
+ ADDL R12, CX
+ XORL AX, BX
+ ADDQ $0x80, R10
+ CMPQ R10, R11
+ CMOVQCC R8, R10
+ ADDL 480(R15), DX
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VPOR Y3, Y0, Y0
+ VPSLLD $0x02, Y9, Y3
+ XORL SI, CX
+ ADDL R12, DX
+ XORL DI, CX
+ ADDL 484(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPSRLD $0x1e, Y9, Y9
+ VPXOR Y3, Y0, Y0
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
+ ADDL 488(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 492(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPXOR Y9, Y0, Y3
+ VPADDD 32(R8), Y3, Y0
+ VMOVDQU Y0, 224(R14)
+ XORL DX, DI
+ ADDL R12, SI
+ XORL CX, DI
+ ADDL 512(R15), BX
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VPALIGNR $0x08, Y5, Y3, Y0
+ XORL AX, SI
+ ADDL R12, BX
+ XORL DX, SI
+ ADDL 516(R15), CX
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPXOR Y14, Y15, Y15
+ XORL DI, BX
+ ADDL R12, CX
+ XORL AX, BX
+ ADDL 520(R15), DX
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VPXOR Y8, Y0, Y0
+ XORL SI, CX
+ ADDL R12, DX
+ XORL DI, CX
+ ADDL 524(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPXOR Y0, Y15, Y15
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
+ ADDL 544(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPSLLD $0x02, Y15, Y0
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 548(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPSRLD $0x1e, Y15, Y15
+ VPOR Y15, Y0, Y15
+ XORL DX, DI
+ ADDL R12, SI
+ XORL CX, DI
+ ADDL 552(R15), BX
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ XORL AX, SI
+ ADDL R12, BX
+ XORL DX, SI
+ ADDL 556(R15), CX
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPADDD 32(R8), Y15, Y0
+ VMOVDQU Y0, 256(R14)
+ XORL DI, BX
+ ADDL R12, CX
+ XORL AX, BX
+ ADDL 576(R15), DX
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VPALIGNR $0x08, Y3, Y15, Y0
+ XORL SI, CX
+ ADDL R12, DX
+ XORL DI, CX
+ ADDL 580(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPXOR Y13, Y14, Y14
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
+ ADDL 584(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPXOR Y7, Y0, Y0
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 588(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPXOR Y0, Y14, Y14
+ XORL DX, DI
+ ADDL R12, SI
+ XORL CX, DI
+ ADDL 608(R15), BX
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VPSLLD $0x02, Y14, Y0
+ XORL AX, SI
+ ADDL R12, BX
+ XORL DX, SI
+ ADDL 612(R15), CX
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPSRLD $0x1e, Y14, Y14
+ VPOR Y14, Y0, Y14
+ XORL DI, BX
+ ADDL R12, CX
+ XORL AX, BX
+ ADDL 616(R15), DX
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ XORL SI, CX
+ ADDL R12, DX
+ XORL DI, CX
+ ADDL 620(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ VPADDD 32(R8), Y14, Y0
+ VMOVDQU Y0, 288(R14)
+ ADDL R12, AX
+ ADDL (R9), AX
+ MOVL AX, (R9)
+ ADDL 4(R9), DX
+ MOVL DX, 4(R9)
+ ADDL 8(R9), BX
+ MOVL BX, 8(R9)
+ ADDL 12(R9), SI
+ MOVL SI, 12(R9)
+ ADDL 16(R9), DI
+ MOVL DI, 16(R9)
+ CMPQ R10, R8
+ JE loop
+ MOVL DX, CX
+ MOVL CX, DX
+ RORXL $0x02, CX, CX
+ ANDNL SI, DX, BP
+ ANDL BX, DX
+ XORL BP, DX
+ ADDL 16(R15), DI
+ ANDNL BX, AX, BP
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPALIGNR $0x08, Y15, Y14, Y0
+ ANDL CX, AX
+ XORL BP, AX
+ LEAL (DI)(R12*1), DI
+ ADDL 20(R15), SI
+ ANDNL CX, DI, BP
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPXOR Y12, Y13, Y13
+ ANDL DX, DI
+ XORL BP, DI
+ LEAL (SI)(R12*1), SI
+ ADDL 24(R15), BX
+ ANDNL DX, SI, BP
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VPXOR Y5, Y0, Y0
+ ANDL AX, SI
+ XORL BP, SI
+ LEAL (BX)(R12*1), BX
+ ADDL 28(R15), CX
+ ANDNL AX, BX, BP
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPXOR Y0, Y13, Y13
+ ANDL DI, BX
+ XORL BP, BX
+ LEAL (CX)(R12*1), CX
+ ADDL 48(R15), DX
+ ANDNL DI, CX, BP
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VPSLLD $0x02, Y13, Y0
+ ANDL SI, CX
+ XORL BP, CX
+ LEAL (DX)(R12*1), DX
+ ADDL 52(R15), AX
+ ANDNL SI, DX, BP
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPSRLD $0x1e, Y13, Y13
+ VPOR Y13, Y0, Y13
+ ANDL BX, DX
+ XORL BP, DX
+ LEAL (AX)(R12*1), AX
+ ADDL 56(R15), DI
+ ANDNL BX, AX, BP
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ ANDL CX, AX
+ XORL BP, AX
+ LEAL (DI)(R12*1), DI
+ ADDL 60(R15), SI
+ ANDNL CX, DI, BP
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPADDD 64(R8), Y13, Y0
+ VMOVDQU Y0, 320(R14)
+ ANDL DX, DI
+ XORL BP, DI
+ LEAL (SI)(R12*1), SI
+ ADDL 80(R15), BX
+ ANDNL DX, SI, BP
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VPALIGNR $0x08, Y14, Y13, Y0
+ ANDL AX, SI
+ XORL BP, SI
+ LEAL (BX)(R12*1), BX
+ ADDL 84(R15), CX
+ ANDNL AX, BX, BP
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPXOR Y8, Y12, Y12
+ ANDL DI, BX
+ XORL BP, BX
+ LEAL (CX)(R12*1), CX
+ ADDL 88(R15), DX
+ ANDNL DI, CX, BP
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VPXOR Y3, Y0, Y0
+ ANDL SI, CX
+ XORL BP, CX
+ LEAL (DX)(R12*1), DX
+ ADDL 92(R15), AX
+ ANDNL SI, DX, BP
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPXOR Y0, Y12, Y12
+ ANDL BX, DX
+ XORL BP, DX
+ LEAL (AX)(R12*1), AX
+ ADDL 112(R15), DI
+ ANDNL BX, AX, BP
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPSLLD $0x02, Y12, Y0
+ ANDL CX, AX
+ XORL BP, AX
+ LEAL (DI)(R12*1), DI
+ ADDL 116(R15), SI
+ ANDNL CX, DI, BP
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPSRLD $0x1e, Y12, Y12
+ VPOR Y12, Y0, Y12
+ ANDL DX, DI
+ XORL BP, DI
+ LEAL (SI)(R12*1), SI
+ ADDL 120(R15), BX
+ ANDNL DX, SI, BP
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ ANDL AX, SI
+ XORL BP, SI
+ LEAL (BX)(R12*1), BX
+ ADDL 124(R15), CX
+ ANDNL AX, BX, BP
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPADDD 64(R8), Y12, Y0
+ VMOVDQU Y0, 352(R14)
+ ANDL DI, BX
+ XORL BP, BX
+ LEAL (CX)(R12*1), CX
+ ADDL 144(R15), DX
+ ANDNL DI, CX, BP
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VPALIGNR $0x08, Y13, Y12, Y0
+ ANDL SI, CX
+ XORL BP, CX
+ LEAL (DX)(R12*1), DX
+ ADDL 148(R15), AX
+ ANDNL SI, DX, BP
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPXOR Y7, Y8, Y8
+ ANDL BX, DX
+ XORL BP, DX
+ LEAL (AX)(R12*1), AX
+ ADDL 152(R15), DI
+ ANDNL BX, AX, BP
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPXOR Y15, Y0, Y0
+ ANDL CX, AX
+ XORL BP, AX
+ LEAL (DI)(R12*1), DI
+ ADDL 156(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPXOR Y0, Y8, Y8
+ XORL DX, DI
+ ADDL R12, SI
+ XORL CX, DI
+ ADDL 176(R15), BX
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VPSLLD $0x02, Y8, Y0
+ XORL AX, SI
+ ADDL R12, BX
+ XORL DX, SI
+ ADDL 180(R15), CX
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPSRLD $0x1e, Y8, Y8
+ VPOR Y8, Y0, Y8
+ XORL DI, BX
+ ADDL R12, CX
+ XORL AX, BX
+ ADDL 184(R15), DX
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ XORL SI, CX
+ ADDL R12, DX
+ XORL DI, CX
+ ADDL 188(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPADDD 64(R8), Y8, Y0
+ VMOVDQU Y0, 384(R14)
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
+ ADDL 208(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPALIGNR $0x08, Y12, Y8, Y0
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 212(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPXOR Y5, Y7, Y7
+ XORL DX, DI
+ ADDL R12, SI
+ XORL CX, DI
+ ADDL 216(R15), BX
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VPXOR Y14, Y0, Y0
+ XORL AX, SI
+ ADDL R12, BX
+ XORL DX, SI
+ ADDL 220(R15), CX
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPXOR Y0, Y7, Y7
+ XORL DI, BX
+ ADDL R12, CX
+ XORL AX, BX
+ ADDL 240(R15), DX
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VPSLLD $0x02, Y7, Y0
+ XORL SI, CX
+ ADDL R12, DX
+ XORL DI, CX
+ ADDL 244(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPSRLD $0x1e, Y7, Y7
+ VPOR Y7, Y0, Y7
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
+ ADDL 248(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 252(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPADDD 64(R8), Y7, Y0
+ VMOVDQU Y0, 416(R14)
+ XORL DX, DI
+ ADDL R12, SI
+ XORL CX, DI
+ ADDL 272(R15), BX
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VPALIGNR $0x08, Y8, Y7, Y0
+ XORL AX, SI
+ ADDL R12, BX
+ XORL DX, SI
+ ADDL 276(R15), CX
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPXOR Y3, Y5, Y5
+ XORL DI, BX
+ ADDL R12, CX
+ XORL AX, BX
+ ADDL 280(R15), DX
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VPXOR Y13, Y0, Y0
+ XORL SI, CX
+ ADDL R12, DX
+ XORL DI, CX
+ ADDL 284(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPXOR Y0, Y5, Y5
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
+ ADDL 304(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPSLLD $0x02, Y5, Y0
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 308(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPSRLD $0x1e, Y5, Y5
+ VPOR Y5, Y0, Y5
+ XORL DX, DI
+ ADDL R12, SI
+ XORL CX, DI
+ ADDL 312(R15), BX
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ XORL AX, SI
+ ADDL R12, BX
+ XORL DX, SI
+ ADDL 316(R15), CX
+ VPADDD 64(R8), Y5, Y0
+ VMOVDQU Y0, 448(R14)
+ LEAL (CX)(SI*1), CX
+ MOVL DI, BP
+ ORL BX, BP
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ ANDL AX, BP
+ ANDL DI, BX
+ ORL BP, BX
+ ADDL R12, CX
+ ADDL 336(R15), DX
+ VPALIGNR $0x08, Y7, Y5, Y0
+ LEAL (DX)(BX*1), DX
+ MOVL SI, BP
+ ORL CX, BP
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ ANDL DI, BP
+ ANDL SI, CX
+ ORL BP, CX
+ ADDL R12, DX
+ ADDL 340(R15), AX
+ VPXOR Y15, Y3, Y3
+ LEAL (AX)(CX*1), AX
+ MOVL BX, BP
+ ORL DX, BP
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ ANDL SI, BP
+ ANDL BX, DX
+ ORL BP, DX
+ ADDL R12, AX
+ ADDL 344(R15), DI
+ VPXOR Y12, Y0, Y0
+ LEAL (DI)(DX*1), DI
+ MOVL CX, BP
+ ORL AX, BP
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ ANDL BX, BP
+ ANDL CX, AX
+ ORL BP, AX
+ ADDL R12, DI
+ ADDL 348(R15), SI
+ VPXOR Y0, Y3, Y3
+ LEAL (SI)(AX*1), SI
+ MOVL DX, BP
+ ORL DI, BP
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ ANDL CX, BP
+ ANDL DX, DI
+ ORL BP, DI
+ ADDL R12, SI
+ ADDL 368(R15), BX
+ VPSLLD $0x02, Y3, Y0
+ LEAL (BX)(DI*1), BX
+ MOVL AX, BP
+ ORL SI, BP
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ ANDL DX, BP
+ ANDL AX, SI
+ ORL BP, SI
+ ADDL R12, BX
+ ADDL 372(R15), CX
+ VPSRLD $0x1e, Y3, Y3
+ VPOR Y3, Y0, Y3
+ LEAL (CX)(SI*1), CX
+ MOVL DI, BP
+ ORL BX, BP
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ ANDL AX, BP
+ ANDL DI, BX
+ ORL BP, BX
+ ADDL R12, CX
+ ADDL 376(R15), DX
+ LEAL (DX)(BX*1), DX
+ MOVL SI, BP
+ ORL CX, BP
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ ANDL DI, BP
+ ANDL SI, CX
+ ORL BP, CX
+ ADDL R12, DX
+ ADDL 380(R15), AX
+ VPADDD 96(R8), Y3, Y0
+ VMOVDQU Y0, 480(R14)
+ LEAL (AX)(CX*1), AX
+ MOVL BX, BP
+ ORL DX, BP
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ ANDL SI, BP
+ ANDL BX, DX
+ ORL BP, DX
+ ADDL R12, AX
+ ADDL 400(R15), DI
+ VPALIGNR $0x08, Y5, Y3, Y0
+ LEAL (DI)(DX*1), DI
+ MOVL CX, BP
+ ORL AX, BP
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ ANDL BX, BP
+ ANDL CX, AX
+ ORL BP, AX
+ ADDL R12, DI
+ ADDL 404(R15), SI
+ VPXOR Y14, Y15, Y15
+ LEAL (SI)(AX*1), SI
+ MOVL DX, BP
+ ORL DI, BP
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ ANDL CX, BP
+ ANDL DX, DI
+ ORL BP, DI
+ ADDL R12, SI
+ ADDL 408(R15), BX
+ VPXOR Y8, Y0, Y0
+ LEAL (BX)(DI*1), BX
+ MOVL AX, BP
+ ORL SI, BP
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ ANDL DX, BP
+ ANDL AX, SI
+ ORL BP, SI
+ ADDL R12, BX
+ ADDL 412(R15), CX
+ VPXOR Y0, Y15, Y15
+ LEAL (CX)(SI*1), CX
+ MOVL DI, BP
+ ORL BX, BP
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ ANDL AX, BP
+ ANDL DI, BX
+ ORL BP, BX
+ ADDL R12, CX
+ ADDL 432(R15), DX
+ VPSLLD $0x02, Y15, Y0
+ LEAL (DX)(BX*1), DX
+ MOVL SI, BP
+ ORL CX, BP
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ ANDL DI, BP
+ ANDL SI, CX
+ ORL BP, CX
+ ADDL R12, DX
+ ADDL 436(R15), AX
+ VPSRLD $0x1e, Y15, Y15
+ VPOR Y15, Y0, Y15
+ LEAL (AX)(CX*1), AX
+ MOVL BX, BP
+ ORL DX, BP
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ ANDL SI, BP
+ ANDL BX, DX
+ ORL BP, DX
+ ADDL R12, AX
+ ADDL 440(R15), DI
+ LEAL (DI)(DX*1), DI
+ MOVL CX, BP
+ ORL AX, BP
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ ANDL BX, BP
+ ANDL CX, AX
+ ORL BP, AX
+ ADDL R12, DI
+ ADDL 444(R15), SI
+ VPADDD 96(R8), Y15, Y0
+ VMOVDQU Y0, 512(R14)
+ LEAL (SI)(AX*1), SI
+ MOVL DX, BP
+ ORL DI, BP
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ ANDL CX, BP
+ ANDL DX, DI
+ ORL BP, DI
+ ADDL R12, SI
+ ADDL 464(R15), BX
+ VPALIGNR $0x08, Y3, Y15, Y0
+ LEAL (BX)(DI*1), BX
+ MOVL AX, BP
+ ORL SI, BP
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ ANDL DX, BP
+ ANDL AX, SI
+ ORL BP, SI
+ ADDL R12, BX
+ ADDL 468(R15), CX
+ VPXOR Y13, Y14, Y14
+ LEAL (CX)(SI*1), CX
+ MOVL DI, BP
+ ORL BX, BP
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ ANDL AX, BP
+ ANDL DI, BX
+ ORL BP, BX
+ ADDL R12, CX
+ ADDL 472(R15), DX
+ VPXOR Y7, Y0, Y0
+ LEAL (DX)(BX*1), DX
+ MOVL SI, BP
+ ORL CX, BP
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ ANDL DI, BP
+ ANDL SI, CX
+ ORL BP, CX
+ ADDL R12, DX
+ ADDL 476(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPXOR Y0, Y14, Y14
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
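+	// Advance the input pointer past the two blocks just scheduled; once it reaches
+	// the end (R11), latch the end-of-input marker by pointing R10 at the K table (R8).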
+ ADDQ $0x80, R13
+ CMPQ R13, R11
+ CMOVQCC R8, R10
+ ADDL 496(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPSLLD $0x02, Y14, Y0
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 500(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPSRLD $0x1e, Y14, Y14
+ VPOR Y14, Y0, Y14
+ XORL DX, DI
+ ADDL R12, SI
+ XORL CX, DI
+ ADDL 504(R15), BX
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ XORL AX, SI
+ ADDL R12, BX
+ XORL DX, SI
+ ADDL 508(R15), CX
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPADDD 96(R8), Y14, Y0
+ VMOVDQU Y0, 544(R14)
+ XORL DI, BX
+ ADDL R12, CX
+ XORL AX, BX
+ ADDL 528(R15), DX
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VPALIGNR $0x08, Y15, Y14, Y0
+ XORL SI, CX
+ ADDL R12, DX
+ XORL DI, CX
+ ADDL 532(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPXOR Y12, Y13, Y13
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
+ ADDL 536(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPXOR Y5, Y0, Y0
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 540(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPXOR Y0, Y13, Y13
+ XORL DX, DI
+ ADDL R12, SI
+ XORL CX, DI
+ ADDL 560(R15), BX
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VPSLLD $0x02, Y13, Y0
+ XORL AX, SI
+ ADDL R12, BX
+ XORL DX, SI
+ ADDL 564(R15), CX
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPSRLD $0x1e, Y13, Y13
+ VPOR Y13, Y0, Y13
+ XORL DI, BX
+ ADDL R12, CX
+ XORL AX, BX
+ ADDL 568(R15), DX
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ XORL SI, CX
+ ADDL R12, DX
+ XORL DI, CX
+ ADDL 572(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPADDD 96(R8), Y13, Y0
+ VMOVDQU Y0, 576(R14)
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
+ ADDL 592(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ VPALIGNR $0x08, Y14, Y13, Y0
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 596(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ RORXL $0x02, DI, AX
+ VPXOR Y8, Y12, Y12
+ XORL DX, DI
+ ADDL R12, SI
+ XORL CX, DI
+ ADDL 600(R15), BX
+ LEAL (BX)(DI*1), BX
+ RORXL $0x1b, SI, R12
+ RORXL $0x02, SI, DI
+ VPXOR Y3, Y0, Y0
+ XORL AX, SI
+ ADDL R12, BX
+ XORL DX, SI
+ ADDL 604(R15), CX
+ LEAL (CX)(SI*1), CX
+ RORXL $0x1b, BX, R12
+ RORXL $0x02, BX, SI
+ VPXOR Y0, Y12, Y12
+ XORL DI, BX
+ ADDL R12, CX
+ XORL AX, BX
+ ADDL 624(R15), DX
+ LEAL (DX)(BX*1), DX
+ RORXL $0x1b, CX, R12
+ RORXL $0x02, CX, BX
+ VPSLLD $0x02, Y12, Y0
+ XORL SI, CX
+ ADDL R12, DX
+ XORL DI, CX
+ ADDL 628(R15), AX
+ LEAL (AX)(CX*1), AX
+ RORXL $0x1b, DX, R12
+ RORXL $0x02, DX, CX
+ VPSRLD $0x1e, Y12, Y12
+ VPOR Y12, Y0, Y12
+ XORL BX, DX
+ ADDL R12, AX
+ XORL SI, DX
+ ADDL 632(R15), DI
+ LEAL (DI)(DX*1), DI
+ RORXL $0x1b, AX, R12
+ RORXL $0x02, AX, DX
+ XORL CX, AX
+ ADDL R12, DI
+ XORL BX, AX
+ ADDL 636(R15), SI
+ LEAL (SI)(AX*1), SI
+ RORXL $0x1b, DI, R12
+ VPADDD 96(R8), Y12, Y0
+ VMOVDQU Y0, 608(R14)
+ ADDL R12, SI
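+	// Fold the second block's working state back into the digest at (R9).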
+ ADDL (R9), SI
+ MOVL SI, (R9)
+ ADDL 4(R9), DI
+ MOVL DI, 4(R9)
+ ADDL 8(R9), DX
+ MOVL DX, 8(R9)
+ ADDL 12(R9), CX
+ MOVL CX, 12(R9)
+ ADDL 16(R9), BX
+ MOVL BX, 16(R9)
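+	// Rotate the working registers for the next iteration and swap the two schedule
+	// buffers (R14, R15) so the W+K values just stored become the ones consumed next.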
+ MOVL SI, R12
+ MOVL DI, SI
+ MOVL DX, DI
+ MOVL BX, DX
+ MOVL CX, AX
+ MOVL R12, CX
+ XCHGQ R15, R14
+ JMP loop
+
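+// K_XMM_AR holds the four SHA-1 round constants, each repeated eight times so a
+// single 256-bit load broadcasts the constant across every dword lane.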
+DATA K_XMM_AR<>+0(SB)/4, $0x5a827999
+DATA K_XMM_AR<>+4(SB)/4, $0x5a827999
+DATA K_XMM_AR<>+8(SB)/4, $0x5a827999
+DATA K_XMM_AR<>+12(SB)/4, $0x5a827999
+DATA K_XMM_AR<>+16(SB)/4, $0x5a827999
+DATA K_XMM_AR<>+20(SB)/4, $0x5a827999
+DATA K_XMM_AR<>+24(SB)/4, $0x5a827999
+DATA K_XMM_AR<>+28(SB)/4, $0x5a827999
+DATA K_XMM_AR<>+32(SB)/4, $0x6ed9eba1
+DATA K_XMM_AR<>+36(SB)/4, $0x6ed9eba1
+DATA K_XMM_AR<>+40(SB)/4, $0x6ed9eba1
+DATA K_XMM_AR<>+44(SB)/4, $0x6ed9eba1
+DATA K_XMM_AR<>+48(SB)/4, $0x6ed9eba1
+DATA K_XMM_AR<>+52(SB)/4, $0x6ed9eba1
+DATA K_XMM_AR<>+56(SB)/4, $0x6ed9eba1
+DATA K_XMM_AR<>+60(SB)/4, $0x6ed9eba1
+DATA K_XMM_AR<>+64(SB)/4, $0x8f1bbcdc
+DATA K_XMM_AR<>+68(SB)/4, $0x8f1bbcdc
+DATA K_XMM_AR<>+72(SB)/4, $0x8f1bbcdc
+DATA K_XMM_AR<>+76(SB)/4, $0x8f1bbcdc
+DATA K_XMM_AR<>+80(SB)/4, $0x8f1bbcdc
+DATA K_XMM_AR<>+84(SB)/4, $0x8f1bbcdc
+DATA K_XMM_AR<>+88(SB)/4, $0x8f1bbcdc
+DATA K_XMM_AR<>+92(SB)/4, $0x8f1bbcdc
+DATA K_XMM_AR<>+96(SB)/4, $0xca62c1d6
+DATA K_XMM_AR<>+100(SB)/4, $0xca62c1d6
+DATA K_XMM_AR<>+104(SB)/4, $0xca62c1d6
+DATA K_XMM_AR<>+108(SB)/4, $0xca62c1d6
+DATA K_XMM_AR<>+112(SB)/4, $0xca62c1d6
+DATA K_XMM_AR<>+116(SB)/4, $0xca62c1d6
+DATA K_XMM_AR<>+120(SB)/4, $0xca62c1d6
+DATA K_XMM_AR<>+124(SB)/4, $0xca62c1d6
+GLOBL K_XMM_AR<>(SB), RODATA, $128
+
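+// BSWAP_SHUFB_CTL is the VPSHUFB mask that reverses the bytes of each 32-bit word,
+// converting the big-endian message words to the host byte order.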
+DATA BSWAP_SHUFB_CTL<>+0(SB)/4, $0x00010203
+DATA BSWAP_SHUFB_CTL<>+4(SB)/4, $0x04050607
+DATA BSWAP_SHUFB_CTL<>+8(SB)/4, $0x08090a0b
+DATA BSWAP_SHUFB_CTL<>+12(SB)/4, $0x0c0d0e0f
+DATA BSWAP_SHUFB_CTL<>+16(SB)/4, $0x00010203
+DATA BSWAP_SHUFB_CTL<>+20(SB)/4, $0x04050607
+DATA BSWAP_SHUFB_CTL<>+24(SB)/4, $0x08090a0b
+DATA BSWAP_SHUFB_CTL<>+28(SB)/4, $0x0c0d0e0f
+GLOBL BSWAP_SHUFB_CTL<>(SB), RODATA, $32