From: Garrett Bodley
Date: Fri, 28 Jun 2024 04:46:32 +0000 (-0400)
Subject: crypto/sha256: Avo port of sha256block_amd64.s
X-Git-Tag: go1.24rc1~1012
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=dbd50a16dc79d0876f9c523af2aaeb9e1b7b7839;p=gostls13.git

crypto/sha256: Avo port of sha256block_amd64.s

This implementation utilizes the same registers found in the reference
implementation, aiming to produce a minimal semantic diff between the
Avo-generated output and the original hand-written assembly.

To verify the Avo implementation, the reference and Avo-generated assembly
files are fed to `go tool asm`, capturing the debug output into corresponding
temp files. The debug output contains supplementary metadata (line numbers,
instruction offsets, and source file references) that must be removed in
order to obtain a semantic diff of the two files. This is accomplished via a
small utility script written in awk.

Commands used to verify Avo output:

    GOROOT=$(go env GOROOT)
    ASM_PATH="src/crypto/sha256/sha256block_amd64.s"
    REFERENCE="54fe0fd43fcf8609666c16ae6d15ed92873b1564"

    go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
        <(git cat-file -p "$REFERENCE:$ASM_PATH") \
        > /tmp/reference.s

    go tool asm -o /dev/null -I $GOROOT/src/runtime -debug \
        "$ASM_PATH" \
        > /tmp/avo.s

    normalize(){
        awk '{ $1=$2=$3=""; print substr($0,4) }'
    }

    diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s)

    3513c3513
    < MOVQ $K256<>(SB), BP
    ---
    > LEAQ K256<>(SB), BP
    4572c4572
    < MOVQ $K256<>(SB), BP
    ---
    > LEAQ K256<>(SB), BP

Change-Id: I637c01d746ca775b8a09f874f7925ffc3b4965ad
Reviewed-on: https://go-review.googlesource.com/c/go/+/595559
Reviewed-by: Russell Webb
Reviewed-by: Roland Shoemaker
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Filippo Valsorda
Reviewed-by: Dmitri Shuralyov
---

diff --git a/src/cmd/compile/internal/types2/stdlib_test.go b/src/cmd/compile/internal/types2/stdlib_test.go
index ed79b92c46..70bb0ae922 100644
--- a/src/cmd/compile/internal/types2/stdlib_test.go
+++ b/src/cmd/compile/internal/types2/stdlib_test.go
@@ -355,8 +355,9 @@ var excluded = map[string]bool{
 	"builtin": true, // go.dev/issue/46027: some imports are missing for this submodule.
- "crypto/internal/edwards25519/field/_asm": true, "crypto/internal/bigmod/_asm": true, + "crypto/internal/edwards25519/field/_asm": true, + "crypto/sha256/_asm": true, } // printPackageMu synchronizes the printing of type-checked package files in diff --git a/src/crypto/sha256/_asm/go.mod b/src/crypto/sha256/_asm/go.mod new file mode 100644 index 0000000000..cd247e3510 --- /dev/null +++ b/src/crypto/sha256/_asm/go.mod @@ -0,0 +1,11 @@ +module std/crypto/sha256/_asm + +go 1.24 + +require github.com/mmcloughlin/avo v0.6.0 + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/src/crypto/sha256/_asm/go.sum b/src/crypto/sha256/_asm/go.sum new file mode 100644 index 0000000000..76af484b2e --- /dev/null +++ b/src/crypto/sha256/_asm/go.sum @@ -0,0 +1,8 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/src/crypto/sha256/_asm/sha256block_amd64_asm.go b/src/crypto/sha256/_asm/sha256block_amd64_asm.go new file mode 100644 index 0000000000..3c70e018ce --- /dev/null +++ b/src/crypto/sha256/_asm/sha256block_amd64_asm.go @@ -0,0 +1,1214 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" +) + +//go:generate go run . -out ../sha256block_amd64.s -pkg sha256 + +// SHA256 block routine. See sha256block.go for Go equivalent. +// +// The algorithm is detailed in FIPS 180-4: +// +// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf + +// The avx2-version is described in an Intel White-Paper: +// "Fast SHA-256 Implementations on Intel Architecture Processors" +// To find it, surf to http://www.intel.com/p/en_US/embedded +// and search for that title. 
+// AVX2 version by Intel, same algorithm as code in Linux kernel: +// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S +// by +// James Guilford +// Kirk Yap +// Tim Chen + +// Wt = Mt; for 0 <= t <= 15 +// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63 +// +// a = H0 +// b = H1 +// c = H2 +// d = H3 +// e = H4 +// f = H5 +// g = H6 +// h = H7 +// +// for t = 0 to 63 { +// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt +// T2 = BIGSIGMA0(a) + Maj(a,b,c) +// h = g +// g = f +// f = e +// e = d + T1 +// d = c +// c = b +// b = a +// a = T1 + T2 +// } +// +// H0 = a + H0 +// H1 = b + H1 +// H2 = c + H2 +// H3 = d + H3 +// H4 = e + H4 +// H5 = f + H5 +// H6 = g + H6 +// H7 = h + H7 + +func main() { + Package("crypto/sha256") + ConstraintExpr("!purego") + block() + Generate() +} + +// Wt = Mt; for 0 <= t <= 15 +func msgSchedule0(index int) { + MOVL(Mem{Base: SI}.Offset(index*4), EAX) + BSWAPL(EAX) + MOVL(EAX, Mem{Base: BP}.Offset(index*4)) +} + +// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63 +// +// SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x) +// SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x) +func msgSchedule1(index int) { + MOVL(Mem{Base: BP}.Offset((index-2)*4), EAX) + MOVL(EAX, ECX) + RORL(Imm(17), EAX) + MOVL(ECX, EDX) + RORL(Imm(19), ECX) + SHRL(Imm(10), EDX) + MOVL(Mem{Base: BP}.Offset((index-15)*4), EBX) + XORL(ECX, EAX) + MOVL(EBX, ECX) + XORL(EDX, EAX) + RORL(Imm(7), EBX) + MOVL(ECX, EDX) + SHRL(Imm(3), EDX) + RORL(Imm(18), ECX) + ADDL(Mem{Base: BP}.Offset((index-7)*4), EAX) + XORL(ECX, EBX) + XORL(EDX, EBX) + ADDL(Mem{Base: BP}.Offset((index-16)*4), EBX) + ADDL(EBX, EAX) + MOVL(EAX, Mem{Base: BP}.Offset((index)*4)) +} + +// Calculate T1 in AX - uses AX, CX and DX registers. +// h is also used as an accumulator. Wt is passed in AX. +// +// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt +// BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x) +// Ch(x, y, z) = (x AND y) XOR (NOT x AND z) +func sha256T1(konst uint32, e, f, g, h GPPhysical) { + ADDL(EAX, h) + MOVL(e, EAX) + ADDL(U32(konst), h) + MOVL(e, ECX) + RORL(U8(6), EAX) + MOVL(e, EDX) + RORL(U8(11), ECX) + XORL(ECX, EAX) + MOVL(e, ECX) + RORL(U8(25), EDX) + ANDL(f, ECX) + XORL(EAX, EDX) + MOVL(e, EAX) + NOTL(EAX) + ADDL(EDX, h) + ANDL(g, EAX) + XORL(ECX, EAX) + ADDL(h, EAX) +} + +// Calculate T2 in BX - uses BX, CX, DX and DI registers. +// +// T2 = BIGSIGMA0(a) + Maj(a, b, c) +// BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x) +// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) +func sha256T2(a, b, c GPPhysical) { + MOVL(a, EDI) + MOVL(c, EBX) + RORL(U8(2), EDI) + MOVL(a, EDX) + ANDL(b, EBX) + RORL(U8(13), EDX) + MOVL(a, ECX) + ANDL(c, ECX) + XORL(EDX, EDI) + XORL(ECX, EBX) + MOVL(a, EDX) + MOVL(b, ECX) + RORL(U8(22), EDX) + ANDL(a, ECX) + XORL(ECX, EBX) + XORL(EDX, EDI) + ADDL(EDI, EBX) +} + +// Calculate T1 and T2, then e = d + T1 and a = T1 + T2. +// The values for e and a are stored in d and h, ready for rotation. 
+func sha256Round(index int, konst uint32, a, b, c, d, e, f, g, h GPPhysical) { + sha256T1(konst, e, f, g, h) + sha256T2(a, b, c) + MOVL(EBX, h) + ADDL(EAX, d) + ADDL(EAX, h) +} + +func sha256Round0(index int, konst uint32, a, b, c, d, e, f, g, h GPPhysical) { + msgSchedule0(index) + sha256Round(index, konst, a, b, c, d, e, f, g, h) +} + +func sha256Round1(index int, konst uint32, a, b, c, d, e, f, g, h GPPhysical) { + msgSchedule1(index) + sha256Round(index, konst, a, b, c, d, e, f, g, h) +} + +// Definitions for AVX2 version + +// addm (mem), reg +// - Add reg to mem using reg-mem add and store +func addm(P1 Mem, P2 GPPhysical) { + ADDL(P2, P1) + MOVL(P1, P2) +} + +var ( + XDWORD0 VecPhysical = Y4 + XDWORD1 = Y5 + XDWORD2 = Y6 + XDWORD3 = Y7 + + XWORD0 = X4 + XWORD1 = X5 + XWORD2 = X6 + XWORD3 = X7 + + XTMP0 = Y0 + XTMP1 = Y1 + XTMP2 = Y2 + XTMP3 = Y3 + XTMP4 = Y8 + XTMP5 = Y11 + + XFER = Y9 + + BYTE_FLIP_MASK = Y13 // mask to convert LE -> BE + X_BYTE_FLIP_MASK = X13 + + NUM_BYTES GPPhysical = RDX + INP = RDI + + CTX = RSI // Beginning of digest in memory (a, b, c, ... , h) + + a = EAX + b = EBX + c = ECX + d = R8L + e = EDX + f = R9L + g = R10L + h = R11L + + old_h = R11L + + TBL = RBP + + SRND = RSI // SRND is same register as CTX + + T1 = R12L + + y0 = R13L + y1 = R14L + y2 = R15L + y3 = EDI + + // Offsets + XFER_SIZE = 2 * 64 * 4 + INP_END_SIZE = 8 + INP_SIZE = 8 + + _XFER = 0 + _INP_END = _XFER + XFER_SIZE + _INP = _INP_END + INP_END_SIZE + STACK_SIZE = _INP + INP_SIZE +) + +func roundAndSchedN0(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) { + // ############################# RND N + 0 ############################// + MOVL(a, y3) // y3 = a + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + + ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + VPALIGNR(Imm(4), XDWORD2, XDWORD3, XTMP0) // XTMP0 = W[-7] + MOVL(f, y2) // y2 = f + RORXL(Imm(13), a, T1) // T1 = a >> 13 + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + XORL(g, y2) // y2 = f^g + VPADDD(XDWORD0, XTMP0, XTMP0) // XTMP0 = W[-7] + W[-16] + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + + ANDL(e, y2) // y2 = (f^g)&e + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(22), a, y1) // y1 = a >> 22 + ADDL(h, d) // d = k + w + h + d + + ANDL(b, y3) // y3 = (a|c)&b + VPALIGNR(Imm(4), XDWORD0, XDWORD1, XTMP1) // XTMP1 = W[-15] + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + VPSRLD(Imm(7), XTMP1, XTMP2) // + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(c, T1) // T1 = a&c + + ADDL(y0, y2) // y2 = S1 + CH + VPSLLD(Imm(32-7), XTMP1, XTMP3) // + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 + VPOR(XTMP2, XTMP3, XTMP3) // XTMP3 = W[-15] ror 7 + + VPSRLD(Imm(18), XTMP1, XTMP2) + ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + ADDL(y3, h) // h = t1 + S0 + MAJ +} + +func roundAndSchedN1(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) { + // ################################### RND N + 1 ############################ + MOVL(a, y3) // y3 = a + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + + VPSRLD(Imm(3), XTMP1, XTMP4) // XTMP4 = 
W[-15] >> 3 + MOVL(f, y2) // y2 = f + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + XORL(g, y2) // y2 = f^g + + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(22), a, y1) // y1 = a >> 22 + ANDL(e, y2) // y2 = (f^g)&e + ADDL(h, d) // d = k + w + h + d + + VPSLLD(Imm(32-18), XTMP1, XTMP1) + ANDL(b, y3) // y3 = (a|c)&b + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + + VPXOR(XTMP1, XTMP3, XTMP3) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + + VPXOR(XTMP2, XTMP3, XTMP3) // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(c, T1) // T1 = a&c + ADDL(y0, y2) // y2 = S1 + CH + + VPXOR(XTMP4, XTMP3, XTMP1) // XTMP1 = s0 + VPSHUFD(Imm(0xFA), XDWORD3, XTMP2) // XTMP2 = W[-2] {BBAA} + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + + VPADDD(XTMP1, XTMP0, XTMP0) // XTMP0 = W[-16] + W[-7] + s0 + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 + ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + ADDL(y3, h) // h = t1 + S0 + MAJ + + VPSRLD(Imm(10), XTMP2, XTMP4) // XTMP4 = W[-2] >> 10 {BBAA} +} + +func roundAndSchedN2(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) { + // ################################### RND N + 2 ############################ + var shuff_00BA Mem = shuff_00BA_DATA() + + MOVL(a, y3) // y3 = a + RORXL(Imm(25), e, y0) // y0 = e >> 25 + ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h + + VPSRLQ(Imm(19), XTMP2, XTMP3) // XTMP3 = W[-2] ror 19 {xBxA} + RORXL(Imm(11), e, y1) // y1 = e >> 11 + ORL(c, y3) // y3 = a|c + MOVL(f, y2) // y2 = f + XORL(g, y2) // y2 = f^g + + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + VPSRLQ(Imm(17), XTMP2, XTMP2) // XTMP2 = W[-2] ror 17 {xBxA} + ANDL(e, y2) // y2 = (f^g)&e + + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + VPXOR(XTMP3, XTMP2, XTMP2) + ADDL(h, d) // d = k + w + h + d + ANDL(b, y3) // y3 = (a|c)&b + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(22), a, y1) // y1 = a >> 22 + VPXOR(XTMP2, XTMP4, XTMP4) // XTMP4 = s1 {xBxA} + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + + VPSHUFB(shuff_00BA, XTMP4, XTMP4) // XTMP4 = s1 {00BA} + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + VPADDD(XTMP4, XTMP0, XTMP0) // XTMP0 = {..., ..., W[1], W[0]} + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(c, T1) // T1 = a&c + ADDL(y0, y2) // y2 = S1 + CH + VPSHUFD(Imm(80), XTMP0, XTMP2) // XTMP2 = W[-2] {DDCC} + + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 + ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + + ADDL(y3, h) // h = t1 + S0 + MAJ +} + +func roundAndSchedN3(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) { + // ################################### RND N + 3 ############################ + var shuff_DC00 Mem = shuff_DC00_DATA() + + MOVL(a, y3) // y3 = a + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + + VPSRLD(Imm(10), XTMP2, XTMP5) // XTMP5 = W[-2] >> 10 {DDCC} + MOVL(f, y2) // y2 = f + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + XORL(g, y2) // y2 = 
f^g + + VPSRLQ(Imm(19), XTMP2, XTMP3) // XTMP3 = W[-2] ror 19 {xDxC} + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + ANDL(e, y2) // y2 = (f^g)&e + ADDL(h, d) // d = k + w + h + d + ANDL(b, y3) // y3 = (a|c)&b + + VPSRLQ(Imm(17), XTMP2, XTMP2) // XTMP2 = W[-2] ror 17 {xDxC} + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + + VPXOR(XTMP3, XTMP2, XTMP2) + RORXL(Imm(22), a, y1) // y1 = a >> 22 + ADDL(y0, y2) // y2 = S1 + CH + + VPXOR(XTMP2, XTMP5, XTMP5) // XTMP5 = s1 {xDxC} + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 + + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + + VPSHUFB(shuff_DC00, XTMP5, XTMP5) // XTMP5 = s1 {DC00} + + VPADDD(XTMP0, XTMP5, XDWORD0) // XDWORD0 = {W[3], W[2], W[1], W[0]} + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(c, T1) // T1 = a&c + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + + ADDL(y1, h) // h = k + w + h + S0 + ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + ADDL(y3, h) // h = t1 + S0 + MAJ +} + +func doRoundN0(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) { + // ################################### RND N + 0 ########################### + MOVL(f, y2) // y2 = f + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + XORL(g, y2) // y2 = f^g + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + ANDL(e, y2) // y2 = (f^g)&e + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + RORXL(Imm(22), a, y1) // y1 = a >> 22 + MOVL(a, y3) // y3 = a + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(b, y3) // y3 = (a|c)&b + ANDL(c, T1) // T1 = a&c + ADDL(y0, y2) // y2 = S1 + CH + + ADDL(h, d) // d = k + w + h + d + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 +} + +func doRoundN1(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) { + // ################################### RND N + 1 ########################### + ADDL(y2, old_h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + MOVL(f, y2) // y2 = f + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + XORL(g, y2) // y2 = f^g + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + ANDL(e, y2) // y2 = (f^g)&e + ADDL(y3, old_h) // h = t1 + S0 + MAJ + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + RORXL(Imm(22), a, y1) // y1 = a >> 22 + MOVL(a, y3) // y3 = a + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(b, y3) // y3 = (a|c)&b + ANDL(c, T1) // T1 = a&c + ADDL(y0, y2) // y2 = S1 + CH + + ADDL(h, d) // d = k + w + h + d + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 +} + +func doRoundN2(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) { + // ################################### RND N + 2 ############################## + ADDL(y2, 
old_h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + MOVL(f, y2) // y2 = f + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + XORL(g, y2) // y2 = f^g + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + ANDL(e, y2) // y2 = (f^g)&e + ADDL(y3, old_h) // h = t1 + S0 + MAJ + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + RORXL(Imm(22), a, y1) // y1 = a >> 22 + MOVL(a, y3) // y3 = a + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(b, y3) // y3 = (a|c)&b + ANDL(c, T1) // T1 = a&c + ADDL(y0, y2) // y2 = S1 + CH + + ADDL(h, d) // d = k + w + h + d + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 +} + +func doRoundN3(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) { + // ################################### RND N + 3 ########################### + ADDL(y2, old_h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + MOVL(f, y2) // y2 = f + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + XORL(g, y2) // y2 = f^g + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + ANDL(e, y2) // y2 = (f^g)&e + ADDL(y3, old_h) // h = t1 + S0 + MAJ + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + RORXL(Imm(22), a, y1) // y1 = a >> 22 + MOVL(a, y3) // y3 = a + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(b, y3) // y3 = (a|c)&b + ANDL(c, T1) // T1 = a&c + ADDL(y0, y2) // y2 = S1 + CH + + ADDL(h, d) // d = k + w + h + d + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 + + ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + + ADDL(y3, h) // h = t1 + S0 + MAJ +} + +// Definitions for sha-ni version +// +// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2 +// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version +// +// Reference +// S. Gulley, et al, "New Instructions Supporting the Secure Hash +// Algorithm on Intel® Architecture Processors", July 2013 +// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html +// + +var ( + digestPtr GPPhysical = RDI // input/output, base pointer to digest hash vector H0, H1, ..., H7 + dataPtr = RSI // input, base pointer to first input data block + numBytes = RDX // input, number of input bytes to be processed + sha256Constants = RAX // round contents from K256 table, indexed by round number x 32 + msg VecPhysical = X0 // input data + state0 = X1 // round intermediates and outputs + state1 = X2 + m0 = X3 // m0, m1,... 
m4 -- round message temps + m1 = X4 + m2 = X5 + m3 = X6 + m4 = X7 + shufMask = X8 // input data endian conversion control mask + abefSave = X9 // digest hash vector inter-block buffer abef + cdghSave = X10 // digest hash vector inter-block buffer cdgh +) + +// nop instead of final SHA256MSG1 for first and last few rounds +func nop(m, a VecPhysical) { +} + +// final SHA256MSG1 for middle rounds that require it +func sha256msg1(m, a VecPhysical) { + SHA256MSG1(m, a) +} + +// msg copy for all but rounds 12-15 +func vmov(a, b VecPhysical) { + VMOVDQA(a, b) +} + +// reverse copy for rounds 12-15 +func vmovrev(a, b VecPhysical) { + VMOVDQA(b, a) +} + +type VecFunc func(a, b VecPhysical) + +// sha rounds 0 to 11 +// +// identical with the exception of the final msg op +// which is replaced with a nop for rounds where it is not needed +// refer to Gulley, et al for more information +func rounds0to11(m, a VecPhysical, c int, sha256msg1 VecFunc) { + VMOVDQU(Mem{Base: dataPtr}.Offset(c*16), msg) + PSHUFB(shufMask, msg) + VMOVDQA(msg, m) + PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg) + SHA256RNDS2(msg, state0, state1) + PSHUFD(U8(0x0e), msg, msg) + SHA256RNDS2(msg, state1, state0) + sha256msg1(m, a) +} + +// sha rounds 12 to 59 +// +// identical with the exception of the final msg op +// and the reverse copy(m,msg) in round 12 which is required +// after the last data load +// refer to Gulley, et al for more information +func rounds12to59(m VecPhysical, c int, a, t VecPhysical, sha256msg1, movop VecFunc) { + movop(m, msg) + PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg) + SHA256RNDS2(msg, state0, state1) + VMOVDQA(m, m4) + PALIGNR(Imm(4), a, m4) + PADDD(m4, t) + SHA256MSG2(m, t) + PSHUFD(Imm(0x0e), msg, msg) + SHA256RNDS2(msg, state1, state0) + sha256msg1(m, a) +} + +func block() { + Implement("block") + AllocLocal(536) + + checkArchFlags() + sha256() + avx2() + sha_ni() +} + +func checkArchFlags() { + CMPB(Mem{Symbol: Symbol{Name: "·useSHA"}, Base: StaticBase}, Imm(1)) + JE(LabelRef("sha_ni")) + CMPB(Mem{Symbol: Symbol{Name: "·useAVX2"}, Base: StaticBase}, Imm(1)) + JE(LabelRef("avx2")) +} + +func sha256() { + Load(Param("p").Base(), RSI) + Load(Param("p").Len(), RDX) + SHRQ(Imm(6), RDX) + SHLQ(Imm(6), RDX) + + // Return if p is empty + LEAQ(Mem{Base: RSI, Index: RDX, Scale: 1}, RDI) + MOVQ(RDI, Mem{Base: SP}.Offset(256)) + CMPQ(RSI, RDI) + JEQ(LabelRef("end")) + + BP := Mem{Base: BP} + Load(Param("dig"), RBP) + MOVL(BP.Offset(0*4), R8L) // a = H0 + MOVL(BP.Offset(1*4), R9L) // b = H1 + MOVL(BP.Offset(2*4), R10L) // c = H2 + MOVL(BP.Offset(3*4), R11L) // d = H3 + MOVL(BP.Offset(4*4), R12L) // e = H4 + MOVL(BP.Offset(5*4), R13L) // f = H5 + MOVL(BP.Offset(6*4), R14L) // g = H6 + MOVL(BP.Offset(7*4), R15L) // h = H7 + + loop() + end() +} + +func rotateRight(slice *[]GPPhysical) []GPPhysical { + n := len(*slice) + new := make([]GPPhysical, n) + for i, reg := range *slice { + new[(i+1)%n] = reg + } + return new +} + +func loop() { + Label("loop") + MOVQ(RSP, RBP) + + regs := []GPPhysical{R8L, R9L, R10L, R11L, R12L, R13L, R14L, R15L} + n := len(_K) + + for i := 0; i < 16; i++ { + sha256Round0(i, _K[i], regs[0], regs[1], regs[2], regs[3], regs[4], regs[5], regs[6], regs[7]) + regs = rotateRight(®s) + } + + for i := 16; i < n; i++ { + sha256Round1(i, _K[i], regs[0], regs[1], regs[2], regs[3], regs[4], regs[5], regs[6], regs[7]) + regs = rotateRight(®s) + } + + Load(Param("dig"), RBP) + BP := Mem{Base: BP} + ADDL(BP.Offset(0*4), R8L) // H0 = a + H0 + MOVL(R8L, BP.Offset(0*4)) + 
ADDL(BP.Offset(1*4), R9L) // H1 = b + H1 + MOVL(R9L, BP.Offset(1*4)) + ADDL(BP.Offset(2*4), R10L) // H2 = c + H2 + MOVL(R10L, BP.Offset(2*4)) + ADDL(BP.Offset(3*4), R11L) // H3 = d + H3 + MOVL(R11L, BP.Offset(3*4)) + ADDL(BP.Offset(4*4), R12L) // H4 = e + H4 + MOVL(R12L, BP.Offset(4*4)) + ADDL(BP.Offset(5*4), R13L) // H5 = f + H5 + MOVL(R13L, BP.Offset(5*4)) + ADDL(BP.Offset(6*4), R14L) // H6 = g + H6 + MOVL(R14L, BP.Offset(6*4)) + ADDL(BP.Offset(7*4), R15L) // H7 = h + H7 + MOVL(R15L, BP.Offset(7*4)) + + ADDQ(Imm(64), RSI) + CMPQ(RSI, Mem{Base: SP}.Offset(256)) + JB(LabelRef("loop")) +} + +func end() { + Label("end") + RET() +} + +func avx2() { + Label("avx2") + Load(Param("dig"), CTX) // d.h[8] + Load(Param("p").Base(), INP) + Load(Param("p").Len(), NUM_BYTES) + + LEAQ(Mem{Base: INP, Index: NUM_BYTES, Scale: 1, Disp: -64}, NUM_BYTES) // Pointer to the last block + MOVQ(NUM_BYTES, Mem{Base: SP}.Offset(_INP_END)) + + CMPQ(NUM_BYTES, INP) + JE(LabelRef("avx2_only_one_block")) + + Comment("Load initial digest") + CTX := Mem{Base: CTX} + MOVL(CTX.Offset(0), a) // a = H0 + MOVL(CTX.Offset(4), b) // b = H1 + MOVL(CTX.Offset(8), c) // c = H2 + MOVL(CTX.Offset(12), d) // d = H3 + MOVL(CTX.Offset(16), e) // e = H4 + MOVL(CTX.Offset(20), f) // f = H5 + MOVL(CTX.Offset(24), g) // g = H6 + MOVL(CTX.Offset(28), h) // h = H7 + + avx2_loop0() + avx2_last_block_enter() + avx2_loop1() + avx2_loop2() + avx2_loop3() + avx2_do_last_block() + avx2_only_one_block() + done_hash() +} + +func avx2_loop0() { + Label("avx2_loop0") + Comment("at each iteration works with one block (512 bit)") + VMOVDQU(Mem{Base: INP}.Offset(0*32), XTMP0) + VMOVDQU(Mem{Base: INP}.Offset(1*32), XTMP1) + VMOVDQU(Mem{Base: INP}.Offset(2*32), XTMP2) + VMOVDQU(Mem{Base: INP}.Offset(3*32), XTMP3) + + flip_mask := flip_mask_DATA() + + VMOVDQU(flip_mask, BYTE_FLIP_MASK) + + Comment("Apply Byte Flip Mask: LE -> BE") + VPSHUFB(BYTE_FLIP_MASK, XTMP0, XTMP0) + VPSHUFB(BYTE_FLIP_MASK, XTMP1, XTMP1) + VPSHUFB(BYTE_FLIP_MASK, XTMP2, XTMP2) + VPSHUFB(BYTE_FLIP_MASK, XTMP3, XTMP3) + + Comment("Transpose data into high/low parts") + VPERM2I128(Imm(0x20), XTMP2, XTMP0, XDWORD0) // w3, w2, w1, w0 + VPERM2I128(Imm(0x31), XTMP2, XTMP0, XDWORD1) // w7, w6, w5, w4 + VPERM2I128(Imm(0x20), XTMP3, XTMP1, XDWORD2) // w11, w10, w9, w8 + VPERM2I128(Imm(0x31), XTMP3, XTMP1, XDWORD3) // w15, w14, w13, w12 + + K256 := K256_DATA() + LEAQ(K256, TBL) // Loading address of table with round-specific constants +} + +func avx2_last_block_enter() { + Label("avx2_last_block_enter") + ADDQ(Imm(64), INP) + MOVQ(INP, Mem{Base: SP}.Offset(_INP)) + XORQ(SRND, SRND) +} + +// for w0 - w47 +func avx2_loop1() { + Label("avx2_loop1") + + Comment("Do 4 rounds and scheduling") + VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((0 * 32)), XDWORD0, XFER) + VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32)) + roundAndSchedN0(_XFER+0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + roundAndSchedN1(_XFER+0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + roundAndSchedN2(_XFER+0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + roundAndSchedN3(_XFER+0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + + Comment("Do 4 rounds and scheduling") + VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER) + VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32)) + roundAndSchedN0(_XFER+1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + roundAndSchedN1(_XFER+1*32, 
d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + roundAndSchedN2(_XFER+1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + roundAndSchedN3(_XFER+1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + + Comment("Do 4 rounds and scheduling") + VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((2 * 32)), XDWORD2, XFER) + VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+2*32)) + roundAndSchedN0(_XFER+2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + roundAndSchedN1(_XFER+2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + roundAndSchedN2(_XFER+2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + roundAndSchedN3(_XFER+2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + + Comment("Do 4 rounds and scheduling") + VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((3 * 32)), XDWORD3, XFER) + VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+3*32)) + roundAndSchedN0(_XFER+3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + roundAndSchedN1(_XFER+3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + roundAndSchedN2(_XFER+3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + roundAndSchedN3(_XFER+3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + + ADDQ(Imm(4*32), SRND) + CMPQ(SRND, U32(3*4*32)) + JB(LabelRef("avx2_loop1")) +} + +// w48 - w63 processed with no scheduling (last 16 rounds) +func avx2_loop2() { + Label("avx2_loop2") + VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(0*32), XDWORD0, XFER) + VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32)) + doRoundN0(_XFER+0*32, a, b, c, d, e, f, g, h, h) + doRoundN1(_XFER+0*32, h, a, b, c, d, e, f, g, h) + doRoundN2(_XFER+0*32, g, h, a, b, c, d, e, f, g) + doRoundN3(_XFER+0*32, f, g, h, a, b, c, d, e, f) + + VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER) + VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32)) + doRoundN0(_XFER+1*32, e, f, g, h, a, b, c, d, e) + doRoundN1(_XFER+1*32, d, e, f, g, h, a, b, c, d) + doRoundN2(_XFER+1*32, c, d, e, f, g, h, a, b, c) + doRoundN3(_XFER+1*32, b, c, d, e, f, g, h, a, b) + + ADDQ(Imm(2*32), SRND) + + VMOVDQU(XDWORD2, XDWORD0) + VMOVDQU(XDWORD3, XDWORD1) + + CMPQ(SRND, U32(4*4*32)) + JB(LabelRef("avx2_loop2")) + + Load(Param("dig"), CTX) // d.h[8] + MOVQ(Mem{Base: SP}.Offset(_INP), INP) + + registers := []GPPhysical{a, b, c, d, e, f, g, h} + for i, reg := range registers { + addm(Mem{Base: CTX}.Offset(i*4), reg) + } + + CMPQ(Mem{Base: SP}.Offset(_INP_END), INP) + JB(LabelRef("done_hash")) + + XORQ(SRND, SRND) +} + +// Do second block using previously scheduled results +func avx2_loop3() { + Label("avx2_loop3") + doRoundN0(_XFER+0*32+16, a, b, c, d, e, f, g, h, a) + doRoundN1(_XFER+0*32+16, h, a, b, c, d, e, f, g, h) + doRoundN2(_XFER+0*32+16, g, h, a, b, c, d, e, f, g) + doRoundN3(_XFER+0*32+16, f, g, h, a, b, c, d, e, f) + + doRoundN0(_XFER+1*32+16, e, f, g, h, a, b, c, d, e) + doRoundN1(_XFER+1*32+16, d, e, f, g, h, a, b, c, d) + doRoundN2(_XFER+1*32+16, c, d, e, f, g, h, a, b, c) + doRoundN3(_XFER+1*32+16, b, c, d, e, f, g, h, a, b) + + ADDQ(Imm(2*32), SRND) + CMPQ(SRND, U32(4*4*32)) + JB(LabelRef("avx2_loop3")) + + Load(Param("dig"), CTX) // d.h[8] + MOVQ(Mem{Base: SP}.Offset(_INP), INP) + ADDQ(Imm(64), INP) + + registers := []GPPhysical{a, b, c, d, e, f, g, h} + for i, reg := range registers { + addm(Mem{Base: CTX}.Offset(i*4), reg) + } + + 
CMPQ(Mem{Base: SP}.Offset(_INP_END), INP) + JA(LabelRef("avx2_loop0")) + JB(LabelRef("done_hash")) +} + +func avx2_do_last_block() { + Label("avx2_do_last_block") + VMOVDQU(Mem{Base: INP}.Offset(0), XWORD0) + VMOVDQU(Mem{Base: INP}.Offset(16), XWORD1) + VMOVDQU(Mem{Base: INP}.Offset(32), XWORD2) + VMOVDQU(Mem{Base: INP}.Offset(48), XWORD3) + + flip_mask := flip_mask_DATA() + VMOVDQU(flip_mask, BYTE_FLIP_MASK) + + VPSHUFB(X_BYTE_FLIP_MASK, XWORD0, XWORD0) + VPSHUFB(X_BYTE_FLIP_MASK, XWORD1, XWORD1) + VPSHUFB(X_BYTE_FLIP_MASK, XWORD2, XWORD2) + VPSHUFB(X_BYTE_FLIP_MASK, XWORD3, XWORD3) + + K256 := K256_DATA() + LEAQ(K256, TBL) + + JMP(LabelRef("avx2_last_block_enter")) +} + +// Load initial digest +func avx2_only_one_block() { + Label("avx2_only_one_block") + registers := []GPPhysical{a, b, c, d, e, f, g, h} + for i, reg := range registers { + MOVL(Mem{Base: CTX}.Offset(i*4), reg) + } + JMP(LabelRef("avx2_do_last_block")) +} + +func done_hash() { + Label("done_hash") + VZEROUPPER() + RET() +} + +func sha_ni() { + Label("sha_ni") + Load(Param("dig"), digestPtr) // init digest hash vector H0, H1,..., H7 pointer + Load(Param("p").Base(), dataPtr) // init input data base pointer + Load(Param("p").Len(), numBytes) // get number of input bytes to hash + SHRQ(Imm(6), numBytes) // force modulo 64 input buffer length + SHLQ(Imm(6), numBytes) + CMPQ(numBytes, Imm(0)) // exit early for zero-length input buffer + JEQ(LabelRef("done")) + ADDQ(dataPtr, numBytes) // point numBytes to end of input buffer + VMOVDQU(Mem{Base: digestPtr}.Offset(0*16), state0) // load initial hash values and reorder + VMOVDQU(Mem{Base: digestPtr}.Offset(1*16), state1) // DCBA, HGFE -> ABEF, CDGH + PSHUFD(Imm(0xb1), state0, state0) // CDAB + PSHUFD(Imm(0x1b), state1, state1) // EFGH + VMOVDQA(state0, m4) + PALIGNR(Imm(8), state1, state0) // ABEF + PBLENDW(Imm(0xf0), m4, state1) // CDGH + flip_mask := flip_mask_DATA() + VMOVDQA(flip_mask, shufMask) + LEAQ(K256_DATA(), sha256Constants) + + roundLoop() + done() +} + +func roundLoop() { + Label("roundLoop") + Comment("save hash values for addition after rounds") + VMOVDQA(state0, abefSave) + VMOVDQA(state1, cdghSave) + + Comment("do rounds 0-59") + rounds0to11(m0, nil, 0, nop) // 0-3 + rounds0to11(m1, m0, 1, sha256msg1) // 4-7 + rounds0to11(m2, m1, 2, sha256msg1) // 8-11 + VMOVDQU(Mem{Base: dataPtr}.Offset(3*16), msg) + PSHUFB(shufMask, msg) + rounds12to59(m3, 3, m2, m0, sha256msg1, vmovrev) // 12-15 + rounds12to59(m0, 4, m3, m1, sha256msg1, vmov) // 16-19 + rounds12to59(m1, 5, m0, m2, sha256msg1, vmov) // 20-23 + rounds12to59(m2, 6, m1, m3, sha256msg1, vmov) // 24-27 + rounds12to59(m3, 7, m2, m0, sha256msg1, vmov) // 28-31 + rounds12to59(m0, 8, m3, m1, sha256msg1, vmov) // 32-35 + rounds12to59(m1, 9, m0, m2, sha256msg1, vmov) // 36-39 + rounds12to59(m2, 10, m1, m3, sha256msg1, vmov) // 40-43 + rounds12to59(m3, 11, m2, m0, sha256msg1, vmov) // 44-47 + rounds12to59(m0, 12, m3, m1, sha256msg1, vmov) // 48-51 + rounds12to59(m1, 13, m0, m2, nop, vmov) // 52-55 + rounds12to59(m2, 14, m1, m3, nop, vmov) // 56-59 + + Comment("do rounds 60-63") + VMOVDQA(m3, msg) + PADDD(Mem{Base: sha256Constants}.Offset(15*32), msg) + SHA256RNDS2(msg, state0, state1) + PSHUFD(Imm(0x0e), msg, msg) + SHA256RNDS2(msg, state1, state0) + + Comment("add current hash values with previously saved") + PADDD(abefSave, state0) + PADDD(cdghSave, state1) + + Comment("advance data pointer; loop until buffer empty") + ADDQ(Imm(64), dataPtr) + CMPQ(numBytes, dataPtr) + JNE(LabelRef("roundLoop")) + + Comment("write hash 
values back in the correct order") + PSHUFD(Imm(0x1b), state0, state0) + PSHUFD(Imm(0xb1), state1, state1) + VMOVDQA(state0, m4) + PBLENDW(Imm(0xf0), state1, state0) + PALIGNR(Imm(8), m4, state1) + VMOVDQU(state0, Mem{Base: digestPtr}.Offset(0*16)) + VMOVDQU(state1, Mem{Base: digestPtr}.Offset(1*16)) +} + +func done() { + Label("done") + RET() +} + +/**~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~**/ + +// Pointers for memoizing Data section symbols +var flip_maskPtr, shuff_00BAPtr, shuff_DC00Ptr, K256Ptr *Mem + +// shuffle byte order from LE to BE +func flip_mask_DATA() Mem { + if flip_maskPtr != nil { + return *flip_maskPtr + } + + flip_mask := GLOBL("flip_mask", RODATA) + flip_maskPtr = &flip_mask + + DATA(0x00, U64(0x0405060700010203)) + DATA(0x08, U64(0x0c0d0e0f08090a0b)) + DATA(0x10, U64(0x0405060700010203)) + DATA(0x18, U64(0x0c0d0e0f08090a0b)) + return flip_mask +} + +// shuffle xBxA -> 00BA +func shuff_00BA_DATA() Mem { + if shuff_00BAPtr != nil { + return *shuff_00BAPtr + } + + shuff_00BA := GLOBL("shuff_00BA", RODATA) + shuff_00BAPtr = &shuff_00BA + + DATA(0x00, U64(0x0b0a090803020100)) + DATA(0x08, U64(0xFFFFFFFFFFFFFFFF)) + DATA(0x10, U64(0x0b0a090803020100)) + DATA(0x18, U64(0xFFFFFFFFFFFFFFFF)) + return shuff_00BA +} + +// shuffle xDxC -> DC00 +func shuff_DC00_DATA() Mem { + if shuff_DC00Ptr != nil { + return *shuff_DC00Ptr + } + + shuff_DC00 := GLOBL("shuff_DC00", RODATA) + shuff_DC00Ptr = &shuff_DC00 + + DATA(0x00, U64(0xFFFFFFFFFFFFFFFF)) + DATA(0x08, U64(0x0b0a090803020100)) + DATA(0x10, U64(0xFFFFFFFFFFFFFFFF)) + DATA(0x18, U64(0x0b0a090803020100)) + return shuff_DC00 +} + +var _K = []uint32{ + 0x428a2f98, + 0x71374491, + 0xb5c0fbcf, + 0xe9b5dba5, + 0x3956c25b, + 0x59f111f1, + 0x923f82a4, + 0xab1c5ed5, + 0xd807aa98, + 0x12835b01, + 0x243185be, + 0x550c7dc3, + 0x72be5d74, + 0x80deb1fe, + 0x9bdc06a7, + 0xc19bf174, + 0xe49b69c1, + 0xefbe4786, + 0x0fc19dc6, + 0x240ca1cc, + 0x2de92c6f, + 0x4a7484aa, + 0x5cb0a9dc, + 0x76f988da, + 0x983e5152, + 0xa831c66d, + 0xb00327c8, + 0xbf597fc7, + 0xc6e00bf3, + 0xd5a79147, + 0x06ca6351, + 0x14292967, + 0x27b70a85, + 0x2e1b2138, + 0x4d2c6dfc, + 0x53380d13, + 0x650a7354, + 0x766a0abb, + 0x81c2c92e, + 0x92722c85, + 0xa2bfe8a1, + 0xa81a664b, + 0xc24b8b70, + 0xc76c51a3, + 0xd192e819, + 0xd6990624, + 0xf40e3585, + 0x106aa070, + 0x19a4c116, + 0x1e376c08, + 0x2748774c, + 0x34b0bcb5, + 0x391c0cb3, + 0x4ed8aa4a, + 0x5b9cca4f, + 0x682e6ff3, + 0x748f82ee, + 0x78a5636f, + 0x84c87814, + 0x8cc70208, + 0x90befffa, + 0xa4506ceb, + 0xbef9a3f7, + 0xc67178f2, +} + +// Round specific constants +func K256_DATA() Mem { + if K256Ptr != nil { + return *K256Ptr + } + + K256 := GLOBL("K256", NOPTR+RODATA) + K256Ptr = &K256 + + offset_idx := 0 + + for i := 0; i < len(_K); i += 4 { + DATA((offset_idx+0)*4, U32(_K[i+0])) // k1 + DATA((offset_idx+1)*4, U32(_K[i+1])) // k2 + DATA((offset_idx+2)*4, U32(_K[i+2])) // k3 + DATA((offset_idx+3)*4, U32(_K[i+3])) // k4 + + DATA((offset_idx+4)*4, U32(_K[i+0])) // k1 + DATA((offset_idx+5)*4, U32(_K[i+1])) // k2 + DATA((offset_idx+6)*4, U32(_K[i+2])) // k3 + DATA((offset_idx+7)*4, U32(_K[i+3])) // k4 + offset_idx += 8 + } + return K256 +} diff --git a/src/crypto/sha256/sha256block_amd64.s b/src/crypto/sha256/sha256block_amd64.s index 2559f659a2..700a4eff97 100644 --- a/src/crypto/sha256/sha256block_amd64.s +++ b/src/crypto/sha256/sha256block_amd64.s @@ -1,1175 +1,4973 @@ -// Copyright 2013 The Go Authors. All rights reserved. 
-// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run sha256block_amd64_asm.go -out ../sha256block_amd64.s -pkg sha256. DO NOT EDIT. //go:build !purego #include "textflag.h" -// SHA256 block routine. See sha256block.go for Go equivalent. -// -// The algorithm is detailed in FIPS 180-4: -// -// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf - -// The avx2-version is described in an Intel White-Paper: -// "Fast SHA-256 Implementations on Intel Architecture Processors" -// To find it, surf to http://www.intel.com/p/en_US/embedded -// and search for that title. -// AVX2 version by Intel, same algorithm as code in Linux kernel: -// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S -// by -// James Guilford -// Kirk Yap -// Tim Chen - -// Wt = Mt; for 0 <= t <= 15 -// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63 -// -// a = H0 -// b = H1 -// c = H2 -// d = H3 -// e = H4 -// f = H5 -// g = H6 -// h = H7 -// -// for t = 0 to 63 { -// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt -// T2 = BIGSIGMA0(a) + Maj(a,b,c) -// h = g -// g = f -// f = e -// e = d + T1 -// d = c -// c = b -// b = a -// a = T1 + T2 -// } -// -// H0 = a + H0 -// H1 = b + H1 -// H2 = c + H2 -// H3 = d + H3 -// H4 = e + H4 -// H5 = f + H5 -// H6 = g + H6 -// H7 = h + H7 - -// Wt = Mt; for 0 <= t <= 15 -#define MSGSCHEDULE0(index) \ - MOVL (index*4)(SI), AX; \ - BSWAPL AX; \ - MOVL AX, (index*4)(BP) - -// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63 -// SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x) -// SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x) -#define MSGSCHEDULE1(index) \ - MOVL ((index-2)*4)(BP), AX; \ - MOVL AX, CX; \ - RORL $17, AX; \ - MOVL CX, DX; \ - RORL $19, CX; \ - SHRL $10, DX; \ - MOVL ((index-15)*4)(BP), BX; \ - XORL CX, AX; \ - MOVL BX, CX; \ - XORL DX, AX; \ - RORL $7, BX; \ - MOVL CX, DX; \ - SHRL $3, DX; \ - RORL $18, CX; \ - ADDL ((index-7)*4)(BP), AX; \ - XORL CX, BX; \ - XORL DX, BX; \ - ADDL ((index-16)*4)(BP), BX; \ - ADDL BX, AX; \ - MOVL AX, ((index)*4)(BP) - -// Calculate T1 in AX - uses AX, CX and DX registers. -// h is also used as an accumulator. Wt is passed in AX. -// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt -// BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x) -// Ch(x, y, z) = (x AND y) XOR (NOT x AND z) -#define SHA256T1(const, e, f, g, h) \ - ADDL AX, h; \ - MOVL e, AX; \ - ADDL $const, h; \ - MOVL e, CX; \ - RORL $6, AX; \ - MOVL e, DX; \ - RORL $11, CX; \ - XORL CX, AX; \ - MOVL e, CX; \ - RORL $25, DX; \ - ANDL f, CX; \ - XORL AX, DX; \ - MOVL e, AX; \ - NOTL AX; \ - ADDL DX, h; \ - ANDL g, AX; \ - XORL CX, AX; \ - ADDL h, AX - -// Calculate T2 in BX - uses BX, CX, DX and DI registers. -// T2 = BIGSIGMA0(a) + Maj(a, b, c) -// BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x) -// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) -#define SHA256T2(a, b, c) \ - MOVL a, DI; \ - MOVL c, BX; \ - RORL $2, DI; \ - MOVL a, DX; \ - ANDL b, BX; \ - RORL $13, DX; \ - MOVL a, CX; \ - ANDL c, CX; \ - XORL DX, DI; \ - XORL CX, BX; \ - MOVL a, DX; \ - MOVL b, CX; \ - RORL $22, DX; \ - ANDL a, CX; \ - XORL CX, BX; \ - XORL DX, DI; \ - ADDL DI, BX - -// Calculate T1 and T2, then e = d + T1 and a = T1 + T2. -// The values for e and a are stored in d and h, ready for rotation. 
-#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \ - SHA256T1(const, e, f, g, h); \ - SHA256T2(a, b, c); \ - MOVL BX, h; \ - ADDL AX, d; \ - ADDL AX, h - -#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \ - MSGSCHEDULE0(index); \ - SHA256ROUND(index, const, a, b, c, d, e, f, g, h) - -#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \ - MSGSCHEDULE1(index); \ - SHA256ROUND(index, const, a, b, c, d, e, f, g, h) - - -// Definitions for AVX2 version - -// addm (mem), reg -// Add reg to mem using reg-mem add and store -#define addm(P1, P2) \ - ADDL P2, P1; \ - MOVL P1, P2 - -#define XDWORD0 Y4 -#define XDWORD1 Y5 -#define XDWORD2 Y6 -#define XDWORD3 Y7 - -#define XWORD0 X4 -#define XWORD1 X5 -#define XWORD2 X6 -#define XWORD3 X7 - -#define XTMP0 Y0 -#define XTMP1 Y1 -#define XTMP2 Y2 -#define XTMP3 Y3 -#define XTMP4 Y8 -#define XTMP5 Y11 - -#define XFER Y9 - -#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE -#define X_BYTE_FLIP_MASK X13 - -#define NUM_BYTES DX -#define INP DI - -#define CTX SI // Beginning of digest in memory (a, b, c, ... , h) - -#define a AX -#define b BX -#define c CX -#define d R8 -#define e DX -#define f R9 -#define g R10 -#define h R11 - -#define old_h R11 - -#define TBL BP - -#define SRND SI // SRND is same register as CTX - -#define T1 R12 - -#define y0 R13 -#define y1 R14 -#define y2 R15 -#define y3 DI - -// Offsets -#define XFER_SIZE 2*64*4 -#define INP_END_SIZE 8 -#define INP_SIZE 8 - -#define _XFER 0 -#define _INP_END _XFER + XFER_SIZE -#define _INP _INP_END + INP_END_SIZE -#define STACK_SIZE _INP + INP_SIZE - -#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \ - ; \ // ############################# RND N + 0 ############################// - MOVL a, y3; \ // y3 = a // MAJA - RORXL $25, e, y0; \ // y0 = e >> 25 // S1A - RORXL $11, e, y1; \ // y1 = e >> 11 // S1B - ; \ - ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // disp = k + w - ORL c, y3; \ // y3 = a|c // MAJA - VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7] - MOVL f, y2; \ // y2 = f // CH - RORXL $13, a, T1; \ // T1 = a >> 13 // S0B - ; \ - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1 - XORL g, y2; \ // y2 = f^g // CH - VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16] // y1 = (e >> 6) // S1 - RORXL $6, e, y1; \ // y1 = (e >> 6) // S1 - ; \ - ANDL e, y2; \ // y2 = (f^g)&e // CH - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1 - RORXL $22, a, y1; \ // y1 = a >> 22 // S0A - ADDL h, d; \ // d = k + w + h + d // -- - ; \ - ANDL b, y3; \ // y3 = (a|c)&b // MAJA - VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15] - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0 - RORXL $2, a, T1; \ // T1 = (a >> 2) // S0 - ; \ - XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH - VPSRLD $7, XTMP1, XTMP2; \ - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0 - MOVL a, T1; \ // T1 = a // MAJB - ANDL c, T1; \ // T1 = a&c // MAJB - ; \ - ADDL y0, y2; \ // y2 = S1 + CH // -- - VPSLLD $(32-7), XTMP1, XTMP3; \ - ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ - ADDL y1, h; \ // h = k + w + h + S0 // -- - ; \ - ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // -- - VPOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 - ; \ - VPSRLD $18, XTMP1, XTMP2; \ - ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// -- - ADDL y3, h // h = t1 + S0 + MAJ // -- - -#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \ - ; \ // ################################### RND N + 1 
############################ - ; \ - MOVL a, y3; \ // y3 = a // MAJA - RORXL $25, e, y0; \ // y0 = e >> 25 // S1A - RORXL $11, e, y1; \ // y1 = e >> 11 // S1B - ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // -- - ORL c, y3; \ // y3 = a|c // MAJA - ; \ - VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3 - MOVL f, y2; \ // y2 = f // CH - RORXL $13, a, T1; \ // T1 = a >> 13 // S0B - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1 - XORL g, y2; \ // y2 = f^g // CH - ; \ - RORXL $6, e, y1; \ // y1 = (e >> 6) // S1 - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1 - RORXL $22, a, y1; \ // y1 = a >> 22 // S0A - ANDL e, y2; \ // y2 = (f^g)&e // CH - ADDL h, d; \ // d = k + w + h + d // -- - ; \ - VPSLLD $(32-18), XTMP1, XTMP1; \ - ANDL b, y3; \ // y3 = (a|c)&b // MAJA - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0 - ; \ - VPXOR XTMP1, XTMP3, XTMP3; \ - RORXL $2, a, T1; \ // T1 = (a >> 2) // S0 - XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH - ; \ - VPXOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0 - MOVL a, T1; \ // T1 = a // MAJB - ANDL c, T1; \ // T1 = a&c // MAJB - ADDL y0, y2; \ // y2 = S1 + CH // -- - ; \ - VPXOR XTMP4, XTMP3, XTMP1; \ // XTMP1 = s0 - VPSHUFD $0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA} - ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ - ADDL y1, h; \ // h = k + w + h + S0 // -- - ; \ - VPADDD XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0 - ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // -- - ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// -- - ADDL y3, h; \ // h = t1 + S0 + MAJ // -- - ; \ - VPSRLD $10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA} - -#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \ - ; \ // ################################### RND N + 2 ############################ - ; \ - MOVL a, y3; \ // y3 = a // MAJA - RORXL $25, e, y0; \ // y0 = e >> 25 // S1A - ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // -- - ; \ - VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA} - RORXL $11, e, y1; \ // y1 = e >> 11 // S1B - ORL c, y3; \ // y3 = a|c // MAJA - MOVL f, y2; \ // y2 = f // CH - XORL g, y2; \ // y2 = f^g // CH - ; \ - RORXL $13, a, T1; \ // T1 = a >> 13 // S0B - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1 - VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA} - ANDL e, y2; \ // y2 = (f^g)&e // CH - ; \ - RORXL $6, e, y1; \ // y1 = (e >> 6) // S1 - VPXOR XTMP3, XTMP2, XTMP2; \ - ADDL h, d; \ // d = k + w + h + d // -- - ANDL b, y3; \ // y3 = (a|c)&b // MAJA - ; \ - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1 - RORXL $22, a, y1; \ // y1 = a >> 22 // S0A - VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA} - XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH - ; \ - VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA} - ; \ - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0 - RORXL $2, a, T1; \ // T1 = (a >> 2) // S0 - VPADDD XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]} - ; \ - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0 - MOVL a, T1; \ // T1 = a // MAJB - ANDL c, T1; \ // T1 = a&c // MAJB - ADDL y0, y2; \ // y2 = S1 + CH // -- - VPSHUFD $80, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC} - ; \ - ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ - ADDL y1, h; \ // h = k + w + h + S0 // -- - ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // -- - ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// -- - ; \ - ADDL y3, h // h = t1 + S0 + 
MAJ // -- - -#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \ - ; \ // ################################### RND N + 3 ############################ - ; \ - MOVL a, y3; \ // y3 = a // MAJA - RORXL $25, e, y0; \ // y0 = e >> 25 // S1A - RORXL $11, e, y1; \ // y1 = e >> 11 // S1B - ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // -- - ORL c, y3; \ // y3 = a|c // MAJA - ; \ - VPSRLD $10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC} - MOVL f, y2; \ // y2 = f // CH - RORXL $13, a, T1; \ // T1 = a >> 13 // S0B - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1 - XORL g, y2; \ // y2 = f^g // CH - ; \ - VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC} - RORXL $6, e, y1; \ // y1 = (e >> 6) // S1 - ANDL e, y2; \ // y2 = (f^g)&e // CH - ADDL h, d; \ // d = k + w + h + d // -- - ANDL b, y3; \ // y3 = (a|c)&b // MAJA - ; \ - VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC} - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1 - XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH - ; \ - VPXOR XTMP3, XTMP2, XTMP2; \ - RORXL $22, a, y1; \ // y1 = a >> 22 // S0A - ADDL y0, y2; \ // y2 = S1 + CH // -- - ; \ - VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC} - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0 - ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // -- - ; \ - RORXL $2, a, T1; \ // T1 = (a >> 2) // S0 - ; \ - VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00} - ; \ - VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]} - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0 - MOVL a, T1; \ // T1 = a // MAJB - ANDL c, T1; \ // T1 = a&c // MAJB - ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ - ; \ - ADDL y1, h; \ // h = k + w + h + S0 // -- - ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// -- - ADDL y3, h // h = t1 + S0 + MAJ // -- - -#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \ - ; \ // ################################### RND N + 0 ########################### - MOVL f, y2; \ // y2 = f // CH - RORXL $25, e, y0; \ // y0 = e >> 25 // S1A - RORXL $11, e, y1; \ // y1 = e >> 11 // S1B - XORL g, y2; \ // y2 = f^g // CH - ; \ - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1 - RORXL $6, e, y1; \ // y1 = (e >> 6) // S1 - ANDL e, y2; \ // y2 = (f^g)&e // CH - ; \ - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1 - RORXL $13, a, T1; \ // T1 = a >> 13 // S0B - XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH - RORXL $22, a, y1; \ // y1 = a >> 22 // S0A - MOVL a, y3; \ // y3 = a // MAJA - ; \ - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0 - RORXL $2, a, T1; \ // T1 = (a >> 2) // S0 - ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // -- - ORL c, y3; \ // y3 = a|c // MAJA - ; \ - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0 - MOVL a, T1; \ // T1 = a // MAJB - ANDL b, y3; \ // y3 = (a|c)&b // MAJA - ANDL c, T1; \ // T1 = a&c // MAJB - ADDL y0, y2; \ // y2 = S1 + CH // -- - ; \ - ADDL h, d; \ // d = k + w + h + d // -- - ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ - ADDL y1, h; \ // h = k + w + h + S0 // -- - ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // -- - -#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \ - ; \ // ################################### RND N + 1 ########################### - ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // -- - MOVL f, y2; \ // y2 = f // CH - RORXL $25, e, y0; \ // y0 = e >> 25 // S1A - RORXL $11, e, y1; \ // y1 = e >> 11 // S1B - XORL g, y2; \ // y2 = f^g // CH - ; \ - XORL y1, y0; \ // y0 = 
(e>>25) ^ (e>>11) // S1 - RORXL $6, e, y1; \ // y1 = (e >> 6) // S1 - ANDL e, y2; \ // y2 = (f^g)&e // CH - ADDL y3, old_h; \ // h = t1 + S0 + MAJ // -- - ; \ - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1 - RORXL $13, a, T1; \ // T1 = a >> 13 // S0B - XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH - RORXL $22, a, y1; \ // y1 = a >> 22 // S0A - MOVL a, y3; \ // y3 = a // MAJA - ; \ - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0 - RORXL $2, a, T1; \ // T1 = (a >> 2) // S0 - ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // -- - ORL c, y3; \ // y3 = a|c // MAJA - ; \ - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0 - MOVL a, T1; \ // T1 = a // MAJB - ANDL b, y3; \ // y3 = (a|c)&b // MAJA - ANDL c, T1; \ // T1 = a&c // MAJB - ADDL y0, y2; \ // y2 = S1 + CH // -- - ; \ - ADDL h, d; \ // d = k + w + h + d // -- - ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ - ADDL y1, h; \ // h = k + w + h + S0 // -- - ; \ - ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // -- - -#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \ - ; \ // ################################### RND N + 2 ############################## - ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// -- - MOVL f, y2; \ // y2 = f // CH - RORXL $25, e, y0; \ // y0 = e >> 25 // S1A - RORXL $11, e, y1; \ // y1 = e >> 11 // S1B - XORL g, y2; \ // y2 = f^g // CH - ; \ - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1 - RORXL $6, e, y1; \ // y1 = (e >> 6) // S1 - ANDL e, y2; \ // y2 = (f^g)&e // CH - ADDL y3, old_h; \ // h = t1 + S0 + MAJ // -- - ; \ - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1 - RORXL $13, a, T1; \ // T1 = a >> 13 // S0B - XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH - RORXL $22, a, y1; \ // y1 = a >> 22 // S0A - MOVL a, y3; \ // y3 = a // MAJA - ; \ - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0 - RORXL $2, a, T1; \ // T1 = (a >> 2) // S0 - ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // -- - ORL c, y3; \ // y3 = a|c // MAJA - ; \ - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0 - MOVL a, T1; \ // T1 = a // MAJB - ANDL b, y3; \ // y3 = (a|c)&b // MAJA - ANDL c, T1; \ // T1 = a&c // MAJB - ADDL y0, y2; \ // y2 = S1 + CH // -- - ; \ - ADDL h, d; \ // d = k + w + h + d // -- - ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ - ADDL y1, h; \ // h = k + w + h + S0 // -- - ; \ - ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // -- - -#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \ - ; \ // ################################### RND N + 3 ########################### - ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// -- - MOVL f, y2; \ // y2 = f // CH - RORXL $25, e, y0; \ // y0 = e >> 25 // S1A - RORXL $11, e, y1; \ // y1 = e >> 11 // S1B - XORL g, y2; \ // y2 = f^g // CH - ; \ - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1 - RORXL $6, e, y1; \ // y1 = (e >> 6) // S1 - ANDL e, y2; \ // y2 = (f^g)&e // CH - ADDL y3, old_h; \ // h = t1 + S0 + MAJ // -- - ; \ - XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1 - RORXL $13, a, T1; \ // T1 = a >> 13 // S0B - XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH - RORXL $22, a, y1; \ // y1 = a >> 22 // S0A - MOVL a, y3; \ // y3 = a // MAJA - ; \ - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0 - RORXL $2, a, T1; \ // T1 = (a >> 2) // S0 - ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // -- - ORL c, y3; \ // y3 = a|c // MAJA - ; \ - XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0 - MOVL a, T1; \ // T1 = a // MAJB - ANDL b, y3; \ // y3 = (a|c)&b // MAJA - ANDL c, T1; \ // 
T1 = a&c // MAJB - ADDL y0, y2; \ // y2 = S1 + CH // -- - ; \ - ADDL h, d; \ // d = k + w + h + d // -- - ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ - ADDL y1, h; \ // h = k + w + h + S0 // -- - ; \ - ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // -- - ; \ - ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// -- - ; \ - ADDL y3, h // h = t1 + S0 + MAJ // -- - -// Definitions for sha-ni version -// -// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2 -// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version -// -// Reference -// S. Gulley, et al, "New Instructions Supporting the Secure Hash -// Algorithm on Intel® Architecture Processors", July 2013 -// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html -// - -#define digestPtr DI // input/output, base pointer to digest hash vector H0, H1, ..., H7 -#define dataPtr SI // input, base pointer to first input data block -#define numBytes DX // input, number of input bytes to be processed -#define sha256Constants AX // round contents from K256 table, indexed by round number x 32 -#define msg X0 // input data -#define state0 X1 // round intermediates and outputs -#define state1 X2 -#define m0 X3 // m0, m1,... m4 -- round message temps -#define m1 X4 -#define m2 X5 -#define m3 X6 -#define m4 X7 -#define shufMask X8 // input data endian conversion control mask -#define abefSave X9 // digest hash vector inter-block buffer abef -#define cdghSave X10 // digest hash vector inter-block buffer cdgh - -#define nop(m,a) // nop instead of final SHA256MSG1 for first and last few rounds - -#define sha256msg1(m,a) \ // final SHA256MSG1 for middle rounds that require it - SHA256MSG1 m, a - -#define vmov(a,b) \ // msg copy for all but rounds 12-15 - VMOVDQA a, b - -#define vmovrev(a,b) \ // reverse copy for rounds 12-15 - VMOVDQA b, a - -// sha rounds 0 to 11 -// identical with the exception of the final msg op -// which is replaced with a nop for rounds where it is not needed -// refer to Gulley, et al for more information -#define rounds0to11(m,a,c,sha256Msg1) \ - VMOVDQU c*16(dataPtr), msg \ - PSHUFB shufMask, msg \ - VMOVDQA msg, m \ - PADDD (c*32)(sha256Constants), msg \ - SHA256RNDS2 msg, state0, state1 \ - PSHUFD $0x0e, msg, msg \ - SHA256RNDS2 msg, state1, state0 \ - sha256Msg1 (m,a) - -// sha rounds 12 to 59 -// identical with the exception of the final msg op -// and the reverse copy(m,msg) in round 12 which is required -// after the last data load -// refer to Gulley, et al for more information -#define rounds12to59(m,c,a,t,sha256Msg1,movop) \ - movop (m,msg) \ - PADDD (c*32)(sha256Constants), msg \ - SHA256RNDS2 msg, state0, state1 \ - VMOVDQA m, m4 \ - PALIGNR $4, a, m4 \ - PADDD m4, t \ - SHA256MSG2 m, t \ - PSHUFD $0x0e, msg, msg \ - SHA256RNDS2 msg, state1, state0 \ - sha256Msg1 (m,a) - -TEXT ·block(SB), 0, $536-32 - CMPB ·useSHA(SB), $1 - JE sha_ni - CMPB ·useAVX2(SB), $1 - JE avx2 - +// func block(dig *digest, p []byte) +// Requires: AVX, AVX2, BMI2, SHA, SSE2, SSE4.1, SSSE3 +TEXT ·block(SB), $536-32 + CMPB ·useSHA+0(SB), $0x01 + JE sha_ni + CMPB ·useAVX2+0(SB), $0x01 + JE avx2 MOVQ p_base+8(FP), SI MOVQ p_len+16(FP), DX - SHRQ $6, DX - SHLQ $6, DX - + SHRQ $0x06, DX + SHLQ $0x06, DX LEAQ (SI)(DX*1), DI MOVQ DI, 256(SP) CMPQ SI, DI JEQ end - MOVQ dig+0(FP), BP - MOVL (0*4)(BP), R8 // a = H0 - MOVL (1*4)(BP), R9 // b = H1 - MOVL (2*4)(BP), R10 // c = H2 - MOVL (3*4)(BP), R11 // d = H3 - MOVL 
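The generated prologue above keeps the same branch structure as the hand-written version: test ·useSHA, then ·useAVX2, and fall through to the plain AMD64 path otherwise. Purely as an illustration of that control flow (useSHA and useAVX2 are the flags referenced by the assembly; the per-path helper functions here are hypothetical), a Go-level equivalent of the dispatch is:

package main

// Stand-ins for the CPU feature flags the assembly tests via ·useSHA and ·useAVX2.
var useSHA, useAVX2 bool

// Hypothetical helpers, one per code path selected by TEXT ·block(SB).
func blockSHANI(h *[8]uint32, p []byte)   {}
func blockAVX2(h *[8]uint32, p []byte)    {}
func blockGeneric(h *[8]uint32, p []byte) {}

// block mirrors the branch structure of the generated prologue.
func block(h *[8]uint32, p []byte) {
	switch {
	case useSHA:
		blockSHANI(h, p)
	case useAVX2:
		blockAVX2(h, p)
	default:
		blockGeneric(h, p)
	}
}

func main() {
	var h [8]uint32
	block(&h, make([]byte, 64))
}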
(4*4)(BP), R12 // e = H4 - MOVL (5*4)(BP), R13 // f = H5 - MOVL (6*4)(BP), R14 // g = H6 - MOVL (7*4)(BP), R15 // h = H7 + MOVL (BP), R8 + MOVL 4(BP), R9 + MOVL 8(BP), R10 + MOVL 12(BP), R11 + MOVL 16(BP), R12 + MOVL 20(BP), R13 + MOVL 24(BP), R14 + MOVL 28(BP), R15 loop: - MOVQ SP, BP - - SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15) - SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14) - SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13) - SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12) - SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11) - SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10) - SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9) - SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8) - SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15) - SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14) - SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13) - SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12) - SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11) - SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10) - SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9) - SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8) - - SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15) - SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14) - SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13) - SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12) - SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11) - SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10) - SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9) - SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8) - SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15) - SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14) - SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13) - SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12) - SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11) - SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10) - SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9) - SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8) - SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15) - SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14) - SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13) - SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12) - SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11) - SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10) - SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9) - SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8) - SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15) - SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14) - SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13) - SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12) - SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11) - SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10) - SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9) - SHA256ROUND1(47, 0x106aa070, 
R9, R10, R11, R12, R13, R14, R15, R8) - SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15) - SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14) - SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13) - SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12) - SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11) - SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10) - SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9) - SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8) - SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15) - SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14) - SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13) - SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12) - SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11) - SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10) - SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9) - SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8) - - MOVQ dig+0(FP), BP - ADDL (0*4)(BP), R8 // H0 = a + H0 - MOVL R8, (0*4)(BP) - ADDL (1*4)(BP), R9 // H1 = b + H1 - MOVL R9, (1*4)(BP) - ADDL (2*4)(BP), R10 // H2 = c + H2 - MOVL R10, (2*4)(BP) - ADDL (3*4)(BP), R11 // H3 = d + H3 - MOVL R11, (3*4)(BP) - ADDL (4*4)(BP), R12 // H4 = e + H4 - MOVL R12, (4*4)(BP) - ADDL (5*4)(BP), R13 // H5 = f + H5 - MOVL R13, (5*4)(BP) - ADDL (6*4)(BP), R14 // H6 = g + H6 - MOVL R14, (6*4)(BP) - ADDL (7*4)(BP), R15 // H7 = h + H7 - MOVL R15, (7*4)(BP) - - ADDQ $64, SI - CMPQ SI, 256(SP) - JB loop + MOVQ SP, BP + MOVL (SI), AX + BSWAPL AX + MOVL AX, (BP) + ADDL AX, R15 + MOVL R12, AX + ADDL $0x428a2f98, R15 + MOVL R12, CX + RORL $0x06, AX + MOVL R12, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R12, CX + RORL $0x19, DX + ANDL R13, CX + XORL AX, DX + MOVL R12, AX + NOTL AX + ADDL DX, R15 + ANDL R14, AX + XORL CX, AX + ADDL R15, AX + MOVL R8, DI + MOVL R10, BX + RORL $0x02, DI + MOVL R8, DX + ANDL R9, BX + RORL $0x0d, DX + MOVL R8, CX + ANDL R10, CX + XORL DX, DI + XORL CX, BX + MOVL R8, DX + MOVL R9, CX + RORL $0x16, DX + ANDL R8, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R15 + ADDL AX, R11 + ADDL AX, R15 + MOVL 4(SI), AX + BSWAPL AX + MOVL AX, 4(BP) + ADDL AX, R14 + MOVL R11, AX + ADDL $0x71374491, R14 + MOVL R11, CX + RORL $0x06, AX + MOVL R11, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R11, CX + RORL $0x19, DX + ANDL R12, CX + XORL AX, DX + MOVL R11, AX + NOTL AX + ADDL DX, R14 + ANDL R13, AX + XORL CX, AX + ADDL R14, AX + MOVL R15, DI + MOVL R9, BX + RORL $0x02, DI + MOVL R15, DX + ANDL R8, BX + RORL $0x0d, DX + MOVL R15, CX + ANDL R9, CX + XORL DX, DI + XORL CX, BX + MOVL R15, DX + MOVL R8, CX + RORL $0x16, DX + ANDL R15, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R14 + ADDL AX, R10 + ADDL AX, R14 + MOVL 8(SI), AX + BSWAPL AX + MOVL AX, 8(BP) + ADDL AX, R13 + MOVL R10, AX + ADDL $0xb5c0fbcf, R13 + MOVL R10, CX + RORL $0x06, AX + MOVL R10, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R10, CX + RORL $0x19, DX + ANDL R11, CX + XORL AX, DX + MOVL R10, AX + NOTL AX + ADDL DX, R13 + ANDL R12, AX + XORL CX, AX + ADDL R13, AX + MOVL R14, DI + MOVL R8, BX + RORL $0x02, DI + MOVL R14, DX + ANDL R15, BX + RORL $0x0d, DX + MOVL R14, CX + ANDL R8, CX + XORL DX, DI + XORL CX, BX + MOVL R14, DX + MOVL R15, CX + RORL $0x16, DX + ANDL R14, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R13 + ADDL AX, R9 + ADDL AX, R13 + 
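The Avo output above and below fully unrolls the 64 scalar rounds that the SHA256ROUND0/SHA256ROUND1 macro invocations used to expand. As a reference for what each repeated MOVL/RORL/ANDL/XORL block computes (the rotate counts 6, 11, 25 for e and 2, 13, 22 for a match the RORL immediates), here is a minimal pure-Go sketch of one round; function and variable names are mine, not the package's:

package main

import (
	"fmt"
	"math/bits"
)

// round applies one SHA-256 round to the working variables a..h,
// given the round constant k and message-schedule word w.
// Σ1 and Ch feed t1, Σ0 and Maj feed t2, and the working
// variables shift by one position, as in each unrolled block.
func round(a, b, c, d, e, f, g, h, k, w uint32) (uint32, uint32, uint32, uint32, uint32, uint32, uint32, uint32) {
	sigma1 := bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
	ch := (e & f) ^ (^e & g)
	t1 := h + sigma1 + ch + k + w

	sigma0 := bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
	maj := (a & b) ^ (a & c) ^ (b & c)
	t2 := sigma0 + maj

	return t1 + t2, a, b, c, d + t1, e, f, g
}

func main() {
	// One round with the first round constant (0x428a2f98) on an all-zero state.
	fmt.Println(round(0, 0, 0, 0, 0, 0, 0, 0, 0x428a2f98, 0))
}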
MOVL 12(SI), AX + BSWAPL AX + MOVL AX, 12(BP) + ADDL AX, R12 + MOVL R9, AX + ADDL $0xe9b5dba5, R12 + MOVL R9, CX + RORL $0x06, AX + MOVL R9, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R9, CX + RORL $0x19, DX + ANDL R10, CX + XORL AX, DX + MOVL R9, AX + NOTL AX + ADDL DX, R12 + ANDL R11, AX + XORL CX, AX + ADDL R12, AX + MOVL R13, DI + MOVL R15, BX + RORL $0x02, DI + MOVL R13, DX + ANDL R14, BX + RORL $0x0d, DX + MOVL R13, CX + ANDL R15, CX + XORL DX, DI + XORL CX, BX + MOVL R13, DX + MOVL R14, CX + RORL $0x16, DX + ANDL R13, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R12 + ADDL AX, R8 + ADDL AX, R12 + MOVL 16(SI), AX + BSWAPL AX + MOVL AX, 16(BP) + ADDL AX, R11 + MOVL R8, AX + ADDL $0x3956c25b, R11 + MOVL R8, CX + RORL $0x06, AX + MOVL R8, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R8, CX + RORL $0x19, DX + ANDL R9, CX + XORL AX, DX + MOVL R8, AX + NOTL AX + ADDL DX, R11 + ANDL R10, AX + XORL CX, AX + ADDL R11, AX + MOVL R12, DI + MOVL R14, BX + RORL $0x02, DI + MOVL R12, DX + ANDL R13, BX + RORL $0x0d, DX + MOVL R12, CX + ANDL R14, CX + XORL DX, DI + XORL CX, BX + MOVL R12, DX + MOVL R13, CX + RORL $0x16, DX + ANDL R12, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R11 + ADDL AX, R15 + ADDL AX, R11 + MOVL 20(SI), AX + BSWAPL AX + MOVL AX, 20(BP) + ADDL AX, R10 + MOVL R15, AX + ADDL $0x59f111f1, R10 + MOVL R15, CX + RORL $0x06, AX + MOVL R15, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R15, CX + RORL $0x19, DX + ANDL R8, CX + XORL AX, DX + MOVL R15, AX + NOTL AX + ADDL DX, R10 + ANDL R9, AX + XORL CX, AX + ADDL R10, AX + MOVL R11, DI + MOVL R13, BX + RORL $0x02, DI + MOVL R11, DX + ANDL R12, BX + RORL $0x0d, DX + MOVL R11, CX + ANDL R13, CX + XORL DX, DI + XORL CX, BX + MOVL R11, DX + MOVL R12, CX + RORL $0x16, DX + ANDL R11, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R10 + ADDL AX, R14 + ADDL AX, R10 + MOVL 24(SI), AX + BSWAPL AX + MOVL AX, 24(BP) + ADDL AX, R9 + MOVL R14, AX + ADDL $0x923f82a4, R9 + MOVL R14, CX + RORL $0x06, AX + MOVL R14, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R14, CX + RORL $0x19, DX + ANDL R15, CX + XORL AX, DX + MOVL R14, AX + NOTL AX + ADDL DX, R9 + ANDL R8, AX + XORL CX, AX + ADDL R9, AX + MOVL R10, DI + MOVL R12, BX + RORL $0x02, DI + MOVL R10, DX + ANDL R11, BX + RORL $0x0d, DX + MOVL R10, CX + ANDL R12, CX + XORL DX, DI + XORL CX, BX + MOVL R10, DX + MOVL R11, CX + RORL $0x16, DX + ANDL R10, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R9 + ADDL AX, R13 + ADDL AX, R9 + MOVL 28(SI), AX + BSWAPL AX + MOVL AX, 28(BP) + ADDL AX, R8 + MOVL R13, AX + ADDL $0xab1c5ed5, R8 + MOVL R13, CX + RORL $0x06, AX + MOVL R13, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R13, CX + RORL $0x19, DX + ANDL R14, CX + XORL AX, DX + MOVL R13, AX + NOTL AX + ADDL DX, R8 + ANDL R15, AX + XORL CX, AX + ADDL R8, AX + MOVL R9, DI + MOVL R11, BX + RORL $0x02, DI + MOVL R9, DX + ANDL R10, BX + RORL $0x0d, DX + MOVL R9, CX + ANDL R11, CX + XORL DX, DI + XORL CX, BX + MOVL R9, DX + MOVL R10, CX + RORL $0x16, DX + ANDL R9, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R8 + ADDL AX, R12 + ADDL AX, R8 + MOVL 32(SI), AX + BSWAPL AX + MOVL AX, 32(BP) + ADDL AX, R15 + MOVL R12, AX + ADDL $0xd807aa98, R15 + MOVL R12, CX + RORL $0x06, AX + MOVL R12, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R12, CX + RORL $0x19, DX + ANDL R13, CX + XORL AX, DX + MOVL R12, AX + NOTL AX + ADDL DX, R15 + ANDL R14, AX + XORL CX, AX + ADDL R15, AX + MOVL R8, DI + MOVL R10, BX + RORL $0x02, DI + MOVL R8, DX + ANDL R9, BX + RORL $0x0d, DX + MOVL R8, CX + ANDL R10, CX + XORL DX, DI 
+ XORL CX, BX + MOVL R8, DX + MOVL R9, CX + RORL $0x16, DX + ANDL R8, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R15 + ADDL AX, R11 + ADDL AX, R15 + MOVL 36(SI), AX + BSWAPL AX + MOVL AX, 36(BP) + ADDL AX, R14 + MOVL R11, AX + ADDL $0x12835b01, R14 + MOVL R11, CX + RORL $0x06, AX + MOVL R11, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R11, CX + RORL $0x19, DX + ANDL R12, CX + XORL AX, DX + MOVL R11, AX + NOTL AX + ADDL DX, R14 + ANDL R13, AX + XORL CX, AX + ADDL R14, AX + MOVL R15, DI + MOVL R9, BX + RORL $0x02, DI + MOVL R15, DX + ANDL R8, BX + RORL $0x0d, DX + MOVL R15, CX + ANDL R9, CX + XORL DX, DI + XORL CX, BX + MOVL R15, DX + MOVL R8, CX + RORL $0x16, DX + ANDL R15, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R14 + ADDL AX, R10 + ADDL AX, R14 + MOVL 40(SI), AX + BSWAPL AX + MOVL AX, 40(BP) + ADDL AX, R13 + MOVL R10, AX + ADDL $0x243185be, R13 + MOVL R10, CX + RORL $0x06, AX + MOVL R10, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R10, CX + RORL $0x19, DX + ANDL R11, CX + XORL AX, DX + MOVL R10, AX + NOTL AX + ADDL DX, R13 + ANDL R12, AX + XORL CX, AX + ADDL R13, AX + MOVL R14, DI + MOVL R8, BX + RORL $0x02, DI + MOVL R14, DX + ANDL R15, BX + RORL $0x0d, DX + MOVL R14, CX + ANDL R8, CX + XORL DX, DI + XORL CX, BX + MOVL R14, DX + MOVL R15, CX + RORL $0x16, DX + ANDL R14, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R13 + ADDL AX, R9 + ADDL AX, R13 + MOVL 44(SI), AX + BSWAPL AX + MOVL AX, 44(BP) + ADDL AX, R12 + MOVL R9, AX + ADDL $0x550c7dc3, R12 + MOVL R9, CX + RORL $0x06, AX + MOVL R9, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R9, CX + RORL $0x19, DX + ANDL R10, CX + XORL AX, DX + MOVL R9, AX + NOTL AX + ADDL DX, R12 + ANDL R11, AX + XORL CX, AX + ADDL R12, AX + MOVL R13, DI + MOVL R15, BX + RORL $0x02, DI + MOVL R13, DX + ANDL R14, BX + RORL $0x0d, DX + MOVL R13, CX + ANDL R15, CX + XORL DX, DI + XORL CX, BX + MOVL R13, DX + MOVL R14, CX + RORL $0x16, DX + ANDL R13, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R12 + ADDL AX, R8 + ADDL AX, R12 + MOVL 48(SI), AX + BSWAPL AX + MOVL AX, 48(BP) + ADDL AX, R11 + MOVL R8, AX + ADDL $0x72be5d74, R11 + MOVL R8, CX + RORL $0x06, AX + MOVL R8, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R8, CX + RORL $0x19, DX + ANDL R9, CX + XORL AX, DX + MOVL R8, AX + NOTL AX + ADDL DX, R11 + ANDL R10, AX + XORL CX, AX + ADDL R11, AX + MOVL R12, DI + MOVL R14, BX + RORL $0x02, DI + MOVL R12, DX + ANDL R13, BX + RORL $0x0d, DX + MOVL R12, CX + ANDL R14, CX + XORL DX, DI + XORL CX, BX + MOVL R12, DX + MOVL R13, CX + RORL $0x16, DX + ANDL R12, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R11 + ADDL AX, R15 + ADDL AX, R11 + MOVL 52(SI), AX + BSWAPL AX + MOVL AX, 52(BP) + ADDL AX, R10 + MOVL R15, AX + ADDL $0x80deb1fe, R10 + MOVL R15, CX + RORL $0x06, AX + MOVL R15, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R15, CX + RORL $0x19, DX + ANDL R8, CX + XORL AX, DX + MOVL R15, AX + NOTL AX + ADDL DX, R10 + ANDL R9, AX + XORL CX, AX + ADDL R10, AX + MOVL R11, DI + MOVL R13, BX + RORL $0x02, DI + MOVL R11, DX + ANDL R12, BX + RORL $0x0d, DX + MOVL R11, CX + ANDL R13, CX + XORL DX, DI + XORL CX, BX + MOVL R11, DX + MOVL R12, CX + RORL $0x16, DX + ANDL R11, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R10 + ADDL AX, R14 + ADDL AX, R10 + MOVL 56(SI), AX + BSWAPL AX + MOVL AX, 56(BP) + ADDL AX, R9 + MOVL R14, AX + ADDL $0x9bdc06a7, R9 + MOVL R14, CX + RORL $0x06, AX + MOVL R14, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R14, CX + RORL $0x19, DX + ANDL R15, CX + XORL AX, DX + MOVL R14, AX + NOTL AX + ADDL DX, R9 + ANDL 
R8, AX + XORL CX, AX + ADDL R9, AX + MOVL R10, DI + MOVL R12, BX + RORL $0x02, DI + MOVL R10, DX + ANDL R11, BX + RORL $0x0d, DX + MOVL R10, CX + ANDL R12, CX + XORL DX, DI + XORL CX, BX + MOVL R10, DX + MOVL R11, CX + RORL $0x16, DX + ANDL R10, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R9 + ADDL AX, R13 + ADDL AX, R9 + MOVL 60(SI), AX + BSWAPL AX + MOVL AX, 60(BP) + ADDL AX, R8 + MOVL R13, AX + ADDL $0xc19bf174, R8 + MOVL R13, CX + RORL $0x06, AX + MOVL R13, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R13, CX + RORL $0x19, DX + ANDL R14, CX + XORL AX, DX + MOVL R13, AX + NOTL AX + ADDL DX, R8 + ANDL R15, AX + XORL CX, AX + ADDL R8, AX + MOVL R9, DI + MOVL R11, BX + RORL $0x02, DI + MOVL R9, DX + ANDL R10, BX + RORL $0x0d, DX + MOVL R9, CX + ANDL R11, CX + XORL DX, DI + XORL CX, BX + MOVL R9, DX + MOVL R10, CX + RORL $0x16, DX + ANDL R9, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R8 + ADDL AX, R12 + ADDL AX, R8 + MOVL 56(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 4(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 36(BP), AX + XORL CX, BX + XORL DX, BX + ADDL (BP), BX + ADDL BX, AX + MOVL AX, 64(BP) + ADDL AX, R15 + MOVL R12, AX + ADDL $0xe49b69c1, R15 + MOVL R12, CX + RORL $0x06, AX + MOVL R12, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R12, CX + RORL $0x19, DX + ANDL R13, CX + XORL AX, DX + MOVL R12, AX + NOTL AX + ADDL DX, R15 + ANDL R14, AX + XORL CX, AX + ADDL R15, AX + MOVL R8, DI + MOVL R10, BX + RORL $0x02, DI + MOVL R8, DX + ANDL R9, BX + RORL $0x0d, DX + MOVL R8, CX + ANDL R10, CX + XORL DX, DI + XORL CX, BX + MOVL R8, DX + MOVL R9, CX + RORL $0x16, DX + ANDL R8, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R15 + ADDL AX, R11 + ADDL AX, R15 + MOVL 60(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 8(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 40(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 4(BP), BX + ADDL BX, AX + MOVL AX, 68(BP) + ADDL AX, R14 + MOVL R11, AX + ADDL $0xefbe4786, R14 + MOVL R11, CX + RORL $0x06, AX + MOVL R11, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R11, CX + RORL $0x19, DX + ANDL R12, CX + XORL AX, DX + MOVL R11, AX + NOTL AX + ADDL DX, R14 + ANDL R13, AX + XORL CX, AX + ADDL R14, AX + MOVL R15, DI + MOVL R9, BX + RORL $0x02, DI + MOVL R15, DX + ANDL R8, BX + RORL $0x0d, DX + MOVL R15, CX + ANDL R9, CX + XORL DX, DI + XORL CX, BX + MOVL R15, DX + MOVL R8, CX + RORL $0x16, DX + ANDL R15, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R14 + ADDL AX, R10 + ADDL AX, R14 + MOVL 64(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 12(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 44(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 8(BP), BX + ADDL BX, AX + MOVL AX, 72(BP) + ADDL AX, R13 + MOVL R10, AX + ADDL $0x0fc19dc6, R13 + MOVL R10, CX + RORL $0x06, AX + MOVL R10, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R10, CX + RORL $0x19, DX + ANDL R11, CX + XORL AX, DX + MOVL R10, AX + NOTL AX + ADDL DX, R13 + ANDL R12, AX + XORL CX, AX + ADDL R13, AX + MOVL R14, DI + MOVL R8, BX + RORL $0x02, DI + MOVL R14, DX + ANDL R15, BX + RORL $0x0d, DX + MOVL R14, CX + ANDL R8, CX + XORL DX, DI + XORL CX, BX + MOVL R14, DX + MOVL R15, CX + RORL $0x16, DX + ANDL R14, CX + XORL CX, BX + XORL 
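From round 16 onward, each unrolled round above is preceded by a short stretch that builds the next message-schedule word on the stack: RORL $0x11/$0x13 with SHRL $0x0a implement σ1, and RORL $0x07/$0x12 with SHRL $0x03 implement σ0. A hedged Go sketch of that recurrence (helper names are mine, not the package's):

package main

import (
	"fmt"
	"math/bits"
)

// smallSigma0 and smallSigma1 are σ0 and σ1 from FIPS 180-4; the shift and
// rotate counts match the RORL/SHRL immediates in the generated schedule code.
func smallSigma0(x uint32) uint32 {
	return bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ (x >> 3)
}

func smallSigma1(x uint32) uint32 {
	return bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ (x >> 10)
}

// expand fills w[16:64] from the first 16 block words, the same recurrence
// the assembly evaluates one word at a time on the stack.
func expand(w *[64]uint32) {
	for t := 16; t < 64; t++ {
		w[t] = smallSigma1(w[t-2]) + w[t-7] + smallSigma0(w[t-15]) + w[t-16]
	}
}

func main() {
	var w [64]uint32
	w[0] = 0x61626380 // "abc" plus the padding bit, as in the FIPS 180-4 example block
	w[15] = 24        // message length in bits
	expand(&w)
	fmt.Printf("W[16] = %#08x\n", w[16])
}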
DX, DI + ADDL DI, BX + MOVL BX, R13 + ADDL AX, R9 + ADDL AX, R13 + MOVL 68(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 16(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 48(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 12(BP), BX + ADDL BX, AX + MOVL AX, 76(BP) + ADDL AX, R12 + MOVL R9, AX + ADDL $0x240ca1cc, R12 + MOVL R9, CX + RORL $0x06, AX + MOVL R9, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R9, CX + RORL $0x19, DX + ANDL R10, CX + XORL AX, DX + MOVL R9, AX + NOTL AX + ADDL DX, R12 + ANDL R11, AX + XORL CX, AX + ADDL R12, AX + MOVL R13, DI + MOVL R15, BX + RORL $0x02, DI + MOVL R13, DX + ANDL R14, BX + RORL $0x0d, DX + MOVL R13, CX + ANDL R15, CX + XORL DX, DI + XORL CX, BX + MOVL R13, DX + MOVL R14, CX + RORL $0x16, DX + ANDL R13, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R12 + ADDL AX, R8 + ADDL AX, R12 + MOVL 72(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 20(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 52(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 16(BP), BX + ADDL BX, AX + MOVL AX, 80(BP) + ADDL AX, R11 + MOVL R8, AX + ADDL $0x2de92c6f, R11 + MOVL R8, CX + RORL $0x06, AX + MOVL R8, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R8, CX + RORL $0x19, DX + ANDL R9, CX + XORL AX, DX + MOVL R8, AX + NOTL AX + ADDL DX, R11 + ANDL R10, AX + XORL CX, AX + ADDL R11, AX + MOVL R12, DI + MOVL R14, BX + RORL $0x02, DI + MOVL R12, DX + ANDL R13, BX + RORL $0x0d, DX + MOVL R12, CX + ANDL R14, CX + XORL DX, DI + XORL CX, BX + MOVL R12, DX + MOVL R13, CX + RORL $0x16, DX + ANDL R12, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R11 + ADDL AX, R15 + ADDL AX, R11 + MOVL 76(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 24(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 56(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 20(BP), BX + ADDL BX, AX + MOVL AX, 84(BP) + ADDL AX, R10 + MOVL R15, AX + ADDL $0x4a7484aa, R10 + MOVL R15, CX + RORL $0x06, AX + MOVL R15, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R15, CX + RORL $0x19, DX + ANDL R8, CX + XORL AX, DX + MOVL R15, AX + NOTL AX + ADDL DX, R10 + ANDL R9, AX + XORL CX, AX + ADDL R10, AX + MOVL R11, DI + MOVL R13, BX + RORL $0x02, DI + MOVL R11, DX + ANDL R12, BX + RORL $0x0d, DX + MOVL R11, CX + ANDL R13, CX + XORL DX, DI + XORL CX, BX + MOVL R11, DX + MOVL R12, CX + RORL $0x16, DX + ANDL R11, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R10 + ADDL AX, R14 + ADDL AX, R10 + MOVL 80(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 28(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 60(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 24(BP), BX + ADDL BX, AX + MOVL AX, 88(BP) + ADDL AX, R9 + MOVL R14, AX + ADDL $0x5cb0a9dc, R9 + MOVL R14, CX + RORL $0x06, AX + MOVL R14, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R14, CX + RORL $0x19, DX + ANDL R15, CX + XORL AX, DX + MOVL R14, AX + NOTL AX + ADDL DX, R9 + ANDL R8, AX + XORL CX, AX + ADDL R9, AX + MOVL R10, DI + MOVL R12, BX + RORL $0x02, DI + MOVL R10, DX + ANDL R11, BX + RORL $0x0d, DX + MOVL R10, CX + ANDL R12, CX + XORL DX, DI + XORL CX, BX + MOVL R10, DX + MOVL R11, CX + RORL $0x16, DX + ANDL R10, CX + XORL CX, 
BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R9 + ADDL AX, R13 + ADDL AX, R9 + MOVL 84(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 32(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 64(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 28(BP), BX + ADDL BX, AX + MOVL AX, 92(BP) + ADDL AX, R8 + MOVL R13, AX + ADDL $0x76f988da, R8 + MOVL R13, CX + RORL $0x06, AX + MOVL R13, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R13, CX + RORL $0x19, DX + ANDL R14, CX + XORL AX, DX + MOVL R13, AX + NOTL AX + ADDL DX, R8 + ANDL R15, AX + XORL CX, AX + ADDL R8, AX + MOVL R9, DI + MOVL R11, BX + RORL $0x02, DI + MOVL R9, DX + ANDL R10, BX + RORL $0x0d, DX + MOVL R9, CX + ANDL R11, CX + XORL DX, DI + XORL CX, BX + MOVL R9, DX + MOVL R10, CX + RORL $0x16, DX + ANDL R9, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R8 + ADDL AX, R12 + ADDL AX, R8 + MOVL 88(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 36(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 68(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 32(BP), BX + ADDL BX, AX + MOVL AX, 96(BP) + ADDL AX, R15 + MOVL R12, AX + ADDL $0x983e5152, R15 + MOVL R12, CX + RORL $0x06, AX + MOVL R12, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R12, CX + RORL $0x19, DX + ANDL R13, CX + XORL AX, DX + MOVL R12, AX + NOTL AX + ADDL DX, R15 + ANDL R14, AX + XORL CX, AX + ADDL R15, AX + MOVL R8, DI + MOVL R10, BX + RORL $0x02, DI + MOVL R8, DX + ANDL R9, BX + RORL $0x0d, DX + MOVL R8, CX + ANDL R10, CX + XORL DX, DI + XORL CX, BX + MOVL R8, DX + MOVL R9, CX + RORL $0x16, DX + ANDL R8, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R15 + ADDL AX, R11 + ADDL AX, R15 + MOVL 92(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 40(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 72(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 36(BP), BX + ADDL BX, AX + MOVL AX, 100(BP) + ADDL AX, R14 + MOVL R11, AX + ADDL $0xa831c66d, R14 + MOVL R11, CX + RORL $0x06, AX + MOVL R11, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R11, CX + RORL $0x19, DX + ANDL R12, CX + XORL AX, DX + MOVL R11, AX + NOTL AX + ADDL DX, R14 + ANDL R13, AX + XORL CX, AX + ADDL R14, AX + MOVL R15, DI + MOVL R9, BX + RORL $0x02, DI + MOVL R15, DX + ANDL R8, BX + RORL $0x0d, DX + MOVL R15, CX + ANDL R9, CX + XORL DX, DI + XORL CX, BX + MOVL R15, DX + MOVL R8, CX + RORL $0x16, DX + ANDL R15, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R14 + ADDL AX, R10 + ADDL AX, R14 + MOVL 96(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 44(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 76(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 40(BP), BX + ADDL BX, AX + MOVL AX, 104(BP) + ADDL AX, R13 + MOVL R10, AX + ADDL $0xb00327c8, R13 + MOVL R10, CX + RORL $0x06, AX + MOVL R10, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R10, CX + RORL $0x19, DX + ANDL R11, CX + XORL AX, DX + MOVL R10, AX + NOTL AX + ADDL DX, R13 + ANDL R12, AX + XORL CX, AX + ADDL R13, AX + MOVL R14, DI + MOVL R8, BX + RORL $0x02, DI + MOVL R14, DX + ANDL R15, BX + RORL $0x0d, DX + MOVL R14, CX + ANDL R8, CX + XORL DX, DI + XORL CX, BX + MOVL R14, DX + MOVL R15, CX + RORL $0x16, DX + ANDL R14, CX + 
XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R13 + ADDL AX, R9 + ADDL AX, R13 + MOVL 100(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 48(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 80(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 44(BP), BX + ADDL BX, AX + MOVL AX, 108(BP) + ADDL AX, R12 + MOVL R9, AX + ADDL $0xbf597fc7, R12 + MOVL R9, CX + RORL $0x06, AX + MOVL R9, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R9, CX + RORL $0x19, DX + ANDL R10, CX + XORL AX, DX + MOVL R9, AX + NOTL AX + ADDL DX, R12 + ANDL R11, AX + XORL CX, AX + ADDL R12, AX + MOVL R13, DI + MOVL R15, BX + RORL $0x02, DI + MOVL R13, DX + ANDL R14, BX + RORL $0x0d, DX + MOVL R13, CX + ANDL R15, CX + XORL DX, DI + XORL CX, BX + MOVL R13, DX + MOVL R14, CX + RORL $0x16, DX + ANDL R13, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R12 + ADDL AX, R8 + ADDL AX, R12 + MOVL 104(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 52(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 84(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 48(BP), BX + ADDL BX, AX + MOVL AX, 112(BP) + ADDL AX, R11 + MOVL R8, AX + ADDL $0xc6e00bf3, R11 + MOVL R8, CX + RORL $0x06, AX + MOVL R8, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R8, CX + RORL $0x19, DX + ANDL R9, CX + XORL AX, DX + MOVL R8, AX + NOTL AX + ADDL DX, R11 + ANDL R10, AX + XORL CX, AX + ADDL R11, AX + MOVL R12, DI + MOVL R14, BX + RORL $0x02, DI + MOVL R12, DX + ANDL R13, BX + RORL $0x0d, DX + MOVL R12, CX + ANDL R14, CX + XORL DX, DI + XORL CX, BX + MOVL R12, DX + MOVL R13, CX + RORL $0x16, DX + ANDL R12, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R11 + ADDL AX, R15 + ADDL AX, R11 + MOVL 108(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 56(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 88(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 52(BP), BX + ADDL BX, AX + MOVL AX, 116(BP) + ADDL AX, R10 + MOVL R15, AX + ADDL $0xd5a79147, R10 + MOVL R15, CX + RORL $0x06, AX + MOVL R15, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R15, CX + RORL $0x19, DX + ANDL R8, CX + XORL AX, DX + MOVL R15, AX + NOTL AX + ADDL DX, R10 + ANDL R9, AX + XORL CX, AX + ADDL R10, AX + MOVL R11, DI + MOVL R13, BX + RORL $0x02, DI + MOVL R11, DX + ANDL R12, BX + RORL $0x0d, DX + MOVL R11, CX + ANDL R13, CX + XORL DX, DI + XORL CX, BX + MOVL R11, DX + MOVL R12, CX + RORL $0x16, DX + ANDL R11, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R10 + ADDL AX, R14 + ADDL AX, R10 + MOVL 112(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 60(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 92(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 56(BP), BX + ADDL BX, AX + MOVL AX, 120(BP) + ADDL AX, R9 + MOVL R14, AX + ADDL $0x06ca6351, R9 + MOVL R14, CX + RORL $0x06, AX + MOVL R14, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R14, CX + RORL $0x19, DX + ANDL R15, CX + XORL AX, DX + MOVL R14, AX + NOTL AX + ADDL DX, R9 + ANDL R8, AX + XORL CX, AX + ADDL R9, AX + MOVL R10, DI + MOVL R12, BX + RORL $0x02, DI + MOVL R10, DX + ANDL R11, BX + RORL $0x0d, DX + MOVL R10, CX + ANDL R12, CX + XORL DX, DI + XORL CX, BX + MOVL R10, DX + MOVL R11, CX + RORL $0x16, DX 
+ ANDL R10, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R9 + ADDL AX, R13 + ADDL AX, R9 + MOVL 116(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 64(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 96(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 60(BP), BX + ADDL BX, AX + MOVL AX, 124(BP) + ADDL AX, R8 + MOVL R13, AX + ADDL $0x14292967, R8 + MOVL R13, CX + RORL $0x06, AX + MOVL R13, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R13, CX + RORL $0x19, DX + ANDL R14, CX + XORL AX, DX + MOVL R13, AX + NOTL AX + ADDL DX, R8 + ANDL R15, AX + XORL CX, AX + ADDL R8, AX + MOVL R9, DI + MOVL R11, BX + RORL $0x02, DI + MOVL R9, DX + ANDL R10, BX + RORL $0x0d, DX + MOVL R9, CX + ANDL R11, CX + XORL DX, DI + XORL CX, BX + MOVL R9, DX + MOVL R10, CX + RORL $0x16, DX + ANDL R9, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R8 + ADDL AX, R12 + ADDL AX, R8 + MOVL 120(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 68(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 100(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 64(BP), BX + ADDL BX, AX + MOVL AX, 128(BP) + ADDL AX, R15 + MOVL R12, AX + ADDL $0x27b70a85, R15 + MOVL R12, CX + RORL $0x06, AX + MOVL R12, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R12, CX + RORL $0x19, DX + ANDL R13, CX + XORL AX, DX + MOVL R12, AX + NOTL AX + ADDL DX, R15 + ANDL R14, AX + XORL CX, AX + ADDL R15, AX + MOVL R8, DI + MOVL R10, BX + RORL $0x02, DI + MOVL R8, DX + ANDL R9, BX + RORL $0x0d, DX + MOVL R8, CX + ANDL R10, CX + XORL DX, DI + XORL CX, BX + MOVL R8, DX + MOVL R9, CX + RORL $0x16, DX + ANDL R8, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R15 + ADDL AX, R11 + ADDL AX, R15 + MOVL 124(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 72(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 104(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 68(BP), BX + ADDL BX, AX + MOVL AX, 132(BP) + ADDL AX, R14 + MOVL R11, AX + ADDL $0x2e1b2138, R14 + MOVL R11, CX + RORL $0x06, AX + MOVL R11, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R11, CX + RORL $0x19, DX + ANDL R12, CX + XORL AX, DX + MOVL R11, AX + NOTL AX + ADDL DX, R14 + ANDL R13, AX + XORL CX, AX + ADDL R14, AX + MOVL R15, DI + MOVL R9, BX + RORL $0x02, DI + MOVL R15, DX + ANDL R8, BX + RORL $0x0d, DX + MOVL R15, CX + ANDL R9, CX + XORL DX, DI + XORL CX, BX + MOVL R15, DX + MOVL R8, CX + RORL $0x16, DX + ANDL R15, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R14 + ADDL AX, R10 + ADDL AX, R14 + MOVL 128(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 76(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 108(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 72(BP), BX + ADDL BX, AX + MOVL AX, 136(BP) + ADDL AX, R13 + MOVL R10, AX + ADDL $0x4d2c6dfc, R13 + MOVL R10, CX + RORL $0x06, AX + MOVL R10, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R10, CX + RORL $0x19, DX + ANDL R11, CX + XORL AX, DX + MOVL R10, AX + NOTL AX + ADDL DX, R13 + ANDL R12, AX + XORL CX, AX + ADDL R13, AX + MOVL R14, DI + MOVL R8, BX + RORL $0x02, DI + MOVL R14, DX + ANDL R15, BX + RORL $0x0d, DX + MOVL R14, CX + ANDL R8, CX + XORL DX, DI + XORL CX, BX + MOVL R14, DX + MOVL R15, CX + 
RORL $0x16, DX + ANDL R14, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R13 + ADDL AX, R9 + ADDL AX, R13 + MOVL 132(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 80(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 112(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 76(BP), BX + ADDL BX, AX + MOVL AX, 140(BP) + ADDL AX, R12 + MOVL R9, AX + ADDL $0x53380d13, R12 + MOVL R9, CX + RORL $0x06, AX + MOVL R9, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R9, CX + RORL $0x19, DX + ANDL R10, CX + XORL AX, DX + MOVL R9, AX + NOTL AX + ADDL DX, R12 + ANDL R11, AX + XORL CX, AX + ADDL R12, AX + MOVL R13, DI + MOVL R15, BX + RORL $0x02, DI + MOVL R13, DX + ANDL R14, BX + RORL $0x0d, DX + MOVL R13, CX + ANDL R15, CX + XORL DX, DI + XORL CX, BX + MOVL R13, DX + MOVL R14, CX + RORL $0x16, DX + ANDL R13, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R12 + ADDL AX, R8 + ADDL AX, R12 + MOVL 136(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 84(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 116(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 80(BP), BX + ADDL BX, AX + MOVL AX, 144(BP) + ADDL AX, R11 + MOVL R8, AX + ADDL $0x650a7354, R11 + MOVL R8, CX + RORL $0x06, AX + MOVL R8, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R8, CX + RORL $0x19, DX + ANDL R9, CX + XORL AX, DX + MOVL R8, AX + NOTL AX + ADDL DX, R11 + ANDL R10, AX + XORL CX, AX + ADDL R11, AX + MOVL R12, DI + MOVL R14, BX + RORL $0x02, DI + MOVL R12, DX + ANDL R13, BX + RORL $0x0d, DX + MOVL R12, CX + ANDL R14, CX + XORL DX, DI + XORL CX, BX + MOVL R12, DX + MOVL R13, CX + RORL $0x16, DX + ANDL R12, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R11 + ADDL AX, R15 + ADDL AX, R11 + MOVL 140(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 88(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 120(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 84(BP), BX + ADDL BX, AX + MOVL AX, 148(BP) + ADDL AX, R10 + MOVL R15, AX + ADDL $0x766a0abb, R10 + MOVL R15, CX + RORL $0x06, AX + MOVL R15, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R15, CX + RORL $0x19, DX + ANDL R8, CX + XORL AX, DX + MOVL R15, AX + NOTL AX + ADDL DX, R10 + ANDL R9, AX + XORL CX, AX + ADDL R10, AX + MOVL R11, DI + MOVL R13, BX + RORL $0x02, DI + MOVL R11, DX + ANDL R12, BX + RORL $0x0d, DX + MOVL R11, CX + ANDL R13, CX + XORL DX, DI + XORL CX, BX + MOVL R11, DX + MOVL R12, CX + RORL $0x16, DX + ANDL R11, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R10 + ADDL AX, R14 + ADDL AX, R10 + MOVL 144(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 92(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 124(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 88(BP), BX + ADDL BX, AX + MOVL AX, 152(BP) + ADDL AX, R9 + MOVL R14, AX + ADDL $0x81c2c92e, R9 + MOVL R14, CX + RORL $0x06, AX + MOVL R14, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R14, CX + RORL $0x19, DX + ANDL R15, CX + XORL AX, DX + MOVL R14, AX + NOTL AX + ADDL DX, R9 + ANDL R8, AX + XORL CX, AX + ADDL R9, AX + MOVL R10, DI + MOVL R12, BX + RORL $0x02, DI + MOVL R10, DX + ANDL R11, BX + RORL $0x0d, DX + MOVL R10, CX + ANDL R12, CX + XORL DX, DI + XORL CX, BX + MOVL 
R10, DX + MOVL R11, CX + RORL $0x16, DX + ANDL R10, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R9 + ADDL AX, R13 + ADDL AX, R9 + MOVL 148(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 96(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 128(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 92(BP), BX + ADDL BX, AX + MOVL AX, 156(BP) + ADDL AX, R8 + MOVL R13, AX + ADDL $0x92722c85, R8 + MOVL R13, CX + RORL $0x06, AX + MOVL R13, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R13, CX + RORL $0x19, DX + ANDL R14, CX + XORL AX, DX + MOVL R13, AX + NOTL AX + ADDL DX, R8 + ANDL R15, AX + XORL CX, AX + ADDL R8, AX + MOVL R9, DI + MOVL R11, BX + RORL $0x02, DI + MOVL R9, DX + ANDL R10, BX + RORL $0x0d, DX + MOVL R9, CX + ANDL R11, CX + XORL DX, DI + XORL CX, BX + MOVL R9, DX + MOVL R10, CX + RORL $0x16, DX + ANDL R9, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R8 + ADDL AX, R12 + ADDL AX, R8 + MOVL 152(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 100(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 132(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 96(BP), BX + ADDL BX, AX + MOVL AX, 160(BP) + ADDL AX, R15 + MOVL R12, AX + ADDL $0xa2bfe8a1, R15 + MOVL R12, CX + RORL $0x06, AX + MOVL R12, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R12, CX + RORL $0x19, DX + ANDL R13, CX + XORL AX, DX + MOVL R12, AX + NOTL AX + ADDL DX, R15 + ANDL R14, AX + XORL CX, AX + ADDL R15, AX + MOVL R8, DI + MOVL R10, BX + RORL $0x02, DI + MOVL R8, DX + ANDL R9, BX + RORL $0x0d, DX + MOVL R8, CX + ANDL R10, CX + XORL DX, DI + XORL CX, BX + MOVL R8, DX + MOVL R9, CX + RORL $0x16, DX + ANDL R8, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R15 + ADDL AX, R11 + ADDL AX, R15 + MOVL 156(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 104(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 136(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 100(BP), BX + ADDL BX, AX + MOVL AX, 164(BP) + ADDL AX, R14 + MOVL R11, AX + ADDL $0xa81a664b, R14 + MOVL R11, CX + RORL $0x06, AX + MOVL R11, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R11, CX + RORL $0x19, DX + ANDL R12, CX + XORL AX, DX + MOVL R11, AX + NOTL AX + ADDL DX, R14 + ANDL R13, AX + XORL CX, AX + ADDL R14, AX + MOVL R15, DI + MOVL R9, BX + RORL $0x02, DI + MOVL R15, DX + ANDL R8, BX + RORL $0x0d, DX + MOVL R15, CX + ANDL R9, CX + XORL DX, DI + XORL CX, BX + MOVL R15, DX + MOVL R8, CX + RORL $0x16, DX + ANDL R15, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R14 + ADDL AX, R10 + ADDL AX, R14 + MOVL 160(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 108(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 140(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 104(BP), BX + ADDL BX, AX + MOVL AX, 168(BP) + ADDL AX, R13 + MOVL R10, AX + ADDL $0xc24b8b70, R13 + MOVL R10, CX + RORL $0x06, AX + MOVL R10, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R10, CX + RORL $0x19, DX + ANDL R11, CX + XORL AX, DX + MOVL R10, AX + NOTL AX + ADDL DX, R13 + ANDL R12, AX + XORL CX, AX + ADDL R13, AX + MOVL R14, DI + MOVL R8, BX + RORL $0x02, DI + MOVL R14, DX + ANDL R15, BX + RORL $0x0d, DX + MOVL R14, CX + ANDL R8, CX + XORL DX, DI 
+ XORL CX, BX + MOVL R14, DX + MOVL R15, CX + RORL $0x16, DX + ANDL R14, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R13 + ADDL AX, R9 + ADDL AX, R13 + MOVL 164(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 112(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 144(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 108(BP), BX + ADDL BX, AX + MOVL AX, 172(BP) + ADDL AX, R12 + MOVL R9, AX + ADDL $0xc76c51a3, R12 + MOVL R9, CX + RORL $0x06, AX + MOVL R9, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R9, CX + RORL $0x19, DX + ANDL R10, CX + XORL AX, DX + MOVL R9, AX + NOTL AX + ADDL DX, R12 + ANDL R11, AX + XORL CX, AX + ADDL R12, AX + MOVL R13, DI + MOVL R15, BX + RORL $0x02, DI + MOVL R13, DX + ANDL R14, BX + RORL $0x0d, DX + MOVL R13, CX + ANDL R15, CX + XORL DX, DI + XORL CX, BX + MOVL R13, DX + MOVL R14, CX + RORL $0x16, DX + ANDL R13, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R12 + ADDL AX, R8 + ADDL AX, R12 + MOVL 168(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 116(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 148(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 112(BP), BX + ADDL BX, AX + MOVL AX, 176(BP) + ADDL AX, R11 + MOVL R8, AX + ADDL $0xd192e819, R11 + MOVL R8, CX + RORL $0x06, AX + MOVL R8, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R8, CX + RORL $0x19, DX + ANDL R9, CX + XORL AX, DX + MOVL R8, AX + NOTL AX + ADDL DX, R11 + ANDL R10, AX + XORL CX, AX + ADDL R11, AX + MOVL R12, DI + MOVL R14, BX + RORL $0x02, DI + MOVL R12, DX + ANDL R13, BX + RORL $0x0d, DX + MOVL R12, CX + ANDL R14, CX + XORL DX, DI + XORL CX, BX + MOVL R12, DX + MOVL R13, CX + RORL $0x16, DX + ANDL R12, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R11 + ADDL AX, R15 + ADDL AX, R11 + MOVL 172(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 120(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 152(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 116(BP), BX + ADDL BX, AX + MOVL AX, 180(BP) + ADDL AX, R10 + MOVL R15, AX + ADDL $0xd6990624, R10 + MOVL R15, CX + RORL $0x06, AX + MOVL R15, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R15, CX + RORL $0x19, DX + ANDL R8, CX + XORL AX, DX + MOVL R15, AX + NOTL AX + ADDL DX, R10 + ANDL R9, AX + XORL CX, AX + ADDL R10, AX + MOVL R11, DI + MOVL R13, BX + RORL $0x02, DI + MOVL R11, DX + ANDL R12, BX + RORL $0x0d, DX + MOVL R11, CX + ANDL R13, CX + XORL DX, DI + XORL CX, BX + MOVL R11, DX + MOVL R12, CX + RORL $0x16, DX + ANDL R11, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R10 + ADDL AX, R14 + ADDL AX, R10 + MOVL 176(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 124(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 156(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 120(BP), BX + ADDL BX, AX + MOVL AX, 184(BP) + ADDL AX, R9 + MOVL R14, AX + ADDL $0xf40e3585, R9 + MOVL R14, CX + RORL $0x06, AX + MOVL R14, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R14, CX + RORL $0x19, DX + ANDL R15, CX + XORL AX, DX + MOVL R14, AX + NOTL AX + ADDL DX, R9 + ANDL R8, AX + XORL CX, AX + ADDL R9, AX + MOVL R10, DI + MOVL R12, BX + RORL $0x02, DI + MOVL R10, DX + ANDL R11, BX + RORL $0x0d, DX + MOVL R10, CX 
+ ANDL R12, CX + XORL DX, DI + XORL CX, BX + MOVL R10, DX + MOVL R11, CX + RORL $0x16, DX + ANDL R10, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R9 + ADDL AX, R13 + ADDL AX, R9 + MOVL 180(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 128(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 160(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 124(BP), BX + ADDL BX, AX + MOVL AX, 188(BP) + ADDL AX, R8 + MOVL R13, AX + ADDL $0x106aa070, R8 + MOVL R13, CX + RORL $0x06, AX + MOVL R13, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R13, CX + RORL $0x19, DX + ANDL R14, CX + XORL AX, DX + MOVL R13, AX + NOTL AX + ADDL DX, R8 + ANDL R15, AX + XORL CX, AX + ADDL R8, AX + MOVL R9, DI + MOVL R11, BX + RORL $0x02, DI + MOVL R9, DX + ANDL R10, BX + RORL $0x0d, DX + MOVL R9, CX + ANDL R11, CX + XORL DX, DI + XORL CX, BX + MOVL R9, DX + MOVL R10, CX + RORL $0x16, DX + ANDL R9, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R8 + ADDL AX, R12 + ADDL AX, R8 + MOVL 184(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 132(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 164(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 128(BP), BX + ADDL BX, AX + MOVL AX, 192(BP) + ADDL AX, R15 + MOVL R12, AX + ADDL $0x19a4c116, R15 + MOVL R12, CX + RORL $0x06, AX + MOVL R12, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R12, CX + RORL $0x19, DX + ANDL R13, CX + XORL AX, DX + MOVL R12, AX + NOTL AX + ADDL DX, R15 + ANDL R14, AX + XORL CX, AX + ADDL R15, AX + MOVL R8, DI + MOVL R10, BX + RORL $0x02, DI + MOVL R8, DX + ANDL R9, BX + RORL $0x0d, DX + MOVL R8, CX + ANDL R10, CX + XORL DX, DI + XORL CX, BX + MOVL R8, DX + MOVL R9, CX + RORL $0x16, DX + ANDL R8, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R15 + ADDL AX, R11 + ADDL AX, R15 + MOVL 188(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 136(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 168(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 132(BP), BX + ADDL BX, AX + MOVL AX, 196(BP) + ADDL AX, R14 + MOVL R11, AX + ADDL $0x1e376c08, R14 + MOVL R11, CX + RORL $0x06, AX + MOVL R11, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R11, CX + RORL $0x19, DX + ANDL R12, CX + XORL AX, DX + MOVL R11, AX + NOTL AX + ADDL DX, R14 + ANDL R13, AX + XORL CX, AX + ADDL R14, AX + MOVL R15, DI + MOVL R9, BX + RORL $0x02, DI + MOVL R15, DX + ANDL R8, BX + RORL $0x0d, DX + MOVL R15, CX + ANDL R9, CX + XORL DX, DI + XORL CX, BX + MOVL R15, DX + MOVL R8, CX + RORL $0x16, DX + ANDL R15, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R14 + ADDL AX, R10 + ADDL AX, R14 + MOVL 192(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 140(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 172(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 136(BP), BX + ADDL BX, AX + MOVL AX, 200(BP) + ADDL AX, R13 + MOVL R10, AX + ADDL $0x2748774c, R13 + MOVL R10, CX + RORL $0x06, AX + MOVL R10, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R10, CX + RORL $0x19, DX + ANDL R11, CX + XORL AX, DX + MOVL R10, AX + NOTL AX + ADDL DX, R13 + ANDL R12, AX + XORL CX, AX + ADDL R13, AX + MOVL R14, DI + MOVL R8, BX + RORL $0x02, DI + MOVL R14, DX + ANDL R15, BX + RORL 
$0x0d, DX + MOVL R14, CX + ANDL R8, CX + XORL DX, DI + XORL CX, BX + MOVL R14, DX + MOVL R15, CX + RORL $0x16, DX + ANDL R14, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R13 + ADDL AX, R9 + ADDL AX, R13 + MOVL 196(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 144(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 176(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 140(BP), BX + ADDL BX, AX + MOVL AX, 204(BP) + ADDL AX, R12 + MOVL R9, AX + ADDL $0x34b0bcb5, R12 + MOVL R9, CX + RORL $0x06, AX + MOVL R9, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R9, CX + RORL $0x19, DX + ANDL R10, CX + XORL AX, DX + MOVL R9, AX + NOTL AX + ADDL DX, R12 + ANDL R11, AX + XORL CX, AX + ADDL R12, AX + MOVL R13, DI + MOVL R15, BX + RORL $0x02, DI + MOVL R13, DX + ANDL R14, BX + RORL $0x0d, DX + MOVL R13, CX + ANDL R15, CX + XORL DX, DI + XORL CX, BX + MOVL R13, DX + MOVL R14, CX + RORL $0x16, DX + ANDL R13, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R12 + ADDL AX, R8 + ADDL AX, R12 + MOVL 200(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 148(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 180(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 144(BP), BX + ADDL BX, AX + MOVL AX, 208(BP) + ADDL AX, R11 + MOVL R8, AX + ADDL $0x391c0cb3, R11 + MOVL R8, CX + RORL $0x06, AX + MOVL R8, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R8, CX + RORL $0x19, DX + ANDL R9, CX + XORL AX, DX + MOVL R8, AX + NOTL AX + ADDL DX, R11 + ANDL R10, AX + XORL CX, AX + ADDL R11, AX + MOVL R12, DI + MOVL R14, BX + RORL $0x02, DI + MOVL R12, DX + ANDL R13, BX + RORL $0x0d, DX + MOVL R12, CX + ANDL R14, CX + XORL DX, DI + XORL CX, BX + MOVL R12, DX + MOVL R13, CX + RORL $0x16, DX + ANDL R12, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R11 + ADDL AX, R15 + ADDL AX, R11 + MOVL 204(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 152(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 184(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 148(BP), BX + ADDL BX, AX + MOVL AX, 212(BP) + ADDL AX, R10 + MOVL R15, AX + ADDL $0x4ed8aa4a, R10 + MOVL R15, CX + RORL $0x06, AX + MOVL R15, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R15, CX + RORL $0x19, DX + ANDL R8, CX + XORL AX, DX + MOVL R15, AX + NOTL AX + ADDL DX, R10 + ANDL R9, AX + XORL CX, AX + ADDL R10, AX + MOVL R11, DI + MOVL R13, BX + RORL $0x02, DI + MOVL R11, DX + ANDL R12, BX + RORL $0x0d, DX + MOVL R11, CX + ANDL R13, CX + XORL DX, DI + XORL CX, BX + MOVL R11, DX + MOVL R12, CX + RORL $0x16, DX + ANDL R11, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R10 + ADDL AX, R14 + ADDL AX, R10 + MOVL 208(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 156(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 188(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 152(BP), BX + ADDL BX, AX + MOVL AX, 216(BP) + ADDL AX, R9 + MOVL R14, AX + ADDL $0x5b9cca4f, R9 + MOVL R14, CX + RORL $0x06, AX + MOVL R14, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R14, CX + RORL $0x19, DX + ANDL R15, CX + XORL AX, DX + MOVL R14, AX + NOTL AX + ADDL DX, R9 + ANDL R8, AX + XORL CX, AX + ADDL R9, AX + MOVL R10, DI + MOVL R12, BX + RORL $0x02, DI + MOVL 
R10, DX + ANDL R11, BX + RORL $0x0d, DX + MOVL R10, CX + ANDL R12, CX + XORL DX, DI + XORL CX, BX + MOVL R10, DX + MOVL R11, CX + RORL $0x16, DX + ANDL R10, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R9 + ADDL AX, R13 + ADDL AX, R9 + MOVL 212(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 160(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 192(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 156(BP), BX + ADDL BX, AX + MOVL AX, 220(BP) + ADDL AX, R8 + MOVL R13, AX + ADDL $0x682e6ff3, R8 + MOVL R13, CX + RORL $0x06, AX + MOVL R13, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R13, CX + RORL $0x19, DX + ANDL R14, CX + XORL AX, DX + MOVL R13, AX + NOTL AX + ADDL DX, R8 + ANDL R15, AX + XORL CX, AX + ADDL R8, AX + MOVL R9, DI + MOVL R11, BX + RORL $0x02, DI + MOVL R9, DX + ANDL R10, BX + RORL $0x0d, DX + MOVL R9, CX + ANDL R11, CX + XORL DX, DI + XORL CX, BX + MOVL R9, DX + MOVL R10, CX + RORL $0x16, DX + ANDL R9, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R8 + ADDL AX, R12 + ADDL AX, R8 + MOVL 216(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 164(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 196(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 160(BP), BX + ADDL BX, AX + MOVL AX, 224(BP) + ADDL AX, R15 + MOVL R12, AX + ADDL $0x748f82ee, R15 + MOVL R12, CX + RORL $0x06, AX + MOVL R12, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R12, CX + RORL $0x19, DX + ANDL R13, CX + XORL AX, DX + MOVL R12, AX + NOTL AX + ADDL DX, R15 + ANDL R14, AX + XORL CX, AX + ADDL R15, AX + MOVL R8, DI + MOVL R10, BX + RORL $0x02, DI + MOVL R8, DX + ANDL R9, BX + RORL $0x0d, DX + MOVL R8, CX + ANDL R10, CX + XORL DX, DI + XORL CX, BX + MOVL R8, DX + MOVL R9, CX + RORL $0x16, DX + ANDL R8, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R15 + ADDL AX, R11 + ADDL AX, R15 + MOVL 220(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 168(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 200(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 164(BP), BX + ADDL BX, AX + MOVL AX, 228(BP) + ADDL AX, R14 + MOVL R11, AX + ADDL $0x78a5636f, R14 + MOVL R11, CX + RORL $0x06, AX + MOVL R11, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R11, CX + RORL $0x19, DX + ANDL R12, CX + XORL AX, DX + MOVL R11, AX + NOTL AX + ADDL DX, R14 + ANDL R13, AX + XORL CX, AX + ADDL R14, AX + MOVL R15, DI + MOVL R9, BX + RORL $0x02, DI + MOVL R15, DX + ANDL R8, BX + RORL $0x0d, DX + MOVL R15, CX + ANDL R9, CX + XORL DX, DI + XORL CX, BX + MOVL R15, DX + MOVL R8, CX + RORL $0x16, DX + ANDL R15, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R14 + ADDL AX, R10 + ADDL AX, R14 + MOVL 224(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 172(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 204(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 168(BP), BX + ADDL BX, AX + MOVL AX, 232(BP) + ADDL AX, R13 + MOVL R10, AX + ADDL $0x84c87814, R13 + MOVL R10, CX + RORL $0x06, AX + MOVL R10, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R10, CX + RORL $0x19, DX + ANDL R11, CX + XORL AX, DX + MOVL R10, AX + NOTL AX + ADDL DX, R13 + ANDL R12, AX + XORL CX, AX + ADDL R13, AX + MOVL R14, DI + MOVL R8, BX 
+ RORL $0x02, DI + MOVL R14, DX + ANDL R15, BX + RORL $0x0d, DX + MOVL R14, CX + ANDL R8, CX + XORL DX, DI + XORL CX, BX + MOVL R14, DX + MOVL R15, CX + RORL $0x16, DX + ANDL R14, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R13 + ADDL AX, R9 + ADDL AX, R13 + MOVL 228(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 176(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 208(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 172(BP), BX + ADDL BX, AX + MOVL AX, 236(BP) + ADDL AX, R12 + MOVL R9, AX + ADDL $0x8cc70208, R12 + MOVL R9, CX + RORL $0x06, AX + MOVL R9, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R9, CX + RORL $0x19, DX + ANDL R10, CX + XORL AX, DX + MOVL R9, AX + NOTL AX + ADDL DX, R12 + ANDL R11, AX + XORL CX, AX + ADDL R12, AX + MOVL R13, DI + MOVL R15, BX + RORL $0x02, DI + MOVL R13, DX + ANDL R14, BX + RORL $0x0d, DX + MOVL R13, CX + ANDL R15, CX + XORL DX, DI + XORL CX, BX + MOVL R13, DX + MOVL R14, CX + RORL $0x16, DX + ANDL R13, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R12 + ADDL AX, R8 + ADDL AX, R12 + MOVL 232(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 180(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 212(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 176(BP), BX + ADDL BX, AX + MOVL AX, 240(BP) + ADDL AX, R11 + MOVL R8, AX + ADDL $0x90befffa, R11 + MOVL R8, CX + RORL $0x06, AX + MOVL R8, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R8, CX + RORL $0x19, DX + ANDL R9, CX + XORL AX, DX + MOVL R8, AX + NOTL AX + ADDL DX, R11 + ANDL R10, AX + XORL CX, AX + ADDL R11, AX + MOVL R12, DI + MOVL R14, BX + RORL $0x02, DI + MOVL R12, DX + ANDL R13, BX + RORL $0x0d, DX + MOVL R12, CX + ANDL R14, CX + XORL DX, DI + XORL CX, BX + MOVL R12, DX + MOVL R13, CX + RORL $0x16, DX + ANDL R12, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R11 + ADDL AX, R15 + ADDL AX, R11 + MOVL 236(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 184(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 216(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 180(BP), BX + ADDL BX, AX + MOVL AX, 244(BP) + ADDL AX, R10 + MOVL R15, AX + ADDL $0xa4506ceb, R10 + MOVL R15, CX + RORL $0x06, AX + MOVL R15, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R15, CX + RORL $0x19, DX + ANDL R8, CX + XORL AX, DX + MOVL R15, AX + NOTL AX + ADDL DX, R10 + ANDL R9, AX + XORL CX, AX + ADDL R10, AX + MOVL R11, DI + MOVL R13, BX + RORL $0x02, DI + MOVL R11, DX + ANDL R12, BX + RORL $0x0d, DX + MOVL R11, CX + ANDL R13, CX + XORL DX, DI + XORL CX, BX + MOVL R11, DX + MOVL R12, CX + RORL $0x16, DX + ANDL R11, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R10 + ADDL AX, R14 + ADDL AX, R10 + MOVL 240(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 188(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 220(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 184(BP), BX + ADDL BX, AX + MOVL AX, 248(BP) + ADDL AX, R9 + MOVL R14, AX + ADDL $0xbef9a3f7, R9 + MOVL R14, CX + RORL $0x06, AX + MOVL R14, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R14, CX + RORL $0x19, DX + ANDL R15, CX + XORL AX, DX + MOVL R14, AX + NOTL AX + ADDL DX, R9 + ANDL R8, AX + XORL CX, AX + ADDL R9, AX + 
MOVL R10, DI + MOVL R12, BX + RORL $0x02, DI + MOVL R10, DX + ANDL R11, BX + RORL $0x0d, DX + MOVL R10, CX + ANDL R12, CX + XORL DX, DI + XORL CX, BX + MOVL R10, DX + MOVL R11, CX + RORL $0x16, DX + ANDL R10, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R9 + ADDL AX, R13 + ADDL AX, R9 + MOVL 244(BP), AX + MOVL AX, CX + RORL $0x11, AX + MOVL CX, DX + RORL $0x13, CX + SHRL $0x0a, DX + MOVL 192(BP), BX + XORL CX, AX + MOVL BX, CX + XORL DX, AX + RORL $0x07, BX + MOVL CX, DX + SHRL $0x03, DX + RORL $0x12, CX + ADDL 224(BP), AX + XORL CX, BX + XORL DX, BX + ADDL 188(BP), BX + ADDL BX, AX + MOVL AX, 252(BP) + ADDL AX, R8 + MOVL R13, AX + ADDL $0xc67178f2, R8 + MOVL R13, CX + RORL $0x06, AX + MOVL R13, DX + RORL $0x0b, CX + XORL CX, AX + MOVL R13, CX + RORL $0x19, DX + ANDL R14, CX + XORL AX, DX + MOVL R13, AX + NOTL AX + ADDL DX, R8 + ANDL R15, AX + XORL CX, AX + ADDL R8, AX + MOVL R9, DI + MOVL R11, BX + RORL $0x02, DI + MOVL R9, DX + ANDL R10, BX + RORL $0x0d, DX + MOVL R9, CX + ANDL R11, CX + XORL DX, DI + XORL CX, BX + MOVL R9, DX + MOVL R10, CX + RORL $0x16, DX + ANDL R9, CX + XORL CX, BX + XORL DX, DI + ADDL DI, BX + MOVL BX, R8 + ADDL AX, R12 + ADDL AX, R8 + MOVQ dig+0(FP), BP + ADDL (BP), R8 + MOVL R8, (BP) + ADDL 4(BP), R9 + MOVL R9, 4(BP) + ADDL 8(BP), R10 + MOVL R10, 8(BP) + ADDL 12(BP), R11 + MOVL R11, 12(BP) + ADDL 16(BP), R12 + MOVL R12, 16(BP) + ADDL 20(BP), R13 + MOVL R13, 20(BP) + ADDL 24(BP), R14 + MOVL R14, 24(BP) + ADDL 28(BP), R15 + MOVL R15, 28(BP) + ADDQ $0x40, SI + CMPQ SI, 256(SP) + JB loop end: RET avx2: - MOVQ dig+0(FP), CTX // d.h[8] - MOVQ p_base+8(FP), INP - MOVQ p_len+16(FP), NUM_BYTES - - LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block - MOVQ NUM_BYTES, _INP_END(SP) - - CMPQ NUM_BYTES, INP + MOVQ dig+0(FP), SI + MOVQ p_base+8(FP), DI + MOVQ p_len+16(FP), DX + LEAQ -64(DI)(DX*1), DX + MOVQ DX, 512(SP) + CMPQ DX, DI JE avx2_only_one_block // Load initial digest - MOVL 0(CTX), a // a = H0 - MOVL 4(CTX), b // b = H1 - MOVL 8(CTX), c // c = H2 - MOVL 12(CTX), d // d = H3 - MOVL 16(CTX), e // e = H4 - MOVL 20(CTX), f // f = H5 - MOVL 24(CTX), g // g = H6 - MOVL 28(CTX), h // h = H7 - -avx2_loop0: // at each iteration works with one block (512 bit) - - VMOVDQU (0*32)(INP), XTMP0 - VMOVDQU (1*32)(INP), XTMP1 - VMOVDQU (2*32)(INP), XTMP2 - VMOVDQU (3*32)(INP), XTMP3 + MOVL (SI), AX + MOVL 4(SI), BX + MOVL 8(SI), CX + MOVL 12(SI), R8 + MOVL 16(SI), DX + MOVL 20(SI), R9 + MOVL 24(SI), R10 + MOVL 28(SI), R11 - VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK +avx2_loop0: + // at each iteration works with one block (512 bit) + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + VMOVDQU 64(DI), Y2 + VMOVDQU 96(DI), Y3 + VMOVDQU flip_mask<>+0(SB), Y13 // Apply Byte Flip Mask: LE -> BE - VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0 - VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1 - VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2 - VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3 + VPSHUFB Y13, Y0, Y0 + VPSHUFB Y13, Y1, Y1 + VPSHUFB Y13, Y2, Y2 + VPSHUFB Y13, Y3, Y3 // Transpose data into high/low parts - VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0 - VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4 - VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8 - VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12 - - MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants + VPERM2I128 $0x20, Y2, Y0, Y4 + VPERM2I128 $0x31, Y2, Y0, Y5 + VPERM2I128 $0x20, Y3, Y1, Y6 + VPERM2I128 $0x31, Y3, Y1, Y7 + LEAQ K256<>+0(SB), BP avx2_last_block_enter: - ADDQ $64, INP - 
MOVQ INP, _INP(SP) - XORQ SRND, SRND + ADDQ $0x40, DI + MOVQ DI, 520(SP) + XORQ SI, SI -avx2_loop1: // for w0 - w47 +avx2_loop1: // Do 4 rounds and scheduling - VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER - VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + VPADDD (BP)(SI*1), Y4, Y9 + VMOVDQU Y9, (SP)(SI*1) + MOVL AX, DI + RORXL $0x19, DX, R13 + RORXL $0x0b, DX, R14 + ADDL (SP)(SI*1), R11 + ORL CX, DI + VPALIGNR $0x04, Y6, Y7, Y0 + MOVL R9, R15 + RORXL $0x0d, AX, R12 + XORL R14, R13 + XORL R10, R15 + VPADDD Y4, Y0, Y0 + RORXL $0x06, DX, R14 + ANDL DX, R15 + XORL R14, R13 + RORXL $0x16, AX, R14 + ADDL R11, R8 + ANDL BX, DI + VPALIGNR $0x04, Y4, Y5, Y1 + XORL R12, R14 + RORXL $0x02, AX, R12 + XORL R10, R15 + VPSRLD $0x07, Y1, Y2 + XORL R12, R14 + MOVL AX, R12 + ANDL CX, R12 + ADDL R13, R15 + VPSLLD $0x19, Y1, Y3 + ORL R12, DI + ADDL R14, R11 + ADDL R15, R8 + VPOR Y2, Y3, Y3 + VPSRLD $0x12, Y1, Y2 + ADDL R15, R11 + ADDL DI, R11 + MOVL R11, DI + RORXL $0x19, R8, R13 + RORXL $0x0b, R8, R14 + ADDL 4(SP)(SI*1), R10 + ORL BX, DI + VPSRLD $0x03, Y1, Y8 + MOVL DX, R15 + RORXL $0x0d, R11, R12 + XORL R14, R13 + XORL R9, R15 + RORXL $0x06, R8, R14 + XORL R14, R13 + RORXL $0x16, R11, R14 + ANDL R8, R15 + ADDL R10, CX + VPSLLD $0x0e, Y1, Y1 + ANDL AX, DI + XORL R12, R14 + VPXOR Y1, Y3, Y3 + RORXL $0x02, R11, R12 + XORL R9, R15 + VPXOR Y2, Y3, Y3 + XORL R12, R14 + MOVL R11, R12 + ANDL BX, R12 + ADDL R13, R15 + VPXOR Y8, Y3, Y1 + VPSHUFD $0xfa, Y7, Y2 + ORL R12, DI + ADDL R14, R10 + VPADDD Y1, Y0, Y0 + ADDL R15, CX + ADDL R15, R10 + ADDL DI, R10 + VPSRLD $0x0a, Y2, Y8 + MOVL R10, DI + RORXL $0x19, CX, R13 + ADDL 8(SP)(SI*1), R9 + VPSRLQ $0x13, Y2, Y3 + RORXL $0x0b, CX, R14 + ORL AX, DI + MOVL R8, R15 + XORL DX, R15 + RORXL $0x0d, R10, R12 + XORL R14, R13 + VPSRLQ $0x11, Y2, Y2 + ANDL CX, R15 + RORXL $0x06, CX, R14 + VPXOR Y3, Y2, Y2 + ADDL R9, BX + ANDL R11, DI + XORL R14, R13 + RORXL $0x16, R10, R14 + VPXOR Y2, Y8, Y8 + XORL DX, R15 + VPSHUFB shuff_00BA<>+0(SB), Y8, Y8 + XORL R12, R14 + RORXL $0x02, R10, R12 + VPADDD Y8, Y0, Y0 + XORL R12, R14 + MOVL R10, R12 + ANDL AX, R12 + ADDL R13, R15 + VPSHUFD $0x50, Y0, Y2 + ORL R12, DI + ADDL R14, R9 + ADDL R15, BX + ADDL R15, R9 + ADDL DI, R9 + MOVL R9, DI + RORXL $0x19, BX, R13 + RORXL $0x0b, BX, R14 + ADDL 12(SP)(SI*1), DX + ORL R11, DI + VPSRLD $0x0a, Y2, Y11 + MOVL CX, R15 + RORXL $0x0d, R9, R12 + XORL R14, R13 + XORL R8, R15 + VPSRLQ $0x13, Y2, Y3 + RORXL $0x06, BX, R14 + ANDL BX, R15 + ADDL DX, AX + ANDL R10, DI + VPSRLQ $0x11, Y2, Y2 + XORL R14, R13 + XORL R8, R15 + VPXOR Y3, Y2, Y2 + RORXL $0x16, R9, R14 + ADDL R13, R15 + VPXOR Y2, Y11, Y11 + XORL R12, R14 + ADDL R15, AX + RORXL $0x02, R9, R12 + VPSHUFB shuff_DC00<>+0(SB), Y11, Y11 + VPADDD Y0, Y11, Y4 + XORL R12, R14 + MOVL R9, R12 + ANDL R11, R12 + ORL R12, DI + ADDL R14, DX + ADDL R15, DX + ADDL DI, DX // Do 4 rounds and scheduling - VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER - VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - ROUND_AND_SCHED_N_2(_XFER + 
1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + VPADDD 32(BP)(SI*1), Y5, Y9 + VMOVDQU Y9, 32(SP)(SI*1) + MOVL DX, DI + RORXL $0x19, AX, R13 + RORXL $0x0b, AX, R14 + ADDL 32(SP)(SI*1), R8 + ORL R10, DI + VPALIGNR $0x04, Y7, Y4, Y0 + MOVL BX, R15 + RORXL $0x0d, DX, R12 + XORL R14, R13 + XORL CX, R15 + VPADDD Y5, Y0, Y0 + RORXL $0x06, AX, R14 + ANDL AX, R15 + XORL R14, R13 + RORXL $0x16, DX, R14 + ADDL R8, R11 + ANDL R9, DI + VPALIGNR $0x04, Y5, Y6, Y1 + XORL R12, R14 + RORXL $0x02, DX, R12 + XORL CX, R15 + VPSRLD $0x07, Y1, Y2 + XORL R12, R14 + MOVL DX, R12 + ANDL R10, R12 + ADDL R13, R15 + VPSLLD $0x19, Y1, Y3 + ORL R12, DI + ADDL R14, R8 + ADDL R15, R11 + VPOR Y2, Y3, Y3 + VPSRLD $0x12, Y1, Y2 + ADDL R15, R8 + ADDL DI, R8 + MOVL R8, DI + RORXL $0x19, R11, R13 + RORXL $0x0b, R11, R14 + ADDL 36(SP)(SI*1), CX + ORL R9, DI + VPSRLD $0x03, Y1, Y8 + MOVL AX, R15 + RORXL $0x0d, R8, R12 + XORL R14, R13 + XORL BX, R15 + RORXL $0x06, R11, R14 + XORL R14, R13 + RORXL $0x16, R8, R14 + ANDL R11, R15 + ADDL CX, R10 + VPSLLD $0x0e, Y1, Y1 + ANDL DX, DI + XORL R12, R14 + VPXOR Y1, Y3, Y3 + RORXL $0x02, R8, R12 + XORL BX, R15 + VPXOR Y2, Y3, Y3 + XORL R12, R14 + MOVL R8, R12 + ANDL R9, R12 + ADDL R13, R15 + VPXOR Y8, Y3, Y1 + VPSHUFD $0xfa, Y4, Y2 + ORL R12, DI + ADDL R14, CX + VPADDD Y1, Y0, Y0 + ADDL R15, R10 + ADDL R15, CX + ADDL DI, CX + VPSRLD $0x0a, Y2, Y8 + MOVL CX, DI + RORXL $0x19, R10, R13 + ADDL 40(SP)(SI*1), BX + VPSRLQ $0x13, Y2, Y3 + RORXL $0x0b, R10, R14 + ORL DX, DI + MOVL R11, R15 + XORL AX, R15 + RORXL $0x0d, CX, R12 + XORL R14, R13 + VPSRLQ $0x11, Y2, Y2 + ANDL R10, R15 + RORXL $0x06, R10, R14 + VPXOR Y3, Y2, Y2 + ADDL BX, R9 + ANDL R8, DI + XORL R14, R13 + RORXL $0x16, CX, R14 + VPXOR Y2, Y8, Y8 + XORL AX, R15 + VPSHUFB shuff_00BA<>+0(SB), Y8, Y8 + XORL R12, R14 + RORXL $0x02, CX, R12 + VPADDD Y8, Y0, Y0 + XORL R12, R14 + MOVL CX, R12 + ANDL DX, R12 + ADDL R13, R15 + VPSHUFD $0x50, Y0, Y2 + ORL R12, DI + ADDL R14, BX + ADDL R15, R9 + ADDL R15, BX + ADDL DI, BX + MOVL BX, DI + RORXL $0x19, R9, R13 + RORXL $0x0b, R9, R14 + ADDL 44(SP)(SI*1), AX + ORL R8, DI + VPSRLD $0x0a, Y2, Y11 + MOVL R10, R15 + RORXL $0x0d, BX, R12 + XORL R14, R13 + XORL R11, R15 + VPSRLQ $0x13, Y2, Y3 + RORXL $0x06, R9, R14 + ANDL R9, R15 + ADDL AX, DX + ANDL CX, DI + VPSRLQ $0x11, Y2, Y2 + XORL R14, R13 + XORL R11, R15 + VPXOR Y3, Y2, Y2 + RORXL $0x16, BX, R14 + ADDL R13, R15 + VPXOR Y2, Y11, Y11 + XORL R12, R14 + ADDL R15, DX + RORXL $0x02, BX, R12 + VPSHUFB shuff_DC00<>+0(SB), Y11, Y11 + VPADDD Y0, Y11, Y5 + XORL R12, R14 + MOVL BX, R12 + ANDL R8, R12 + ORL R12, DI + ADDL R14, AX + ADDL R15, AX + ADDL DI, AX // Do 4 rounds and scheduling - VPADDD 2*32(TBL)(SRND*1), XDWORD2, XFER - VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + VPADDD 64(BP)(SI*1), Y6, Y9 + VMOVDQU Y9, 64(SP)(SI*1) + MOVL AX, DI + RORXL $0x19, DX, R13 + RORXL $0x0b, DX, R14 + ADDL 64(SP)(SI*1), R11 + ORL CX, DI + VPALIGNR $0x04, Y4, Y5, Y0 + MOVL R9, R15 + RORXL $0x0d, AX, R12 + XORL R14, R13 + XORL R10, R15 + VPADDD Y6, Y0, Y0 + RORXL $0x06, DX, R14 + ANDL DX, 
R15 + XORL R14, R13 + RORXL $0x16, AX, R14 + ADDL R11, R8 + ANDL BX, DI + VPALIGNR $0x04, Y6, Y7, Y1 + XORL R12, R14 + RORXL $0x02, AX, R12 + XORL R10, R15 + VPSRLD $0x07, Y1, Y2 + XORL R12, R14 + MOVL AX, R12 + ANDL CX, R12 + ADDL R13, R15 + VPSLLD $0x19, Y1, Y3 + ORL R12, DI + ADDL R14, R11 + ADDL R15, R8 + VPOR Y2, Y3, Y3 + VPSRLD $0x12, Y1, Y2 + ADDL R15, R11 + ADDL DI, R11 + MOVL R11, DI + RORXL $0x19, R8, R13 + RORXL $0x0b, R8, R14 + ADDL 68(SP)(SI*1), R10 + ORL BX, DI + VPSRLD $0x03, Y1, Y8 + MOVL DX, R15 + RORXL $0x0d, R11, R12 + XORL R14, R13 + XORL R9, R15 + RORXL $0x06, R8, R14 + XORL R14, R13 + RORXL $0x16, R11, R14 + ANDL R8, R15 + ADDL R10, CX + VPSLLD $0x0e, Y1, Y1 + ANDL AX, DI + XORL R12, R14 + VPXOR Y1, Y3, Y3 + RORXL $0x02, R11, R12 + XORL R9, R15 + VPXOR Y2, Y3, Y3 + XORL R12, R14 + MOVL R11, R12 + ANDL BX, R12 + ADDL R13, R15 + VPXOR Y8, Y3, Y1 + VPSHUFD $0xfa, Y5, Y2 + ORL R12, DI + ADDL R14, R10 + VPADDD Y1, Y0, Y0 + ADDL R15, CX + ADDL R15, R10 + ADDL DI, R10 + VPSRLD $0x0a, Y2, Y8 + MOVL R10, DI + RORXL $0x19, CX, R13 + ADDL 72(SP)(SI*1), R9 + VPSRLQ $0x13, Y2, Y3 + RORXL $0x0b, CX, R14 + ORL AX, DI + MOVL R8, R15 + XORL DX, R15 + RORXL $0x0d, R10, R12 + XORL R14, R13 + VPSRLQ $0x11, Y2, Y2 + ANDL CX, R15 + RORXL $0x06, CX, R14 + VPXOR Y3, Y2, Y2 + ADDL R9, BX + ANDL R11, DI + XORL R14, R13 + RORXL $0x16, R10, R14 + VPXOR Y2, Y8, Y8 + XORL DX, R15 + VPSHUFB shuff_00BA<>+0(SB), Y8, Y8 + XORL R12, R14 + RORXL $0x02, R10, R12 + VPADDD Y8, Y0, Y0 + XORL R12, R14 + MOVL R10, R12 + ANDL AX, R12 + ADDL R13, R15 + VPSHUFD $0x50, Y0, Y2 + ORL R12, DI + ADDL R14, R9 + ADDL R15, BX + ADDL R15, R9 + ADDL DI, R9 + MOVL R9, DI + RORXL $0x19, BX, R13 + RORXL $0x0b, BX, R14 + ADDL 76(SP)(SI*1), DX + ORL R11, DI + VPSRLD $0x0a, Y2, Y11 + MOVL CX, R15 + RORXL $0x0d, R9, R12 + XORL R14, R13 + XORL R8, R15 + VPSRLQ $0x13, Y2, Y3 + RORXL $0x06, BX, R14 + ANDL BX, R15 + ADDL DX, AX + ANDL R10, DI + VPSRLQ $0x11, Y2, Y2 + XORL R14, R13 + XORL R8, R15 + VPXOR Y3, Y2, Y2 + RORXL $0x16, R9, R14 + ADDL R13, R15 + VPXOR Y2, Y11, Y11 + XORL R12, R14 + ADDL R15, AX + RORXL $0x02, R9, R12 + VPSHUFB shuff_DC00<>+0(SB), Y11, Y11 + VPADDD Y0, Y11, Y6 + XORL R12, R14 + MOVL R9, R12 + ANDL R11, R12 + ORL R12, DI + ADDL R14, DX + ADDL R15, DX + ADDL DI, DX // Do 4 rounds and scheduling - VPADDD 3*32(TBL)(SRND*1), XDWORD3, XFER - VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1) - ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - - ADDQ $4*32, SRND - CMPQ SRND, $3*4*32 - JB avx2_loop1 + VPADDD 96(BP)(SI*1), Y7, Y9 + VMOVDQU Y9, 96(SP)(SI*1) + MOVL DX, DI + RORXL $0x19, AX, R13 + RORXL $0x0b, AX, R14 + ADDL 96(SP)(SI*1), R8 + ORL R10, DI + VPALIGNR $0x04, Y5, Y6, Y0 + MOVL BX, R15 + RORXL $0x0d, DX, R12 + XORL R14, R13 + XORL CX, R15 + VPADDD Y7, Y0, Y0 + RORXL $0x06, AX, R14 + ANDL AX, R15 + XORL R14, R13 + RORXL $0x16, DX, R14 + ADDL R8, R11 + ANDL R9, DI + VPALIGNR $0x04, Y7, Y4, Y1 + XORL R12, R14 + RORXL $0x02, DX, R12 + XORL CX, R15 + VPSRLD $0x07, Y1, Y2 + XORL R12, R14 + MOVL DX, R12 + ANDL R10, R12 + ADDL R13, R15 + VPSLLD $0x19, Y1, Y3 + ORL R12, DI + ADDL R14, R8 + ADDL R15, R11 + VPOR Y2, Y3, Y3 + VPSRLD $0x12, Y1, Y2 + ADDL R15, R8 + ADDL DI, R8 + MOVL R8, DI + RORXL 
$0x19, R11, R13 + RORXL $0x0b, R11, R14 + ADDL 100(SP)(SI*1), CX + ORL R9, DI + VPSRLD $0x03, Y1, Y8 + MOVL AX, R15 + RORXL $0x0d, R8, R12 + XORL R14, R13 + XORL BX, R15 + RORXL $0x06, R11, R14 + XORL R14, R13 + RORXL $0x16, R8, R14 + ANDL R11, R15 + ADDL CX, R10 + VPSLLD $0x0e, Y1, Y1 + ANDL DX, DI + XORL R12, R14 + VPXOR Y1, Y3, Y3 + RORXL $0x02, R8, R12 + XORL BX, R15 + VPXOR Y2, Y3, Y3 + XORL R12, R14 + MOVL R8, R12 + ANDL R9, R12 + ADDL R13, R15 + VPXOR Y8, Y3, Y1 + VPSHUFD $0xfa, Y6, Y2 + ORL R12, DI + ADDL R14, CX + VPADDD Y1, Y0, Y0 + ADDL R15, R10 + ADDL R15, CX + ADDL DI, CX + VPSRLD $0x0a, Y2, Y8 + MOVL CX, DI + RORXL $0x19, R10, R13 + ADDL 104(SP)(SI*1), BX + VPSRLQ $0x13, Y2, Y3 + RORXL $0x0b, R10, R14 + ORL DX, DI + MOVL R11, R15 + XORL AX, R15 + RORXL $0x0d, CX, R12 + XORL R14, R13 + VPSRLQ $0x11, Y2, Y2 + ANDL R10, R15 + RORXL $0x06, R10, R14 + VPXOR Y3, Y2, Y2 + ADDL BX, R9 + ANDL R8, DI + XORL R14, R13 + RORXL $0x16, CX, R14 + VPXOR Y2, Y8, Y8 + XORL AX, R15 + VPSHUFB shuff_00BA<>+0(SB), Y8, Y8 + XORL R12, R14 + RORXL $0x02, CX, R12 + VPADDD Y8, Y0, Y0 + XORL R12, R14 + MOVL CX, R12 + ANDL DX, R12 + ADDL R13, R15 + VPSHUFD $0x50, Y0, Y2 + ORL R12, DI + ADDL R14, BX + ADDL R15, R9 + ADDL R15, BX + ADDL DI, BX + MOVL BX, DI + RORXL $0x19, R9, R13 + RORXL $0x0b, R9, R14 + ADDL 108(SP)(SI*1), AX + ORL R8, DI + VPSRLD $0x0a, Y2, Y11 + MOVL R10, R15 + RORXL $0x0d, BX, R12 + XORL R14, R13 + XORL R11, R15 + VPSRLQ $0x13, Y2, Y3 + RORXL $0x06, R9, R14 + ANDL R9, R15 + ADDL AX, DX + ANDL CX, DI + VPSRLQ $0x11, Y2, Y2 + XORL R14, R13 + XORL R11, R15 + VPXOR Y3, Y2, Y2 + RORXL $0x16, BX, R14 + ADDL R13, R15 + VPXOR Y2, Y11, Y11 + XORL R12, R14 + ADDL R15, DX + RORXL $0x02, BX, R12 + VPSHUFB shuff_DC00<>+0(SB), Y11, Y11 + VPADDD Y0, Y11, Y7 + XORL R12, R14 + MOVL BX, R12 + ANDL R8, R12 + ORL R12, DI + ADDL R14, AX + ADDL R15, AX + ADDL DI, AX + ADDQ $0x80, SI + CMPQ SI, $0x00000180 + JB avx2_loop1 avx2_loop2: - // w48 - w63 processed with no scheduling (last 16 rounds) - VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER - VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1) - DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h) - DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h) - DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g) - DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f) - - VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER - VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1) - DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e) - DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d) - DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c) - DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b) - - ADDQ $2*32, SRND - - VMOVDQU XDWORD2, XDWORD0 - VMOVDQU XDWORD3, XDWORD1 - - CMPQ SRND, $4*4*32 - JB avx2_loop2 - - MOVQ dig+0(FP), CTX // d.h[8] - MOVQ _INP(SP), INP - - addm( 0(CTX), a) - addm( 4(CTX), b) - addm( 8(CTX), c) - addm( 12(CTX), d) - addm( 16(CTX), e) - addm( 20(CTX), f) - addm( 24(CTX), g) - addm( 28(CTX), h) - - CMPQ _INP_END(SP), INP - JB done_hash - - XORQ SRND, SRND - -avx2_loop3: // Do second block using previously scheduled results - DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a) - DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h) - DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g) - DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f) - - DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e) - DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d) - DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c) - DO_ROUND_N_3(_XFER + 1*32 + 16, 
b, c, d, e, f, g, h, a, b) - - ADDQ $2*32, SRND - CMPQ SRND, $4*4*32 - JB avx2_loop3 - - MOVQ dig+0(FP), CTX // d.h[8] - MOVQ _INP(SP), INP - ADDQ $64, INP - - addm( 0(CTX), a) - addm( 4(CTX), b) - addm( 8(CTX), c) - addm( 12(CTX), d) - addm( 16(CTX), e) - addm( 20(CTX), f) - addm( 24(CTX), g) - addm( 28(CTX), h) + VPADDD (BP)(SI*1), Y4, Y9 + VMOVDQU Y9, (SP)(SI*1) + MOVL R9, R15 + RORXL $0x19, DX, R13 + RORXL $0x0b, DX, R14 + XORL R10, R15 + XORL R14, R13 + RORXL $0x06, DX, R14 + ANDL DX, R15 + XORL R14, R13 + RORXL $0x0d, AX, R12 + XORL R10, R15 + RORXL $0x16, AX, R14 + MOVL AX, DI + XORL R12, R14 + RORXL $0x02, AX, R12 + ADDL (SP)(SI*1), R11 + ORL CX, DI + XORL R12, R14 + MOVL AX, R12 + ANDL BX, DI + ANDL CX, R12 + ADDL R13, R15 + ADDL R11, R8 + ORL R12, DI + ADDL R14, R11 + ADDL R15, R8 + ADDL R15, R11 + MOVL DX, R15 + RORXL $0x19, R8, R13 + RORXL $0x0b, R8, R14 + XORL R9, R15 + XORL R14, R13 + RORXL $0x06, R8, R14 + ANDL R8, R15 + ADDL DI, R11 + XORL R14, R13 + RORXL $0x0d, R11, R12 + XORL R9, R15 + RORXL $0x16, R11, R14 + MOVL R11, DI + XORL R12, R14 + RORXL $0x02, R11, R12 + ADDL 4(SP)(SI*1), R10 + ORL BX, DI + XORL R12, R14 + MOVL R11, R12 + ANDL AX, DI + ANDL BX, R12 + ADDL R13, R15 + ADDL R10, CX + ORL R12, DI + ADDL R14, R10 + ADDL R15, CX + ADDL R15, R10 + MOVL R8, R15 + RORXL $0x19, CX, R13 + RORXL $0x0b, CX, R14 + XORL DX, R15 + XORL R14, R13 + RORXL $0x06, CX, R14 + ANDL CX, R15 + ADDL DI, R10 + XORL R14, R13 + RORXL $0x0d, R10, R12 + XORL DX, R15 + RORXL $0x16, R10, R14 + MOVL R10, DI + XORL R12, R14 + RORXL $0x02, R10, R12 + ADDL 8(SP)(SI*1), R9 + ORL AX, DI + XORL R12, R14 + MOVL R10, R12 + ANDL R11, DI + ANDL AX, R12 + ADDL R13, R15 + ADDL R9, BX + ORL R12, DI + ADDL R14, R9 + ADDL R15, BX + ADDL R15, R9 + MOVL CX, R15 + RORXL $0x19, BX, R13 + RORXL $0x0b, BX, R14 + XORL R8, R15 + XORL R14, R13 + RORXL $0x06, BX, R14 + ANDL BX, R15 + ADDL DI, R9 + XORL R14, R13 + RORXL $0x0d, R9, R12 + XORL R8, R15 + RORXL $0x16, R9, R14 + MOVL R9, DI + XORL R12, R14 + RORXL $0x02, R9, R12 + ADDL 12(SP)(SI*1), DX + ORL R11, DI + XORL R12, R14 + MOVL R9, R12 + ANDL R10, DI + ANDL R11, R12 + ADDL R13, R15 + ADDL DX, AX + ORL R12, DI + ADDL R14, DX + ADDL R15, AX + ADDL R15, DX + ADDL DI, DX + VPADDD 32(BP)(SI*1), Y5, Y9 + VMOVDQU Y9, 32(SP)(SI*1) + MOVL BX, R15 + RORXL $0x19, AX, R13 + RORXL $0x0b, AX, R14 + XORL CX, R15 + XORL R14, R13 + RORXL $0x06, AX, R14 + ANDL AX, R15 + XORL R14, R13 + RORXL $0x0d, DX, R12 + XORL CX, R15 + RORXL $0x16, DX, R14 + MOVL DX, DI + XORL R12, R14 + RORXL $0x02, DX, R12 + ADDL 32(SP)(SI*1), R8 + ORL R10, DI + XORL R12, R14 + MOVL DX, R12 + ANDL R9, DI + ANDL R10, R12 + ADDL R13, R15 + ADDL R8, R11 + ORL R12, DI + ADDL R14, R8 + ADDL R15, R11 + ADDL R15, R8 + MOVL AX, R15 + RORXL $0x19, R11, R13 + RORXL $0x0b, R11, R14 + XORL BX, R15 + XORL R14, R13 + RORXL $0x06, R11, R14 + ANDL R11, R15 + ADDL DI, R8 + XORL R14, R13 + RORXL $0x0d, R8, R12 + XORL BX, R15 + RORXL $0x16, R8, R14 + MOVL R8, DI + XORL R12, R14 + RORXL $0x02, R8, R12 + ADDL 36(SP)(SI*1), CX + ORL R9, DI + XORL R12, R14 + MOVL R8, R12 + ANDL DX, DI + ANDL R9, R12 + ADDL R13, R15 + ADDL CX, R10 + ORL R12, DI + ADDL R14, CX + ADDL R15, R10 + ADDL R15, CX + MOVL R11, R15 + RORXL $0x19, R10, R13 + RORXL $0x0b, R10, R14 + XORL AX, R15 + XORL R14, R13 + RORXL $0x06, R10, R14 + ANDL R10, R15 + ADDL DI, CX + XORL R14, R13 + RORXL $0x0d, CX, R12 + XORL AX, R15 + RORXL $0x16, CX, R14 + MOVL CX, DI + XORL R12, R14 + RORXL $0x02, CX, R12 + ADDL 40(SP)(SI*1), BX + ORL DX, DI + XORL R12, R14 + MOVL CX, R12 + 
ANDL R8, DI + ANDL DX, R12 + ADDL R13, R15 + ADDL BX, R9 + ORL R12, DI + ADDL R14, BX + ADDL R15, R9 + ADDL R15, BX + MOVL R10, R15 + RORXL $0x19, R9, R13 + RORXL $0x0b, R9, R14 + XORL R11, R15 + XORL R14, R13 + RORXL $0x06, R9, R14 + ANDL R9, R15 + ADDL DI, BX + XORL R14, R13 + RORXL $0x0d, BX, R12 + XORL R11, R15 + RORXL $0x16, BX, R14 + MOVL BX, DI + XORL R12, R14 + RORXL $0x02, BX, R12 + ADDL 44(SP)(SI*1), AX + ORL R8, DI + XORL R12, R14 + MOVL BX, R12 + ANDL CX, DI + ANDL R8, R12 + ADDL R13, R15 + ADDL AX, DX + ORL R12, DI + ADDL R14, AX + ADDL R15, DX + ADDL R15, AX + ADDL DI, AX + ADDQ $0x40, SI + VMOVDQU Y6, Y4 + VMOVDQU Y7, Y5 + CMPQ SI, $0x00000200 + JB avx2_loop2 + MOVQ dig+0(FP), SI + MOVQ 520(SP), DI + ADDL AX, (SI) + MOVL (SI), AX + ADDL BX, 4(SI) + MOVL 4(SI), BX + ADDL CX, 8(SI) + MOVL 8(SI), CX + ADDL R8, 12(SI) + MOVL 12(SI), R8 + ADDL DX, 16(SI) + MOVL 16(SI), DX + ADDL R9, 20(SI) + MOVL 20(SI), R9 + ADDL R10, 24(SI) + MOVL 24(SI), R10 + ADDL R11, 28(SI) + MOVL 28(SI), R11 + CMPQ 512(SP), DI + JB done_hash + XORQ SI, SI - CMPQ _INP_END(SP), INP - JA avx2_loop0 - JB done_hash +avx2_loop3: + MOVL R9, R15 + RORXL $0x19, DX, R13 + RORXL $0x0b, DX, R14 + XORL R10, R15 + XORL R14, R13 + RORXL $0x06, DX, R14 + ANDL DX, R15 + XORL R14, R13 + RORXL $0x0d, AX, R12 + XORL R10, R15 + RORXL $0x16, AX, R14 + MOVL AX, DI + XORL R12, R14 + RORXL $0x02, AX, R12 + ADDL 16(SP)(SI*1), R11 + ORL CX, DI + XORL R12, R14 + MOVL AX, R12 + ANDL BX, DI + ANDL CX, R12 + ADDL R13, R15 + ADDL R11, R8 + ORL R12, DI + ADDL R14, R11 + ADDL R15, R8 + ADDL R15, R11 + MOVL DX, R15 + RORXL $0x19, R8, R13 + RORXL $0x0b, R8, R14 + XORL R9, R15 + XORL R14, R13 + RORXL $0x06, R8, R14 + ANDL R8, R15 + ADDL DI, R11 + XORL R14, R13 + RORXL $0x0d, R11, R12 + XORL R9, R15 + RORXL $0x16, R11, R14 + MOVL R11, DI + XORL R12, R14 + RORXL $0x02, R11, R12 + ADDL 20(SP)(SI*1), R10 + ORL BX, DI + XORL R12, R14 + MOVL R11, R12 + ANDL AX, DI + ANDL BX, R12 + ADDL R13, R15 + ADDL R10, CX + ORL R12, DI + ADDL R14, R10 + ADDL R15, CX + ADDL R15, R10 + MOVL R8, R15 + RORXL $0x19, CX, R13 + RORXL $0x0b, CX, R14 + XORL DX, R15 + XORL R14, R13 + RORXL $0x06, CX, R14 + ANDL CX, R15 + ADDL DI, R10 + XORL R14, R13 + RORXL $0x0d, R10, R12 + XORL DX, R15 + RORXL $0x16, R10, R14 + MOVL R10, DI + XORL R12, R14 + RORXL $0x02, R10, R12 + ADDL 24(SP)(SI*1), R9 + ORL AX, DI + XORL R12, R14 + MOVL R10, R12 + ANDL R11, DI + ANDL AX, R12 + ADDL R13, R15 + ADDL R9, BX + ORL R12, DI + ADDL R14, R9 + ADDL R15, BX + ADDL R15, R9 + MOVL CX, R15 + RORXL $0x19, BX, R13 + RORXL $0x0b, BX, R14 + XORL R8, R15 + XORL R14, R13 + RORXL $0x06, BX, R14 + ANDL BX, R15 + ADDL DI, R9 + XORL R14, R13 + RORXL $0x0d, R9, R12 + XORL R8, R15 + RORXL $0x16, R9, R14 + MOVL R9, DI + XORL R12, R14 + RORXL $0x02, R9, R12 + ADDL 28(SP)(SI*1), DX + ORL R11, DI + XORL R12, R14 + MOVL R9, R12 + ANDL R10, DI + ANDL R11, R12 + ADDL R13, R15 + ADDL DX, AX + ORL R12, DI + ADDL R14, DX + ADDL R15, AX + ADDL R15, DX + ADDL DI, DX + MOVL BX, R15 + RORXL $0x19, AX, R13 + RORXL $0x0b, AX, R14 + XORL CX, R15 + XORL R14, R13 + RORXL $0x06, AX, R14 + ANDL AX, R15 + XORL R14, R13 + RORXL $0x0d, DX, R12 + XORL CX, R15 + RORXL $0x16, DX, R14 + MOVL DX, DI + XORL R12, R14 + RORXL $0x02, DX, R12 + ADDL 48(SP)(SI*1), R8 + ORL R10, DI + XORL R12, R14 + MOVL DX, R12 + ANDL R9, DI + ANDL R10, R12 + ADDL R13, R15 + ADDL R8, R11 + ORL R12, DI + ADDL R14, R8 + ADDL R15, R11 + ADDL R15, R8 + MOVL AX, R15 + RORXL $0x19, R11, R13 + RORXL $0x0b, R11, R14 + XORL BX, R15 + XORL R14, R13 + RORXL $0x06, R11, 
R14 + ANDL R11, R15 + ADDL DI, R8 + XORL R14, R13 + RORXL $0x0d, R8, R12 + XORL BX, R15 + RORXL $0x16, R8, R14 + MOVL R8, DI + XORL R12, R14 + RORXL $0x02, R8, R12 + ADDL 52(SP)(SI*1), CX + ORL R9, DI + XORL R12, R14 + MOVL R8, R12 + ANDL DX, DI + ANDL R9, R12 + ADDL R13, R15 + ADDL CX, R10 + ORL R12, DI + ADDL R14, CX + ADDL R15, R10 + ADDL R15, CX + MOVL R11, R15 + RORXL $0x19, R10, R13 + RORXL $0x0b, R10, R14 + XORL AX, R15 + XORL R14, R13 + RORXL $0x06, R10, R14 + ANDL R10, R15 + ADDL DI, CX + XORL R14, R13 + RORXL $0x0d, CX, R12 + XORL AX, R15 + RORXL $0x16, CX, R14 + MOVL CX, DI + XORL R12, R14 + RORXL $0x02, CX, R12 + ADDL 56(SP)(SI*1), BX + ORL DX, DI + XORL R12, R14 + MOVL CX, R12 + ANDL R8, DI + ANDL DX, R12 + ADDL R13, R15 + ADDL BX, R9 + ORL R12, DI + ADDL R14, BX + ADDL R15, R9 + ADDL R15, BX + MOVL R10, R15 + RORXL $0x19, R9, R13 + RORXL $0x0b, R9, R14 + XORL R11, R15 + XORL R14, R13 + RORXL $0x06, R9, R14 + ANDL R9, R15 + ADDL DI, BX + XORL R14, R13 + RORXL $0x0d, BX, R12 + XORL R11, R15 + RORXL $0x16, BX, R14 + MOVL BX, DI + XORL R12, R14 + RORXL $0x02, BX, R12 + ADDL 60(SP)(SI*1), AX + ORL R8, DI + XORL R12, R14 + MOVL BX, R12 + ANDL CX, DI + ANDL R8, R12 + ADDL R13, R15 + ADDL AX, DX + ORL R12, DI + ADDL R14, AX + ADDL R15, DX + ADDL R15, AX + ADDL DI, AX + ADDQ $0x40, SI + CMPQ SI, $0x00000200 + JB avx2_loop3 + MOVQ dig+0(FP), SI + MOVQ 520(SP), DI + ADDQ $0x40, DI + ADDL AX, (SI) + MOVL (SI), AX + ADDL BX, 4(SI) + MOVL 4(SI), BX + ADDL CX, 8(SI) + MOVL 8(SI), CX + ADDL R8, 12(SI) + MOVL 12(SI), R8 + ADDL DX, 16(SI) + MOVL 16(SI), DX + ADDL R9, 20(SI) + MOVL 20(SI), R9 + ADDL R10, 24(SI) + MOVL 24(SI), R10 + ADDL R11, 28(SI) + MOVL 28(SI), R11 + CMPQ 512(SP), DI + JA avx2_loop0 + JB done_hash avx2_do_last_block: - - VMOVDQU 0(INP), XWORD0 - VMOVDQU 16(INP), XWORD1 - VMOVDQU 32(INP), XWORD2 - VMOVDQU 48(INP), XWORD3 - - VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK - - VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0 - VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1 - VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2 - VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3 - - MOVQ $K256<>(SB), TBL - - JMP avx2_last_block_enter + VMOVDQU (DI), X4 + VMOVDQU 16(DI), X5 + VMOVDQU 32(DI), X6 + VMOVDQU 48(DI), X7 + VMOVDQU flip_mask<>+0(SB), Y13 + VPSHUFB X13, X4, X4 + VPSHUFB X13, X5, X5 + VPSHUFB X13, X6, X6 + VPSHUFB X13, X7, X7 + LEAQ K256<>+0(SB), BP + JMP avx2_last_block_enter avx2_only_one_block: - // Load initial digest - MOVL 0(CTX), a // a = H0 - MOVL 4(CTX), b // b = H1 - MOVL 8(CTX), c // c = H2 - MOVL 12(CTX), d // d = H3 - MOVL 16(CTX), e // e = H4 - MOVL 20(CTX), f // f = H5 - MOVL 24(CTX), g // g = H6 - MOVL 28(CTX), h // h = H7 - - JMP avx2_do_last_block + MOVL (SI), AX + MOVL 4(SI), BX + MOVL 8(SI), CX + MOVL 12(SI), R8 + MOVL 16(SI), DX + MOVL 20(SI), R9 + MOVL 24(SI), R10 + MOVL 28(SI), R11 + JMP avx2_do_last_block done_hash: VZEROUPPER RET sha_ni: - MOVQ dig+0(FP), digestPtr // init digest hash vector H0, H1,..., H7 pointer - MOVQ p_base+8(FP), dataPtr // init input data base pointer - MOVQ p_len+16(FP), numBytes // get number of input bytes to hash - SHRQ $6, numBytes // force modulo 64 input buffer length - SHLQ $6, numBytes - CMPQ numBytes, $0 // exit early for zero-length input buffer - JEQ done - ADDQ dataPtr, numBytes // point numBytes to end of input buffer - VMOVDQU (0*16)(digestPtr), state0 // load initial hash values and reorder - VMOVDQU (1*16)(digestPtr), state1 // DCBA, HGFE -> ABEF, CDGH - PSHUFD $0xb1, state0, state0 // CDAB - PSHUFD $0x1b, state1, state1 // EFGH - VMOVDQA state0, m4 
- PALIGNR $8, state1, state0 // ABEF - PBLENDW $0xf0, m4, state1 // CDGH - VMOVDQA flip_mask<>(SB), shufMask - LEAQ K256<>(SB), sha256Constants + MOVQ dig+0(FP), DI + MOVQ p_base+8(FP), SI + MOVQ p_len+16(FP), DX + SHRQ $0x06, DX + SHLQ $0x06, DX + CMPQ DX, $0x00 + JEQ done + ADDQ SI, DX + VMOVDQU (DI), X1 + VMOVDQU 16(DI), X2 + PSHUFD $0xb1, X1, X1 + PSHUFD $0x1b, X2, X2 + VMOVDQA X1, X7 + PALIGNR $0x08, X2, X1 + PBLENDW $0xf0, X7, X2 + VMOVDQA flip_mask<>+0(SB), X8 + LEAQ K256<>+0(SB), AX roundLoop: // save hash values for addition after rounds - VMOVDQA state0, abefSave - VMOVDQA state1, cdghSave + VMOVDQA X1, X9 + VMOVDQA X2, X10 // do rounds 0-59 - rounds0to11 (m0,-,0,nop) // 0-3 - rounds0to11 (m1,m0,1,sha256msg1) // 4-7 - rounds0to11 (m2,m1,2,sha256msg1) // 8-11 - VMOVDQU (3*16)(dataPtr), msg - PSHUFB shufMask, msg - rounds12to59 (m3,3,m2,m0,sha256msg1,vmovrev) // 12-15 - rounds12to59 (m0,4,m3,m1,sha256msg1,vmov) // 16-19 - rounds12to59 (m1,5,m0,m2,sha256msg1,vmov) // 20-23 - rounds12to59 (m2,6,m1,m3,sha256msg1,vmov) // 24-27 - rounds12to59 (m3,7,m2,m0,sha256msg1,vmov) // 28-31 - rounds12to59 (m0,8,m3,m1,sha256msg1,vmov) // 32-35 - rounds12to59 (m1,9,m0,m2,sha256msg1,vmov) // 36-39 - rounds12to59 (m2,10,m1,m3,sha256msg1,vmov) // 40-43 - rounds12to59 (m3,11,m2,m0,sha256msg1,vmov) // 44-47 - rounds12to59 (m0,12,m3,m1,sha256msg1,vmov) // 48-51 - rounds12to59 (m1,13,m0,m2,nop,vmov) // 52-55 - rounds12to59 (m2,14,m1,m3,nop,vmov) // 56-59 + VMOVDQU (SI), X0 + PSHUFB X8, X0 + VMOVDQA X0, X3 + PADDD (AX), X0 + SHA256RNDS2 X0, X1, X2 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + VMOVDQU 16(SI), X0 + PSHUFB X8, X0 + VMOVDQA X0, X4 + PADDD 32(AX), X0 + SHA256RNDS2 X0, X1, X2 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + SHA256MSG1 X4, X3 + VMOVDQU 32(SI), X0 + PSHUFB X8, X0 + VMOVDQA X0, X5 + PADDD 64(AX), X0 + SHA256RNDS2 X0, X1, X2 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + SHA256MSG1 X5, X4 + VMOVDQU 48(SI), X0 + PSHUFB X8, X0 + VMOVDQA X0, X6 + PADDD 96(AX), X0 + SHA256RNDS2 X0, X1, X2 + VMOVDQA X6, X7 + PALIGNR $0x04, X5, X7 + PADDD X7, X3 + SHA256MSG2 X6, X3 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + SHA256MSG1 X6, X5 + VMOVDQA X3, X0 + PADDD 128(AX), X0 + SHA256RNDS2 X0, X1, X2 + VMOVDQA X3, X7 + PALIGNR $0x04, X6, X7 + PADDD X7, X4 + SHA256MSG2 X3, X4 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + SHA256MSG1 X3, X6 + VMOVDQA X4, X0 + PADDD 160(AX), X0 + SHA256RNDS2 X0, X1, X2 + VMOVDQA X4, X7 + PALIGNR $0x04, X3, X7 + PADDD X7, X5 + SHA256MSG2 X4, X5 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + SHA256MSG1 X4, X3 + VMOVDQA X5, X0 + PADDD 192(AX), X0 + SHA256RNDS2 X0, X1, X2 + VMOVDQA X5, X7 + PALIGNR $0x04, X4, X7 + PADDD X7, X6 + SHA256MSG2 X5, X6 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + SHA256MSG1 X5, X4 + VMOVDQA X6, X0 + PADDD 224(AX), X0 + SHA256RNDS2 X0, X1, X2 + VMOVDQA X6, X7 + PALIGNR $0x04, X5, X7 + PADDD X7, X3 + SHA256MSG2 X6, X3 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + SHA256MSG1 X6, X5 + VMOVDQA X3, X0 + PADDD 256(AX), X0 + SHA256RNDS2 X0, X1, X2 + VMOVDQA X3, X7 + PALIGNR $0x04, X6, X7 + PADDD X7, X4 + SHA256MSG2 X3, X4 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + SHA256MSG1 X3, X6 + VMOVDQA X4, X0 + PADDD 288(AX), X0 + SHA256RNDS2 X0, X1, X2 + VMOVDQA X4, X7 + PALIGNR $0x04, X3, X7 + PADDD X7, X5 + SHA256MSG2 X4, X5 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + SHA256MSG1 X4, X3 + VMOVDQA X5, X0 + PADDD 320(AX), X0 + SHA256RNDS2 X0, X1, X2 + VMOVDQA X5, X7 + PALIGNR $0x04, X4, X7 + PADDD X7, X6 + SHA256MSG2 X5, X6 + 
PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + SHA256MSG1 X5, X4 + VMOVDQA X6, X0 + PADDD 352(AX), X0 + SHA256RNDS2 X0, X1, X2 + VMOVDQA X6, X7 + PALIGNR $0x04, X5, X7 + PADDD X7, X3 + SHA256MSG2 X6, X3 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + SHA256MSG1 X6, X5 + VMOVDQA X3, X0 + PADDD 384(AX), X0 + SHA256RNDS2 X0, X1, X2 + VMOVDQA X3, X7 + PALIGNR $0x04, X6, X7 + PADDD X7, X4 + SHA256MSG2 X3, X4 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + SHA256MSG1 X3, X6 + VMOVDQA X4, X0 + PADDD 416(AX), X0 + SHA256RNDS2 X0, X1, X2 + VMOVDQA X4, X7 + PALIGNR $0x04, X3, X7 + PADDD X7, X5 + SHA256MSG2 X4, X5 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 + VMOVDQA X5, X0 + PADDD 448(AX), X0 + SHA256RNDS2 X0, X1, X2 + VMOVDQA X5, X7 + PALIGNR $0x04, X4, X7 + PADDD X7, X6 + SHA256MSG2 X5, X6 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 // do rounds 60-63 - VMOVDQA m3, msg - PADDD (15*32)(sha256Constants), msg - SHA256RNDS2 msg, state0, state1 - PSHUFD $0x0e, msg, msg - SHA256RNDS2 msg, state1, state0 + VMOVDQA X6, X0 + PADDD 480(AX), X0 + SHA256RNDS2 X0, X1, X2 + PSHUFD $0x0e, X0, X0 + SHA256RNDS2 X0, X2, X1 // add current hash values with previously saved - PADDD abefSave, state0 - PADDD cdghSave, state1 + PADDD X9, X1 + PADDD X10, X2 // advance data pointer; loop until buffer empty - ADDQ $64, dataPtr - CMPQ numBytes, dataPtr - JNE roundLoop + ADDQ $0x40, SI + CMPQ DX, SI + JNE roundLoop // write hash values back in the correct order - PSHUFD $0x1b, state0, state0 // FEBA - PSHUFD $0xb1, state1, state1 // DCHG - VMOVDQA state0, m4 - PBLENDW $0xf0, state1, state0 // DCBA - PALIGNR $8, m4, state1 // HGFE - VMOVDQU state0, (0*16)(digestPtr) - VMOVDQU state1, (1*16)(digestPtr) + PSHUFD $0x1b, X1, X1 + PSHUFD $0xb1, X2, X2 + VMOVDQA X1, X7 + PBLENDW $0xf0, X2, X1 + PALIGNR $0x08, X7, X2 + VMOVDQU X1, (DI) + VMOVDQU X2, 16(DI) done: RET -// shuffle byte order from LE to BE -DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203 -DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b -DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203 -DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b -GLOBL flip_mask<>(SB), 8, $32 - -// shuffle xBxA -> 00BA -DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100 -DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF -DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100 -DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF -GLOBL shuff_00BA<>(SB), 8, $32 - -// shuffle xDxC -> DC00 -DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF -DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100 -DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF -DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100 -GLOBL shuff_DC00<>(SB), 8, $32 - -// Round specific constants -DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1 -DATA K256<>+0x04(SB)/4, $0x71374491 // k2 -DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3 -DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4 -DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1 -DATA K256<>+0x14(SB)/4, $0x71374491 // k2 -DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3 -DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4 - -DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8 -DATA K256<>+0x24(SB)/4, $0x59f111f1 -DATA K256<>+0x28(SB)/4, $0x923f82a4 -DATA K256<>+0x2c(SB)/4, $0xab1c5ed5 -DATA K256<>+0x30(SB)/4, $0x3956c25b -DATA K256<>+0x34(SB)/4, $0x59f111f1 -DATA K256<>+0x38(SB)/4, $0x923f82a4 -DATA K256<>+0x3c(SB)/4, $0xab1c5ed5 - -DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12 -DATA K256<>+0x44(SB)/4, $0x12835b01 -DATA K256<>+0x48(SB)/4, $0x243185be -DATA K256<>+0x4c(SB)/4, $0x550c7dc3 -DATA K256<>+0x50(SB)/4, $0xd807aa98 -DATA 
K256<>+0x54(SB)/4, $0x12835b01 -DATA K256<>+0x58(SB)/4, $0x243185be -DATA K256<>+0x5c(SB)/4, $0x550c7dc3 - -DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16 -DATA K256<>+0x64(SB)/4, $0x80deb1fe -DATA K256<>+0x68(SB)/4, $0x9bdc06a7 -DATA K256<>+0x6c(SB)/4, $0xc19bf174 -DATA K256<>+0x70(SB)/4, $0x72be5d74 -DATA K256<>+0x74(SB)/4, $0x80deb1fe -DATA K256<>+0x78(SB)/4, $0x9bdc06a7 -DATA K256<>+0x7c(SB)/4, $0xc19bf174 - -DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20 -DATA K256<>+0x84(SB)/4, $0xefbe4786 -DATA K256<>+0x88(SB)/4, $0x0fc19dc6 -DATA K256<>+0x8c(SB)/4, $0x240ca1cc -DATA K256<>+0x90(SB)/4, $0xe49b69c1 -DATA K256<>+0x94(SB)/4, $0xefbe4786 -DATA K256<>+0x98(SB)/4, $0x0fc19dc6 -DATA K256<>+0x9c(SB)/4, $0x240ca1cc - -DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24 -DATA K256<>+0xa4(SB)/4, $0x4a7484aa -DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc -DATA K256<>+0xac(SB)/4, $0x76f988da -DATA K256<>+0xb0(SB)/4, $0x2de92c6f -DATA K256<>+0xb4(SB)/4, $0x4a7484aa -DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc -DATA K256<>+0xbc(SB)/4, $0x76f988da - -DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28 -DATA K256<>+0xc4(SB)/4, $0xa831c66d -DATA K256<>+0xc8(SB)/4, $0xb00327c8 -DATA K256<>+0xcc(SB)/4, $0xbf597fc7 -DATA K256<>+0xd0(SB)/4, $0x983e5152 -DATA K256<>+0xd4(SB)/4, $0xa831c66d -DATA K256<>+0xd8(SB)/4, $0xb00327c8 -DATA K256<>+0xdc(SB)/4, $0xbf597fc7 - -DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32 -DATA K256<>+0xe4(SB)/4, $0xd5a79147 -DATA K256<>+0xe8(SB)/4, $0x06ca6351 -DATA K256<>+0xec(SB)/4, $0x14292967 -DATA K256<>+0xf0(SB)/4, $0xc6e00bf3 -DATA K256<>+0xf4(SB)/4, $0xd5a79147 -DATA K256<>+0xf8(SB)/4, $0x06ca6351 -DATA K256<>+0xfc(SB)/4, $0x14292967 - -DATA K256<>+0x100(SB)/4, $0x27b70a85 -DATA K256<>+0x104(SB)/4, $0x2e1b2138 -DATA K256<>+0x108(SB)/4, $0x4d2c6dfc -DATA K256<>+0x10c(SB)/4, $0x53380d13 -DATA K256<>+0x110(SB)/4, $0x27b70a85 -DATA K256<>+0x114(SB)/4, $0x2e1b2138 -DATA K256<>+0x118(SB)/4, $0x4d2c6dfc -DATA K256<>+0x11c(SB)/4, $0x53380d13 - -DATA K256<>+0x120(SB)/4, $0x650a7354 -DATA K256<>+0x124(SB)/4, $0x766a0abb -DATA K256<>+0x128(SB)/4, $0x81c2c92e -DATA K256<>+0x12c(SB)/4, $0x92722c85 -DATA K256<>+0x130(SB)/4, $0x650a7354 -DATA K256<>+0x134(SB)/4, $0x766a0abb -DATA K256<>+0x138(SB)/4, $0x81c2c92e -DATA K256<>+0x13c(SB)/4, $0x92722c85 - -DATA K256<>+0x140(SB)/4, $0xa2bfe8a1 -DATA K256<>+0x144(SB)/4, $0xa81a664b -DATA K256<>+0x148(SB)/4, $0xc24b8b70 -DATA K256<>+0x14c(SB)/4, $0xc76c51a3 -DATA K256<>+0x150(SB)/4, $0xa2bfe8a1 -DATA K256<>+0x154(SB)/4, $0xa81a664b -DATA K256<>+0x158(SB)/4, $0xc24b8b70 -DATA K256<>+0x15c(SB)/4, $0xc76c51a3 - -DATA K256<>+0x160(SB)/4, $0xd192e819 -DATA K256<>+0x164(SB)/4, $0xd6990624 -DATA K256<>+0x168(SB)/4, $0xf40e3585 -DATA K256<>+0x16c(SB)/4, $0x106aa070 -DATA K256<>+0x170(SB)/4, $0xd192e819 -DATA K256<>+0x174(SB)/4, $0xd6990624 -DATA K256<>+0x178(SB)/4, $0xf40e3585 -DATA K256<>+0x17c(SB)/4, $0x106aa070 - -DATA K256<>+0x180(SB)/4, $0x19a4c116 -DATA K256<>+0x184(SB)/4, $0x1e376c08 -DATA K256<>+0x188(SB)/4, $0x2748774c -DATA K256<>+0x18c(SB)/4, $0x34b0bcb5 -DATA K256<>+0x190(SB)/4, $0x19a4c116 -DATA K256<>+0x194(SB)/4, $0x1e376c08 -DATA K256<>+0x198(SB)/4, $0x2748774c -DATA K256<>+0x19c(SB)/4, $0x34b0bcb5 - -DATA K256<>+0x1a0(SB)/4, $0x391c0cb3 -DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a -DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f -DATA K256<>+0x1ac(SB)/4, $0x682e6ff3 -DATA K256<>+0x1b0(SB)/4, $0x391c0cb3 -DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a -DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f -DATA K256<>+0x1bc(SB)/4, $0x682e6ff3 +DATA flip_mask<>+0(SB)/8, $0x0405060700010203 +DATA 
flip_mask<>+8(SB)/8, $0x0c0d0e0f08090a0b +DATA flip_mask<>+16(SB)/8, $0x0405060700010203 +DATA flip_mask<>+24(SB)/8, $0x0c0d0e0f08090a0b +GLOBL flip_mask<>(SB), RODATA, $32 -DATA K256<>+0x1c0(SB)/4, $0x748f82ee -DATA K256<>+0x1c4(SB)/4, $0x78a5636f -DATA K256<>+0x1c8(SB)/4, $0x84c87814 -DATA K256<>+0x1cc(SB)/4, $0x8cc70208 -DATA K256<>+0x1d0(SB)/4, $0x748f82ee -DATA K256<>+0x1d4(SB)/4, $0x78a5636f -DATA K256<>+0x1d8(SB)/4, $0x84c87814 -DATA K256<>+0x1dc(SB)/4, $0x8cc70208 +DATA K256<>+0(SB)/4, $0x428a2f98 +DATA K256<>+4(SB)/4, $0x71374491 +DATA K256<>+8(SB)/4, $0xb5c0fbcf +DATA K256<>+12(SB)/4, $0xe9b5dba5 +DATA K256<>+16(SB)/4, $0x428a2f98 +DATA K256<>+20(SB)/4, $0x71374491 +DATA K256<>+24(SB)/4, $0xb5c0fbcf +DATA K256<>+28(SB)/4, $0xe9b5dba5 +DATA K256<>+32(SB)/4, $0x3956c25b +DATA K256<>+36(SB)/4, $0x59f111f1 +DATA K256<>+40(SB)/4, $0x923f82a4 +DATA K256<>+44(SB)/4, $0xab1c5ed5 +DATA K256<>+48(SB)/4, $0x3956c25b +DATA K256<>+52(SB)/4, $0x59f111f1 +DATA K256<>+56(SB)/4, $0x923f82a4 +DATA K256<>+60(SB)/4, $0xab1c5ed5 +DATA K256<>+64(SB)/4, $0xd807aa98 +DATA K256<>+68(SB)/4, $0x12835b01 +DATA K256<>+72(SB)/4, $0x243185be +DATA K256<>+76(SB)/4, $0x550c7dc3 +DATA K256<>+80(SB)/4, $0xd807aa98 +DATA K256<>+84(SB)/4, $0x12835b01 +DATA K256<>+88(SB)/4, $0x243185be +DATA K256<>+92(SB)/4, $0x550c7dc3 +DATA K256<>+96(SB)/4, $0x72be5d74 +DATA K256<>+100(SB)/4, $0x80deb1fe +DATA K256<>+104(SB)/4, $0x9bdc06a7 +DATA K256<>+108(SB)/4, $0xc19bf174 +DATA K256<>+112(SB)/4, $0x72be5d74 +DATA K256<>+116(SB)/4, $0x80deb1fe +DATA K256<>+120(SB)/4, $0x9bdc06a7 +DATA K256<>+124(SB)/4, $0xc19bf174 +DATA K256<>+128(SB)/4, $0xe49b69c1 +DATA K256<>+132(SB)/4, $0xefbe4786 +DATA K256<>+136(SB)/4, $0x0fc19dc6 +DATA K256<>+140(SB)/4, $0x240ca1cc +DATA K256<>+144(SB)/4, $0xe49b69c1 +DATA K256<>+148(SB)/4, $0xefbe4786 +DATA K256<>+152(SB)/4, $0x0fc19dc6 +DATA K256<>+156(SB)/4, $0x240ca1cc +DATA K256<>+160(SB)/4, $0x2de92c6f +DATA K256<>+164(SB)/4, $0x4a7484aa +DATA K256<>+168(SB)/4, $0x5cb0a9dc +DATA K256<>+172(SB)/4, $0x76f988da +DATA K256<>+176(SB)/4, $0x2de92c6f +DATA K256<>+180(SB)/4, $0x4a7484aa +DATA K256<>+184(SB)/4, $0x5cb0a9dc +DATA K256<>+188(SB)/4, $0x76f988da +DATA K256<>+192(SB)/4, $0x983e5152 +DATA K256<>+196(SB)/4, $0xa831c66d +DATA K256<>+200(SB)/4, $0xb00327c8 +DATA K256<>+204(SB)/4, $0xbf597fc7 +DATA K256<>+208(SB)/4, $0x983e5152 +DATA K256<>+212(SB)/4, $0xa831c66d +DATA K256<>+216(SB)/4, $0xb00327c8 +DATA K256<>+220(SB)/4, $0xbf597fc7 +DATA K256<>+224(SB)/4, $0xc6e00bf3 +DATA K256<>+228(SB)/4, $0xd5a79147 +DATA K256<>+232(SB)/4, $0x06ca6351 +DATA K256<>+236(SB)/4, $0x14292967 +DATA K256<>+240(SB)/4, $0xc6e00bf3 +DATA K256<>+244(SB)/4, $0xd5a79147 +DATA K256<>+248(SB)/4, $0x06ca6351 +DATA K256<>+252(SB)/4, $0x14292967 +DATA K256<>+256(SB)/4, $0x27b70a85 +DATA K256<>+260(SB)/4, $0x2e1b2138 +DATA K256<>+264(SB)/4, $0x4d2c6dfc +DATA K256<>+268(SB)/4, $0x53380d13 +DATA K256<>+272(SB)/4, $0x27b70a85 +DATA K256<>+276(SB)/4, $0x2e1b2138 +DATA K256<>+280(SB)/4, $0x4d2c6dfc +DATA K256<>+284(SB)/4, $0x53380d13 +DATA K256<>+288(SB)/4, $0x650a7354 +DATA K256<>+292(SB)/4, $0x766a0abb +DATA K256<>+296(SB)/4, $0x81c2c92e +DATA K256<>+300(SB)/4, $0x92722c85 +DATA K256<>+304(SB)/4, $0x650a7354 +DATA K256<>+308(SB)/4, $0x766a0abb +DATA K256<>+312(SB)/4, $0x81c2c92e +DATA K256<>+316(SB)/4, $0x92722c85 +DATA K256<>+320(SB)/4, $0xa2bfe8a1 +DATA K256<>+324(SB)/4, $0xa81a664b +DATA K256<>+328(SB)/4, $0xc24b8b70 +DATA K256<>+332(SB)/4, $0xc76c51a3 +DATA K256<>+336(SB)/4, $0xa2bfe8a1 +DATA K256<>+340(SB)/4, $0xa81a664b +DATA 
K256<>+344(SB)/4, $0xc24b8b70 +DATA K256<>+348(SB)/4, $0xc76c51a3 +DATA K256<>+352(SB)/4, $0xd192e819 +DATA K256<>+356(SB)/4, $0xd6990624 +DATA K256<>+360(SB)/4, $0xf40e3585 +DATA K256<>+364(SB)/4, $0x106aa070 +DATA K256<>+368(SB)/4, $0xd192e819 +DATA K256<>+372(SB)/4, $0xd6990624 +DATA K256<>+376(SB)/4, $0xf40e3585 +DATA K256<>+380(SB)/4, $0x106aa070 +DATA K256<>+384(SB)/4, $0x19a4c116 +DATA K256<>+388(SB)/4, $0x1e376c08 +DATA K256<>+392(SB)/4, $0x2748774c +DATA K256<>+396(SB)/4, $0x34b0bcb5 +DATA K256<>+400(SB)/4, $0x19a4c116 +DATA K256<>+404(SB)/4, $0x1e376c08 +DATA K256<>+408(SB)/4, $0x2748774c +DATA K256<>+412(SB)/4, $0x34b0bcb5 +DATA K256<>+416(SB)/4, $0x391c0cb3 +DATA K256<>+420(SB)/4, $0x4ed8aa4a +DATA K256<>+424(SB)/4, $0x5b9cca4f +DATA K256<>+428(SB)/4, $0x682e6ff3 +DATA K256<>+432(SB)/4, $0x391c0cb3 +DATA K256<>+436(SB)/4, $0x4ed8aa4a +DATA K256<>+440(SB)/4, $0x5b9cca4f +DATA K256<>+444(SB)/4, $0x682e6ff3 +DATA K256<>+448(SB)/4, $0x748f82ee +DATA K256<>+452(SB)/4, $0x78a5636f +DATA K256<>+456(SB)/4, $0x84c87814 +DATA K256<>+460(SB)/4, $0x8cc70208 +DATA K256<>+464(SB)/4, $0x748f82ee +DATA K256<>+468(SB)/4, $0x78a5636f +DATA K256<>+472(SB)/4, $0x84c87814 +DATA K256<>+476(SB)/4, $0x8cc70208 +DATA K256<>+480(SB)/4, $0x90befffa +DATA K256<>+484(SB)/4, $0xa4506ceb +DATA K256<>+488(SB)/4, $0xbef9a3f7 +DATA K256<>+492(SB)/4, $0xc67178f2 +DATA K256<>+496(SB)/4, $0x90befffa +DATA K256<>+500(SB)/4, $0xa4506ceb +DATA K256<>+504(SB)/4, $0xbef9a3f7 +DATA K256<>+508(SB)/4, $0xc67178f2 +GLOBL K256<>(SB), RODATA|NOPTR, $512 -DATA K256<>+0x1e0(SB)/4, $0x90befffa -DATA K256<>+0x1e4(SB)/4, $0xa4506ceb -DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7 -DATA K256<>+0x1ec(SB)/4, $0xc67178f2 -DATA K256<>+0x1f0(SB)/4, $0x90befffa -DATA K256<>+0x1f4(SB)/4, $0xa4506ceb -DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7 -DATA K256<>+0x1fc(SB)/4, $0xc67178f2 +DATA shuff_00BA<>+0(SB)/8, $0x0b0a090803020100 +DATA shuff_00BA<>+8(SB)/8, $0xffffffffffffffff +DATA shuff_00BA<>+16(SB)/8, $0x0b0a090803020100 +DATA shuff_00BA<>+24(SB)/8, $0xffffffffffffffff +GLOBL shuff_00BA<>(SB), RODATA, $32 -GLOBL K256<>(SB), (NOPTR + RODATA), $512 +DATA shuff_DC00<>+0(SB)/8, $0xffffffffffffffff +DATA shuff_DC00<>+8(SB)/8, $0x0b0a090803020100 +DATA shuff_DC00<>+16(SB)/8, $0xffffffffffffffff +DATA shuff_DC00<>+24(SB)/8, $0x0b0a090803020100 +GLOBL shuff_DC00<>(SB), RODATA, $32 diff --git a/src/go/types/stdlib_test.go b/src/go/types/stdlib_test.go index a89cd858db..d0ed66a2e1 100644 --- a/src/go/types/stdlib_test.go +++ b/src/go/types/stdlib_test.go @@ -357,8 +357,9 @@ var excluded = map[string]bool{ "builtin": true, // See go.dev/issue/46027: some imports are missing for this submodule. - "crypto/internal/edwards25519/field/_asm": true, "crypto/internal/bigmod/_asm": true, + "crypto/internal/edwards25519/field/_asm": true, + "crypto/sha256/_asm": true, } // printPackageMu synchronizes the printing of type-checked package files in
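
Note on reading the generated output: the repeated ADDL/MOVL pairs in the digest write-back of the avx2 path above (ADDL AX, (SI) / MOVL (SI), AX, and so on for BX, CX, R8, DX, R9, R10, R11) are what the removed reference assembly expressed with its addm macro. In an Avo generator that pattern becomes an ordinary Go helper that emits the two instructions. The sketch below is illustrative only and is not the generator committed in this CL; the function name demoDigestAdd, the register choices, and the Go signature are assumptions made for the example.

    package main

    import (
            . "github.com/mmcloughlin/avo/build"
            . "github.com/mmcloughlin/avo/operand"
            . "github.com/mmcloughlin/avo/reg"
    )

    // addm mirrors the addm macro from the reference assembly: add the
    // working register into the saved digest word in memory, then reload
    // the sum back into the register. This is the ADDL/MOVL pairing seen
    // in the digest write-back of the avx2 loops above.
    func addm(m Mem, r Register) {
            ADDL(r, m) // *m += r
            MOVL(m, r) // r = *m
    }

    func main() {
            TEXT("demoDigestAdd", NOSPLIT, "func(dig *[8]uint32)")
            Doc("demoDigestAdd illustrates the digest add-back pattern.")
            ctx := Load(Param("dig"), RSI)

            // Working registers standing in for a..h. The order below is
            // only meant to echo the register assignment visible in the
            // avx2 path above (AX, BX, CX, R8, DX, R9, R10, R11); the
            // real generator keeps these live across the round helpers.
            state := []Register{EAX, EBX, ECX, R8L, EDX, R9L, R10L, R11L}
            for i, r := range state {
                    addm(Mem{Base: ctx, Disp: 4 * i}, r)
            }

            RET()
            Generate()
    }

Because the helper runs at generation time, each call simply appends the two instructions to the output, which is why the .s file in this diff contains the fully unrolled sequence rather than a macro invocation.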