--- /dev/null
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ . "github.com/mmcloughlin/avo/build"
+ . "github.com/mmcloughlin/avo/operand"
+ . "github.com/mmcloughlin/avo/reg"
+)
+
+//go:generate go run . -out ../sha256block_amd64.s -pkg sha256
+
+// SHA256 block routine. See sha256block.go for Go equivalent.
+//
+// The algorithm is detailed in FIPS 180-4:
+//
+// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
+
+// The AVX2 version is described in an Intel white paper:
+// "Fast SHA-256 Implementations on Intel Architecture Processors"
+// To find it, go to http://www.intel.com/p/en_US/embedded
+// and search for that title.
+// AVX2 version by Intel, same algorithm as code in Linux kernel:
+// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
+// by
+// James Guilford <james.guilford@intel.com>
+// Kirk Yap <kirk.s.yap@intel.com>
+// Tim Chen <tim.c.chen@linux.intel.com>
+
+// Wt = Mt; for 0 <= t <= 15
+// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
+//
+// a = H0
+// b = H1
+// c = H2
+// d = H3
+// e = H4
+// f = H5
+// g = H6
+// h = H7
+//
+// for t = 0 to 63 {
+// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
+// T2 = BIGSIGMA0(a) + Maj(a,b,c)
+// h = g
+// g = f
+// f = e
+// e = d + T1
+// d = c
+// c = b
+// b = a
+// a = T1 + T2
+// }
+//
+// H0 = a + H0
+// H1 = b + H1
+// H2 = c + H2
+// H3 = d + H3
+// H4 = e + H4
+// H5 = f + H5
+// H6 = g + H6
+// H7 = h + H7
+
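+// refRound is a minimal pure-Go sketch of one round of the loop above,
+// included for orientation only: the generator never calls it, the names are
+// illustrative, and the real Go reference implementation lives in
+// sha256block.go.
+func refRound(a, b, c, d, e, f, g, h, k, w uint32) (newA, newE uint32) {
+ rotr := func(x uint32, n uint) uint32 { return x>>n | x<<(32-n) }
+ t1 := h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)) + ((e & f) ^ (^e & g)) + k + w
+ t2 := (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)) + ((a & b) ^ (a & c) ^ (b & c))
+ return t1 + t2, d + t1
+}
+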
+func main() {
+ Package("crypto/sha256")
+ ConstraintExpr("!purego")
+ block()
+ Generate()
+}
+
+// Wt = Mt; for 0 <= t <= 15
+func msgSchedule0(index int) {
+ MOVL(Mem{Base: SI}.Offset(index*4), EAX)
+ BSWAPL(EAX)
+ MOVL(EAX, Mem{Base: BP}.Offset(index*4))
+}
+
+// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
+//
+// SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
+// SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
+func msgSchedule1(index int) {
+ MOVL(Mem{Base: BP}.Offset((index-2)*4), EAX)
+ MOVL(EAX, ECX)
+ RORL(Imm(17), EAX)
+ MOVL(ECX, EDX)
+ RORL(Imm(19), ECX)
+ SHRL(Imm(10), EDX)
+ MOVL(Mem{Base: BP}.Offset((index-15)*4), EBX)
+ XORL(ECX, EAX)
+ MOVL(EBX, ECX)
+ XORL(EDX, EAX)
+ RORL(Imm(7), EBX)
+ MOVL(ECX, EDX)
+ SHRL(Imm(3), EDX)
+ RORL(Imm(18), ECX)
+ ADDL(Mem{Base: BP}.Offset((index-7)*4), EAX)
+ XORL(ECX, EBX)
+ XORL(EDX, EBX)
+ ADDL(Mem{Base: BP}.Offset((index-16)*4), EBX)
+ ADDL(EBX, EAX)
+ MOVL(EAX, Mem{Base: BP}.Offset((index)*4))
+}
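+
+// refSchedule is a hypothetical pure-Go rendering of the schedule step above,
+// shown only to make the register choreography easier to follow; the
+// generator never calls it.
+func refSchedule(w []uint32, i int) uint32 {
+ rotr := func(x uint32, n uint) uint32 { return x>>n | x<<(32-n) }
+ s0 := rotr(w[i-15], 7) ^ rotr(w[i-15], 18) ^ (w[i-15] >> 3)
+ s1 := rotr(w[i-2], 17) ^ rotr(w[i-2], 19) ^ (w[i-2] >> 10)
+ return s1 + w[i-7] + s0 + w[i-16]
+}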
+
+// Calculate T1 in AX - uses AX, CX and DX registers.
+// h is also used as an accumulator. Wt is passed in AX.
+//
+// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
+// BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
+// Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
+func sha256T1(konst uint32, e, f, g, h GPPhysical) {
+ ADDL(EAX, h)
+ MOVL(e, EAX)
+ ADDL(U32(konst), h)
+ MOVL(e, ECX)
+ RORL(U8(6), EAX)
+ MOVL(e, EDX)
+ RORL(U8(11), ECX)
+ XORL(ECX, EAX)
+ MOVL(e, ECX)
+ RORL(U8(25), EDX)
+ ANDL(f, ECX)
+ XORL(EAX, EDX)
+ MOVL(e, EAX)
+ NOTL(EAX)
+ ADDL(EDX, h)
+ ANDL(g, EAX)
+ XORL(ECX, EAX)
+ ADDL(h, EAX)
+}
+
+// Calculate T2 in BX - uses BX, CX, DX and DI registers.
+//
+// T2 = BIGSIGMA0(a) + Maj(a, b, c)
+// BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
+// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
+func sha256T2(a, b, c GPPhysical) {
+ MOVL(a, EDI)
+ MOVL(c, EBX)
+ RORL(U8(2), EDI)
+ MOVL(a, EDX)
+ ANDL(b, EBX)
+ RORL(U8(13), EDX)
+ MOVL(a, ECX)
+ ANDL(c, ECX)
+ XORL(EDX, EDI)
+ XORL(ECX, EBX)
+ MOVL(a, EDX)
+ MOVL(b, ECX)
+ RORL(U8(22), EDX)
+ ANDL(a, ECX)
+ XORL(ECX, EBX)
+ XORL(EDX, EDI)
+ ADDL(EDI, EBX)
+}
+
+// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
+// The values for e and a are stored in d and h, ready for rotation.
+func sha256Round(index int, konst uint32, a, b, c, d, e, f, g, h GPPhysical) {
+ sha256T1(konst, e, f, g, h)
+ sha256T2(a, b, c)
+ MOVL(EBX, h)
+ ADDL(EAX, d)
+ ADDL(EAX, h)
+}
+
+func sha256Round0(index int, konst uint32, a, b, c, d, e, f, g, h GPPhysical) {
+ msgSchedule0(index)
+ sha256Round(index, konst, a, b, c, d, e, f, g, h)
+}
+
+func sha256Round1(index int, konst uint32, a, b, c, d, e, f, g, h GPPhysical) {
+ msgSchedule1(index)
+ sha256Round(index, konst, a, b, c, d, e, f, g, h)
+}
+
+// Definitions for AVX2 version
+
+// addm (mem), reg
+// - Add reg to mem using a reg-mem add and store, leaving the sum in both operands
+func addm(P1 Mem, P2 GPPhysical) {
+ ADDL(P2, P1)
+ MOVL(P1, P2)
+}
+
+var (
+ XDWORD0 VecPhysical = Y4
+ XDWORD1 = Y5
+ XDWORD2 = Y6
+ XDWORD3 = Y7
+
+ XWORD0 = X4
+ XWORD1 = X5
+ XWORD2 = X6
+ XWORD3 = X7
+
+ XTMP0 = Y0
+ XTMP1 = Y1
+ XTMP2 = Y2
+ XTMP3 = Y3
+ XTMP4 = Y8
+ XTMP5 = Y11
+
+ XFER = Y9
+
+ BYTE_FLIP_MASK = Y13 // mask to convert LE -> BE
+ X_BYTE_FLIP_MASK = X13
+
+ NUM_BYTES GPPhysical = RDX
+ INP = RDI
+
+ CTX = RSI // Beginning of digest in memory (a, b, c, ... , h)
+
+ a = EAX
+ b = EBX
+ c = ECX
+ d = R8L
+ e = EDX
+ f = R9L
+ g = R10L
+ h = R11L
+
+ old_h = R11L
+
+ TBL = RBP
+
+ SRND = RSI // SRND is same register as CTX
+
+ T1 = R12L
+
+ y0 = R13L
+ y1 = R14L
+ y2 = R15L
+ y3 = EDI
+
+ // Offsets
+ XFER_SIZE = 2 * 64 * 4
+ INP_END_SIZE = 8
+ INP_SIZE = 8
+
+ _XFER = 0
+ _INP_END = _XFER + XFER_SIZE
+ _INP = _INP_END + INP_END_SIZE
+ STACK_SIZE = _INP + INP_SIZE
+)
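+
+// Stack frame layout for the AVX2 path, as used below: the XFER_SIZE bytes at
+// _XFER hold two blocks' worth of pre-added K+W words, _INP_END holds a
+// pointer to the last input block, and _INP holds the saved input pointer
+// between passes.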
+
+func roundAndSchedN0(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
+ // ############################# RND N + 0 ############################//
+ MOVL(a, y3) // y3 = a
+ RORXL(Imm(25), e, y0) // y0 = e >> 25
+ RORXL(Imm(11), e, y1) // y1 = e >> 11
+
+ ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+ ORL(c, y3) // y3 = a|c
+ VPALIGNR(Imm(4), XDWORD2, XDWORD3, XTMP0) // XTMP0 = W[-7]
+ MOVL(f, y2) // y2 = f
+ RORXL(Imm(13), a, T1) // T1 = a >> 13
+
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11)
+ XORL(g, y2) // y2 = f^g
+ VPADDD(XDWORD0, XTMP0, XTMP0) // XTMP0 = W[-7] + W[-16]
+ RORXL(Imm(6), e, y1) // y1 = (e >> 6)
+
+ ANDL(e, y2) // y2 = (f^g)&e
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+ RORXL(Imm(22), a, y1) // y1 = a >> 22
+ ADDL(h, d) // d = k + w + h + d
+
+ ANDL(b, y3) // y3 = (a|c)&b
+ VPALIGNR(Imm(4), XDWORD0, XDWORD1, XTMP1) // XTMP1 = W[-15]
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13)
+ RORXL(Imm(2), a, T1) // T1 = (a >> 2)
+
+ XORL(g, y2) // y2 = CH = ((f^g)&e)^g
+ VPSRLD(Imm(7), XTMP1, XTMP2) //
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+ MOVL(a, T1) // T1 = a
+ ANDL(c, T1) // T1 = a&c
+
+ ADDL(y0, y2) // y2 = S1 + CH
+ VPSLLD(Imm(32-7), XTMP1, XTMP3) //
+ ORL(T1, y3) // y3 = MAJ = ((a|c)&b)|(a&c)
+ ADDL(y1, h) // h = k + w + h + S0
+
+ ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1
+ VPOR(XTMP2, XTMP3, XTMP3) // XTMP3 = W[-15] ror 7
+
+ VPSRLD(Imm(18), XTMP1, XTMP2)
+ ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0
+ ADDL(y3, h) // h = t1 + S0 + MAJ
+}
+
+func roundAndSchedN1(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
+ // ################################### RND N + 1 ############################
+ MOVL(a, y3) // y3 = a
+ RORXL(Imm(25), e, y0) // y0 = e >> 25
+ RORXL(Imm(11), e, y1) // y1 = e >> 11
+ ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+ ORL(c, y3) // y3 = a|c
+
+ VPSRLD(Imm(3), XTMP1, XTMP4) // XTMP4 = W[-15] >> 3
+ MOVL(f, y2) // y2 = f
+ RORXL(Imm(13), a, T1) // T1 = a >> 13
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11)
+ XORL(g, y2) // y2 = f^g
+
+ RORXL(Imm(6), e, y1) // y1 = (e >> 6)
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+ RORXL(Imm(22), a, y1) // y1 = a >> 22
+ ANDL(e, y2) // y2 = (f^g)&e
+ ADDL(h, d) // d = k + w + h + d
+
+ VPSLLD(Imm(32-18), XTMP1, XTMP1)
+ ANDL(b, y3) // y3 = (a|c)&b
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13)
+
+ VPXOR(XTMP1, XTMP3, XTMP3)
+ RORXL(Imm(2), a, T1) // T1 = (a >> 2)
+ XORL(g, y2) // y2 = CH = ((f^g)&e)^g
+
+ VPXOR(XTMP2, XTMP3, XTMP3) // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+ MOVL(a, T1) // T1 = a
+ ANDL(c, T1) // T1 = a&c
+ ADDL(y0, y2) // y2 = S1 + CH
+
+ VPXOR(XTMP4, XTMP3, XTMP1) // XTMP1 = s0
+ VPSHUFD(Imm(0xFA), XDWORD3, XTMP2) // XTMP2 = W[-2] {BBAA}
+ ORL(T1, y3) // y3 = MAJ = ((a|c)&b)|(a&c)
+ ADDL(y1, h) // h = k + w + h + S0
+
+ VPADDD(XTMP1, XTMP0, XTMP0) // XTMP0 = W[-16] + W[-7] + s0
+ ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1
+ ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0
+ ADDL(y3, h) // h = t1 + S0 + MAJ
+
+ VPSRLD(Imm(10), XTMP2, XTMP4) // XTMP4 = W[-2] >> 10 {BBAA}
+}
+
+func roundAndSchedN2(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
+ // ################################### RND N + 2 ############################
+ var shuff_00BA Mem = shuff_00BA_DATA()
+
+ MOVL(a, y3) // y3 = a
+ RORXL(Imm(25), e, y0) // y0 = e >> 25
+ ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+
+ VPSRLQ(Imm(19), XTMP2, XTMP3) // XTMP3 = W[-2] ror 19 {xBxA}
+ RORXL(Imm(11), e, y1) // y1 = e >> 11
+ ORL(c, y3) // y3 = a|c
+ MOVL(f, y2) // y2 = f
+ XORL(g, y2) // y2 = f^g
+
+ RORXL(Imm(13), a, T1) // T1 = a >> 13
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11)
+ VPSRLQ(Imm(17), XTMP2, XTMP2) // XTMP2 = W[-2] ror 17 {xBxA}
+ ANDL(e, y2) // y2 = (f^g)&e
+
+ RORXL(Imm(6), e, y1) // y1 = (e >> 6)
+ VPXOR(XTMP3, XTMP2, XTMP2)
+ ADDL(h, d) // d = k + w + h + d
+ ANDL(b, y3) // y3 = (a|c)&b
+
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+ RORXL(Imm(22), a, y1) // y1 = a >> 22
+ VPXOR(XTMP2, XTMP4, XTMP4) // XTMP4 = s1 {xBxA}
+ XORL(g, y2) // y2 = CH = ((f^g)&e)^g
+
+ VPSHUFB(shuff_00BA, XTMP4, XTMP4) // XTMP4 = s1 {00BA}
+
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13)
+ RORXL(Imm(2), a, T1) // T1 = (a >> 2)
+ VPADDD(XTMP4, XTMP0, XTMP0) // XTMP0 = {..., ..., W[1], W[0]}
+
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+ MOVL(a, T1) // T1 = a
+ ANDL(c, T1) // T1 = a&c
+ ADDL(y0, y2) // y2 = S1 + CH
+ VPSHUFD(Imm(80), XTMP0, XTMP2) // XTMP2 = W[-2] {DDCC}
+
+ ORL(T1, y3) // y3 = MAJ = ((a|c)&b)|(a&c)
+ ADDL(y1, h) // h = k + w + h + S0
+ ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1
+ ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0
+
+ ADDL(y3, h) // h = t1 + S0 + MAJ
+}
+
+func roundAndSchedN3(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
+ // ################################### RND N + 3 ############################
+ var shuff_DC00 Mem = shuff_DC00_DATA()
+
+ MOVL(a, y3) // y3 = a
+ RORXL(Imm(25), e, y0) // y0 = e >> 25
+ RORXL(Imm(11), e, y1) // y1 = e >> 11
+ ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+ ORL(c, y3) // y3 = a|c
+
+ VPSRLD(Imm(10), XTMP2, XTMP5) // XTMP5 = W[-2] >> 10 {DDCC}
+ MOVL(f, y2) // y2 = f
+ RORXL(Imm(13), a, T1) // T1 = a >> 13
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11)
+ XORL(g, y2) // y2 = f^g
+
+ VPSRLQ(Imm(19), XTMP2, XTMP3) // XTMP3 = W[-2] ror 19 {xDxC}
+ RORXL(Imm(6), e, y1) // y1 = (e >> 6)
+ ANDL(e, y2) // y2 = (f^g)&e
+ ADDL(h, d) // d = k + w + h + d
+ ANDL(b, y3) // y3 = (a|c)&b
+
+ VPSRLQ(Imm(17), XTMP2, XTMP2) // XTMP2 = W[-2] ror 17 {xDxC}
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+ XORL(g, y2) // y2 = CH = ((f^g)&e)^g
+
+ VPXOR(XTMP3, XTMP2, XTMP2)
+ RORXL(Imm(22), a, y1) // y1 = a >> 22
+ ADDL(y0, y2) // y2 = S1 + CH
+
+ VPXOR(XTMP2, XTMP5, XTMP5) // XTMP5 = s1 {xDxC}
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13)
+ ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1
+
+ RORXL(Imm(2), a, T1) // T1 = (a >> 2)
+
+ VPSHUFB(shuff_DC00, XTMP5, XTMP5) // XTMP5 = s1 {DC00}
+
+ VPADDD(XTMP0, XTMP5, XDWORD0) // XDWORD0 = {W[3], W[2], W[1], W[0]}
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+ MOVL(a, T1) // T1 = a
+ ANDL(c, T1) // T1 = a&c
+ ORL(T1, y3) // y3 = MAJ = ((a|c)&b)|(a&c)
+
+ ADDL(y1, h) // h = k + w + h + S0
+ ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0
+ ADDL(y3, h) // h = t1 + S0 + MAJ
+}
+
+func doRoundN0(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
+ // ################################### RND N + 0 ###########################
+ MOVL(f, y2) // y2 = f
+ RORXL(Imm(25), e, y0) // y0 = e >> 25
+ RORXL(Imm(11), e, y1) // y1 = e >> 11
+ XORL(g, y2) // y2 = f^g
+
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11)
+ RORXL(Imm(6), e, y1) // y1 = (e >> 6)
+ ANDL(e, y2) // y2 = (f^g)&e
+
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+ RORXL(Imm(13), a, T1) // T1 = a >> 13
+ XORL(g, y2) // y2 = CH = ((f^g)&e)^g
+ RORXL(Imm(22), a, y1) // y1 = a >> 22
+ MOVL(a, y3) // y3 = a
+
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13)
+ RORXL(Imm(2), a, T1) // T1 = (a >> 2)
+ ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+ ORL(c, y3) // y3 = a|c
+
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+ MOVL(a, T1) // T1 = a
+ ANDL(b, y3) // y3 = (a|c)&b
+ ANDL(c, T1) // T1 = a&c
+ ADDL(y0, y2) // y2 = S1 + CH
+
+ ADDL(h, d) // d = k + w + h + d
+ ORL(T1, y3) // y3 = MAJ = ((a|c)&b)|(a&c)
+ ADDL(y1, h) // h = k + w + h + S0
+ ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1
+}
+
+func doRoundN1(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
+ // ################################### RND N + 1 ###########################
+ ADDL(y2, old_h) // h = k + w + h + S0 + S1 + CH = t1 + S0
+ MOVL(f, y2) // y2 = f
+ RORXL(Imm(25), e, y0) // y0 = e >> 25
+ RORXL(Imm(11), e, y1) // y1 = e >> 11
+ XORL(g, y2) // y2 = f^g
+
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11)
+ RORXL(Imm(6), e, y1) // y1 = (e >> 6)
+ ANDL(e, y2) // y2 = (f^g)&e
+ ADDL(y3, old_h) // h = t1 + S0 + MAJ
+
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+ RORXL(Imm(13), a, T1) // T1 = a >> 13
+ XORL(g, y2) // y2 = CH = ((f^g)&e)^g
+ RORXL(Imm(22), a, y1) // y1 = a >> 22
+ MOVL(a, y3) // y3 = a
+
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13)
+ RORXL(Imm(2), a, T1) // T1 = (a >> 2)
+ ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+ ORL(c, y3) // y3 = a|c
+
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+ MOVL(a, T1) // T1 = a
+ ANDL(b, y3) // y3 = (a|c)&b
+ ANDL(c, T1) // T1 = a&c
+ ADDL(y0, y2) // y2 = S1 + CH
+
+ ADDL(h, d) // d = k + w + h + d
+ ORL(T1, y3) // y3 = MAJ = ((a|c)&b)|(a&c)
+ ADDL(y1, h) // h = k + w + h + S0
+
+ ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1
+}
+
+func doRoundN2(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
+ // ################################### RND N + 2 ##############################
+ ADDL(y2, old_h) // h = k + w + h + S0 + S1 + CH = t1 + S0
+ MOVL(f, y2) // y2 = f
+ RORXL(Imm(25), e, y0) // y0 = e >> 25
+ RORXL(Imm(11), e, y1) // y1 = e >> 11
+ XORL(g, y2) // y2 = f^g
+
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11)
+ RORXL(Imm(6), e, y1) // y1 = (e >> 6)
+ ANDL(e, y2) // y2 = (f^g)&e
+ ADDL(y3, old_h) // h = t1 + S0 + MAJ
+
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+ RORXL(Imm(13), a, T1) // T1 = a >> 13
+ XORL(g, y2) // y2 = CH = ((f^g)&e)^g
+ RORXL(Imm(22), a, y1) // y1 = a >> 22
+ MOVL(a, y3) // y3 = a
+
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13)
+ RORXL(Imm(2), a, T1) // T1 = (a >> 2)
+ ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+ ORL(c, y3) // y3 = a|c
+
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+ MOVL(a, T1) // T1 = a
+ ANDL(b, y3) // y3 = (a|c)&b
+ ANDL(c, T1) // T1 = a&c
+ ADDL(y0, y2) // y2 = S1 + CH
+
+ ADDL(h, d) // d = k + w + h + d
+ ORL(T1, y3) // y3 = MAJ = ((a|c)&b)|(a&c)
+ ADDL(y1, h) // h = k + w + h + S0
+
+ ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1
+}
+
+func doRoundN3(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
+ // ################################### RND N + 3 ###########################
+ ADDL(y2, old_h) // h = k + w + h + S0 + S1 + CH = t1 + S0
+ MOVL(f, y2) // y2 = f
+ RORXL(Imm(25), e, y0) // y0 = e >> 25
+ RORXL(Imm(11), e, y1) // y1 = e >> 11
+ XORL(g, y2) // y2 = f^g
+
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11)
+ RORXL(Imm(6), e, y1) // y1 = (e >> 6)
+ ANDL(e, y2) // y2 = (f^g)&e
+ ADDL(y3, old_h) // h = t1 + S0 + MAJ
+
+ XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+ RORXL(Imm(13), a, T1) // T1 = a >> 13
+ XORL(g, y2) // y2 = CH = ((f^g)&e)^g
+ RORXL(Imm(22), a, y1) // y1 = a >> 22
+ MOVL(a, y3) // y3 = a
+
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13)
+ RORXL(Imm(2), a, T1) // T1 = (a >> 2)
+ ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+ ORL(c, y3) // y3 = a|c
+
+ XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+ MOVL(a, T1) // T1 = a
+ ANDL(b, y3) // y3 = (a|c)&b
+ ANDL(c, T1) // T1 = a&c
+ ADDL(y0, y2) // y2 = S1 + CH
+
+ ADDL(h, d) // d = k + w + h + d
+ ORL(T1, y3) // y3 = MAJ = ((a|c)&b)|(a&c)
+ ADDL(y1, h) // h = k + w + h + S0
+
+ ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1
+
+ ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0
+
+ ADDL(y3, h) // h = t1 + S0 + MAJ
+}
+
+// Definitions for sha-ni version
+//
+// The sha-ni implementation uses the Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1 and SHA256MSG2.
+// It also reuses portions of the flip_mask (half) and the K256 table (stride 32) from the AVX2 version.
+//
+// Reference
+// S. Gulley, et al, "New Instructions Supporting the Secure Hash
+// Algorithm on Intel® Architecture Processors", July 2013
+// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
+//
+
+var (
+ digestPtr GPPhysical = RDI // input/output, base pointer to digest hash vector H0, H1, ..., H7
+ dataPtr = RSI // input, base pointer to first input data block
+ numBytes = RDX // input, number of input bytes to be processed
+ sha256Constants = RAX // round constants from the K256 table, indexed by round number x 32
+ msg VecPhysical = X0 // input data
+ state0 = X1 // round intermediates and outputs
+ state1 = X2
+ m0 = X3 // m0, m1,... m4 -- round message temps
+ m1 = X4
+ m2 = X5
+ m3 = X6
+ m4 = X7
+ shufMask = X8 // input data endian conversion control mask
+ abefSave = X9 // digest hash vector inter-block buffer abef
+ cdghSave = X10 // digest hash vector inter-block buffer cdgh
+)
+
+// nop instead of final SHA256MSG1 for first and last few rounds
+func nop(m, a VecPhysical) {
+}
+
+// final SHA256MSG1 for middle rounds that require it
+func sha256msg1(m, a VecPhysical) {
+ SHA256MSG1(m, a)
+}
+
+// msg copy for all but rounds 12-15
+func vmov(a, b VecPhysical) {
+ VMOVDQA(a, b)
+}
+
+// reverse copy for rounds 12-15
+func vmovrev(a, b VecPhysical) {
+ VMOVDQA(b, a)
+}
+
+type VecFunc func(a, b VecPhysical)
+
+// sha rounds 0 to 11
+//
+// The rounds are identical except for the final msg op, which is replaced
+// with a nop for the rounds that do not need it; refer to Gulley, et al.
+// for more information.
+func rounds0to11(m, a VecPhysical, c int, sha256msg1 VecFunc) {
+ VMOVDQU(Mem{Base: dataPtr}.Offset(c*16), msg)
+ PSHUFB(shufMask, msg)
+ VMOVDQA(msg, m)
+ PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg)
+ SHA256RNDS2(msg, state0, state1)
+ PSHUFD(U8(0x0e), msg, msg)
+ SHA256RNDS2(msg, state1, state0)
+ sha256msg1(m, a)
+}
+
+// sha rounds 12 to 59
+//
+// The rounds are identical except for the final msg op and the reverse
+// copy(m, msg) in round 12, which is required after the last data load;
+// refer to Gulley, et al. for more information.
+func rounds12to59(m VecPhysical, c int, a, t VecPhysical, sha256msg1, movop VecFunc) {
+ movop(m, msg)
+ PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg)
+ SHA256RNDS2(msg, state0, state1)
+ VMOVDQA(m, m4)
+ PALIGNR(Imm(4), a, m4)
+ PADDD(m4, t)
+ SHA256MSG2(m, t)
+ PSHUFD(Imm(0x0e), msg, msg)
+ SHA256RNDS2(msg, state1, state0)
+ sha256msg1(m, a)
+}
+
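+// block emits the body of ·block, which dispatches at run time: the SHA-NI
+// path when the SHA extensions are available, the AVX2 path (which also
+// relies on BMI2 RORX) when it is available, and the generic AMD64 path
+// otherwise.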
+func block() {
+ Implement("block")
+ AllocLocal(536)
+
+ checkArchFlags()
+ sha256()
+ avx2()
+ sha_ni()
+}
+
+func checkArchFlags() {
+ CMPB(Mem{Symbol: Symbol{Name: "·useSHA"}, Base: StaticBase}, Imm(1))
+ JE(LabelRef("sha_ni"))
+ CMPB(Mem{Symbol: Symbol{Name: "·useAVX2"}, Base: StaticBase}, Imm(1))
+ JE(LabelRef("avx2"))
+}
+
+func sha256() {
+ Load(Param("p").Base(), RSI)
+ Load(Param("p").Len(), RDX)
+ SHRQ(Imm(6), RDX)
+ SHLQ(Imm(6), RDX)
+
+ // Return if p is empty
+ LEAQ(Mem{Base: RSI, Index: RDX, Scale: 1}, RDI)
+ MOVQ(RDI, Mem{Base: SP}.Offset(256))
+ CMPQ(RSI, RDI)
+ JEQ(LabelRef("end"))
+
+ BP := Mem{Base: BP}
+ Load(Param("dig"), RBP)
+ MOVL(BP.Offset(0*4), R8L) // a = H0
+ MOVL(BP.Offset(1*4), R9L) // b = H1
+ MOVL(BP.Offset(2*4), R10L) // c = H2
+ MOVL(BP.Offset(3*4), R11L) // d = H3
+ MOVL(BP.Offset(4*4), R12L) // e = H4
+ MOVL(BP.Offset(5*4), R13L) // f = H5
+ MOVL(BP.Offset(6*4), R14L) // g = H6
+ MOVL(BP.Offset(7*4), R15L) // h = H7
+
+ loop()
+ end()
+}
+
+// rotateRight returns a copy of the register list rotated right by one
+// position, so the register that held h in one round holds a in the next,
+// the old a register holds b, and so on. This renames the working variables
+// between rounds instead of moving their values.
+func rotateRight(slice *[]GPPhysical) []GPPhysical {
+ n := len(*slice)
+ rotated := make([]GPPhysical, n)
+ for i, reg := range *slice {
+ rotated[(i+1)%n] = reg
+ }
+ return rotated
+}
+
+func loop() {
+ Label("loop")
+ MOVQ(RSP, RBP)
+
+ regs := []GPPhysical{R8L, R9L, R10L, R11L, R12L, R13L, R14L, R15L}
+ n := len(_K)
+
+ for i := 0; i < 16; i++ {
+ sha256Round0(i, _K[i], regs[0], regs[1], regs[2], regs[3], regs[4], regs[5], regs[6], regs[7])
+ regs = rotateRight(®s)
+ }
+
+ for i := 16; i < n; i++ {
+ sha256Round1(i, _K[i], regs[0], regs[1], regs[2], regs[3], regs[4], regs[5], regs[6], regs[7])
+ regs = rotateRight(®s)
+ }
+
+ Load(Param("dig"), RBP)
+ BP := Mem{Base: BP}
+ ADDL(BP.Offset(0*4), R8L) // H0 = a + H0
+ MOVL(R8L, BP.Offset(0*4))
+ ADDL(BP.Offset(1*4), R9L) // H1 = b + H1
+ MOVL(R9L, BP.Offset(1*4))
+ ADDL(BP.Offset(2*4), R10L) // H2 = c + H2
+ MOVL(R10L, BP.Offset(2*4))
+ ADDL(BP.Offset(3*4), R11L) // H3 = d + H3
+ MOVL(R11L, BP.Offset(3*4))
+ ADDL(BP.Offset(4*4), R12L) // H4 = e + H4
+ MOVL(R12L, BP.Offset(4*4))
+ ADDL(BP.Offset(5*4), R13L) // H5 = f + H5
+ MOVL(R13L, BP.Offset(5*4))
+ ADDL(BP.Offset(6*4), R14L) // H6 = g + H6
+ MOVL(R14L, BP.Offset(6*4))
+ ADDL(BP.Offset(7*4), R15L) // H7 = h + H7
+ MOVL(R15L, BP.Offset(7*4))
+
+ ADDQ(Imm(64), RSI)
+ CMPQ(RSI, Mem{Base: SP}.Offset(256))
+ JB(LabelRef("loop"))
+}
+
+func end() {
+ Label("end")
+ RET()
+}
+
+func avx2() {
+ Label("avx2")
+ Load(Param("dig"), CTX) // d.h[8]
+ Load(Param("p").Base(), INP)
+ Load(Param("p").Len(), NUM_BYTES)
+
+ LEAQ(Mem{Base: INP, Index: NUM_BYTES, Scale: 1, Disp: -64}, NUM_BYTES) // Pointer to the last block
+ MOVQ(NUM_BYTES, Mem{Base: SP}.Offset(_INP_END))
+
+ CMPQ(NUM_BYTES, INP)
+ JE(LabelRef("avx2_only_one_block"))
+
+ Comment("Load initial digest")
+ CTX := Mem{Base: CTX}
+ MOVL(CTX.Offset(0), a) // a = H0
+ MOVL(CTX.Offset(4), b) // b = H1
+ MOVL(CTX.Offset(8), c) // c = H2
+ MOVL(CTX.Offset(12), d) // d = H3
+ MOVL(CTX.Offset(16), e) // e = H4
+ MOVL(CTX.Offset(20), f) // f = H5
+ MOVL(CTX.Offset(24), g) // g = H6
+ MOVL(CTX.Offset(28), h) // h = H7
+
+ avx2_loop0()
+ avx2_last_block_enter()
+ avx2_loop1()
+ avx2_loop2()
+ avx2_loop3()
+ avx2_do_last_block()
+ avx2_only_one_block()
+ done_hash()
+}
+
+func avx2_loop0() {
+ Label("avx2_loop0")
+ Comment("at each iteration works with one block (512 bit)")
+ VMOVDQU(Mem{Base: INP}.Offset(0*32), XTMP0)
+ VMOVDQU(Mem{Base: INP}.Offset(1*32), XTMP1)
+ VMOVDQU(Mem{Base: INP}.Offset(2*32), XTMP2)
+ VMOVDQU(Mem{Base: INP}.Offset(3*32), XTMP3)
+
+ flip_mask := flip_mask_DATA()
+
+ VMOVDQU(flip_mask, BYTE_FLIP_MASK)
+
+ Comment("Apply Byte Flip Mask: LE -> BE")
+ VPSHUFB(BYTE_FLIP_MASK, XTMP0, XTMP0)
+ VPSHUFB(BYTE_FLIP_MASK, XTMP1, XTMP1)
+ VPSHUFB(BYTE_FLIP_MASK, XTMP2, XTMP2)
+ VPSHUFB(BYTE_FLIP_MASK, XTMP3, XTMP3)
+
+ Comment("Transpose data into high/low parts")
+ VPERM2I128(Imm(0x20), XTMP2, XTMP0, XDWORD0) // w3, w2, w1, w0
+ VPERM2I128(Imm(0x31), XTMP2, XTMP0, XDWORD1) // w7, w6, w5, w4
+ VPERM2I128(Imm(0x20), XTMP3, XTMP1, XDWORD2) // w11, w10, w9, w8
+ VPERM2I128(Imm(0x31), XTMP3, XTMP1, XDWORD3) // w15, w14, w13, w12
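+ // After the transpose each YMM register holds four message words of
+ // block 1 in its low 128-bit lane and the corresponding four words of
+ // block 2 in its high lane, which is what lets avx2_loop3 reuse the
+ // scheduled values for the second block.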
+
+ K256 := K256_DATA()
+ LEAQ(K256, TBL) // Loading address of table with round-specific constants
+}
+
+func avx2_last_block_enter() {
+ Label("avx2_last_block_enter")
+ ADDQ(Imm(64), INP)
+ MOVQ(INP, Mem{Base: SP}.Offset(_INP))
+ XORQ(SRND, SRND)
+}
+
+// for w0 - w47
+func avx2_loop1() {
+ Label("avx2_loop1")
+
+ Comment("Do 4 rounds and scheduling")
+ VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((0 * 32)), XDWORD0, XFER)
+ VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
+ roundAndSchedN0(_XFER+0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+ roundAndSchedN1(_XFER+0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+ roundAndSchedN2(_XFER+0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+ roundAndSchedN3(_XFER+0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+
+ Comment("Do 4 rounds and scheduling")
+ VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
+ VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
+ roundAndSchedN0(_XFER+1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+ roundAndSchedN1(_XFER+1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+ roundAndSchedN2(_XFER+1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+ roundAndSchedN3(_XFER+1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+
+ Comment("Do 4 rounds and scheduling")
+ VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((2 * 32)), XDWORD2, XFER)
+ VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+2*32))
+ roundAndSchedN0(_XFER+2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+ roundAndSchedN1(_XFER+2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+ roundAndSchedN2(_XFER+2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+ roundAndSchedN3(_XFER+2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+
+ Comment("Do 4 rounds and scheduling")
+ VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((3 * 32)), XDWORD3, XFER)
+ VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+3*32))
+ roundAndSchedN0(_XFER+3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
+ roundAndSchedN1(_XFER+3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
+ roundAndSchedN2(_XFER+3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
+ roundAndSchedN3(_XFER+3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
+
+ ADDQ(Imm(4*32), SRND)
+ CMPQ(SRND, U32(3*4*32))
+ JB(LabelRef("avx2_loop1"))
+}
+
+// w48 - w63 processed with no scheduling (last 16 rounds)
+func avx2_loop2() {
+ Label("avx2_loop2")
+ VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(0*32), XDWORD0, XFER)
+ VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
+ doRoundN0(_XFER+0*32, a, b, c, d, e, f, g, h, h)
+ doRoundN1(_XFER+0*32, h, a, b, c, d, e, f, g, h)
+ doRoundN2(_XFER+0*32, g, h, a, b, c, d, e, f, g)
+ doRoundN3(_XFER+0*32, f, g, h, a, b, c, d, e, f)
+
+ VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
+ VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
+ doRoundN0(_XFER+1*32, e, f, g, h, a, b, c, d, e)
+ doRoundN1(_XFER+1*32, d, e, f, g, h, a, b, c, d)
+ doRoundN2(_XFER+1*32, c, d, e, f, g, h, a, b, c)
+ doRoundN3(_XFER+1*32, b, c, d, e, f, g, h, a, b)
+
+ ADDQ(Imm(2*32), SRND)
+
+ VMOVDQU(XDWORD2, XDWORD0)
+ VMOVDQU(XDWORD3, XDWORD1)
+
+ CMPQ(SRND, U32(4*4*32))
+ JB(LabelRef("avx2_loop2"))
+
+ Load(Param("dig"), CTX) // d.h[8]
+ MOVQ(Mem{Base: SP}.Offset(_INP), INP)
+
+ registers := []GPPhysical{a, b, c, d, e, f, g, h}
+ for i, reg := range registers {
+ addm(Mem{Base: CTX}.Offset(i*4), reg)
+ }
+
+ CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
+ JB(LabelRef("done_hash"))
+
+ XORQ(SRND, SRND)
+}
+
+// Do the second block using the previously scheduled results: the first pass
+// stored K+W for both blocks (block 2 occupies the high lane of each YMM
+// store, at offset +16 within each 32-byte _XFER slot), so this pass only
+// has to run the round function.
+func avx2_loop3() {
+ Label("avx2_loop3")
+ doRoundN0(_XFER+0*32+16, a, b, c, d, e, f, g, h, a)
+ doRoundN1(_XFER+0*32+16, h, a, b, c, d, e, f, g, h)
+ doRoundN2(_XFER+0*32+16, g, h, a, b, c, d, e, f, g)
+ doRoundN3(_XFER+0*32+16, f, g, h, a, b, c, d, e, f)
+
+ doRoundN0(_XFER+1*32+16, e, f, g, h, a, b, c, d, e)
+ doRoundN1(_XFER+1*32+16, d, e, f, g, h, a, b, c, d)
+ doRoundN2(_XFER+1*32+16, c, d, e, f, g, h, a, b, c)
+ doRoundN3(_XFER+1*32+16, b, c, d, e, f, g, h, a, b)
+
+ ADDQ(Imm(2*32), SRND)
+ CMPQ(SRND, U32(4*4*32))
+ JB(LabelRef("avx2_loop3"))
+
+ Load(Param("dig"), CTX) // d.h[8]
+ MOVQ(Mem{Base: SP}.Offset(_INP), INP)
+ ADDQ(Imm(64), INP)
+
+ registers := []GPPhysical{a, b, c, d, e, f, g, h}
+ for i, reg := range registers {
+ addm(Mem{Base: CTX}.Offset(i*4), reg)
+ }
+
+ CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
+ JA(LabelRef("avx2_loop0"))
+ JB(LabelRef("done_hash"))
+}
+
+func avx2_do_last_block() {
+ Label("avx2_do_last_block")
+ VMOVDQU(Mem{Base: INP}.Offset(0), XWORD0)
+ VMOVDQU(Mem{Base: INP}.Offset(16), XWORD1)
+ VMOVDQU(Mem{Base: INP}.Offset(32), XWORD2)
+ VMOVDQU(Mem{Base: INP}.Offset(48), XWORD3)
+
+ flip_mask := flip_mask_DATA()
+ VMOVDQU(flip_mask, BYTE_FLIP_MASK)
+
+ VPSHUFB(X_BYTE_FLIP_MASK, XWORD0, XWORD0)
+ VPSHUFB(X_BYTE_FLIP_MASK, XWORD1, XWORD1)
+ VPSHUFB(X_BYTE_FLIP_MASK, XWORD2, XWORD2)
+ VPSHUFB(X_BYTE_FLIP_MASK, XWORD3, XWORD3)
+
+ K256 := K256_DATA()
+ LEAQ(K256, TBL)
+
+ JMP(LabelRef("avx2_last_block_enter"))
+}
+
+// Load the initial digest, then process the single block.
+func avx2_only_one_block() {
+ Label("avx2_only_one_block")
+ registers := []GPPhysical{a, b, c, d, e, f, g, h}
+ for i, reg := range registers {
+ MOVL(Mem{Base: CTX}.Offset(i*4), reg)
+ }
+ JMP(LabelRef("avx2_do_last_block"))
+}
+
+func done_hash() {
+ Label("done_hash")
+ VZEROUPPER()
+ RET()
+}
+
+func sha_ni() {
+ Label("sha_ni")
+ Load(Param("dig"), digestPtr) // init digest hash vector H0, H1,..., H7 pointer
+ Load(Param("p").Base(), dataPtr) // init input data base pointer
+ Load(Param("p").Len(), numBytes) // get number of input bytes to hash
+ SHRQ(Imm(6), numBytes) // force modulo 64 input buffer length
+ SHLQ(Imm(6), numBytes)
+ CMPQ(numBytes, Imm(0)) // exit early for zero-length input buffer
+ JEQ(LabelRef("done"))
+ ADDQ(dataPtr, numBytes) // point numBytes to end of input buffer
+ VMOVDQU(Mem{Base: digestPtr}.Offset(0*16), state0) // load initial hash values and reorder
+ VMOVDQU(Mem{Base: digestPtr}.Offset(1*16), state1) // DCBA, HGFE -> ABEF, CDGH
+ PSHUFD(Imm(0xb1), state0, state0) // CDAB
+ PSHUFD(Imm(0x1b), state1, state1) // EFGH
+ VMOVDQA(state0, m4)
+ PALIGNR(Imm(8), state1, state0) // ABEF
+ PBLENDW(Imm(0xf0), m4, state1) // CDGH
+ flip_mask := flip_mask_DATA()
+ VMOVDQA(flip_mask, shufMask)
+ LEAQ(K256_DATA(), sha256Constants)
+
+ roundLoop()
+ done()
+}
+
+func roundLoop() {
+ Label("roundLoop")
+ Comment("save hash values for addition after rounds")
+ VMOVDQA(state0, abefSave)
+ VMOVDQA(state1, cdghSave)
+
+ Comment("do rounds 0-59")
+ rounds0to11(m0, nil, 0, nop) // 0-3
+ rounds0to11(m1, m0, 1, sha256msg1) // 4-7
+ rounds0to11(m2, m1, 2, sha256msg1) // 8-11
+ VMOVDQU(Mem{Base: dataPtr}.Offset(3*16), msg)
+ PSHUFB(shufMask, msg)
+ rounds12to59(m3, 3, m2, m0, sha256msg1, vmovrev) // 12-15
+ rounds12to59(m0, 4, m3, m1, sha256msg1, vmov) // 16-19
+ rounds12to59(m1, 5, m0, m2, sha256msg1, vmov) // 20-23
+ rounds12to59(m2, 6, m1, m3, sha256msg1, vmov) // 24-27
+ rounds12to59(m3, 7, m2, m0, sha256msg1, vmov) // 28-31
+ rounds12to59(m0, 8, m3, m1, sha256msg1, vmov) // 32-35
+ rounds12to59(m1, 9, m0, m2, sha256msg1, vmov) // 36-39
+ rounds12to59(m2, 10, m1, m3, sha256msg1, vmov) // 40-43
+ rounds12to59(m3, 11, m2, m0, sha256msg1, vmov) // 44-47
+ rounds12to59(m0, 12, m3, m1, sha256msg1, vmov) // 48-51
+ rounds12to59(m1, 13, m0, m2, nop, vmov) // 52-55
+ rounds12to59(m2, 14, m1, m3, nop, vmov) // 56-59
+
+ Comment("do rounds 60-63")
+ VMOVDQA(m3, msg)
+ PADDD(Mem{Base: sha256Constants}.Offset(15*32), msg)
+ SHA256RNDS2(msg, state0, state1)
+ PSHUFD(Imm(0x0e), msg, msg)
+ SHA256RNDS2(msg, state1, state0)
+
+ Comment("add current hash values with previously saved")
+ PADDD(abefSave, state0)
+ PADDD(cdghSave, state1)
+
+ Comment("advance data pointer; loop until buffer empty")
+ ADDQ(Imm(64), dataPtr)
+ CMPQ(numBytes, dataPtr)
+ JNE(LabelRef("roundLoop"))
+
+ Comment("write hash values back in the correct order")
+ PSHUFD(Imm(0x1b), state0, state0)
+ PSHUFD(Imm(0xb1), state1, state1)
+ VMOVDQA(state0, m4)
+ PBLENDW(Imm(0xf0), state1, state0)
+ PALIGNR(Imm(8), m4, state1)
+ VMOVDQU(state0, Mem{Base: digestPtr}.Offset(0*16))
+ VMOVDQU(state1, Mem{Base: digestPtr}.Offset(1*16))
+}
+
+func done() {
+ Label("done")
+ RET()
+}
+
+/**~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~**/
+
+// Pointers for memoizing Data section symbols
+var flip_maskPtr, shuff_00BAPtr, shuff_DC00Ptr, K256Ptr *Mem
+
+// shuffle byte order from LE to BE
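+// Each mask byte is the source-byte index that PSHUFB/VPSHUFB copies into
+// that position, so 03 02 01 00 ... reverses the bytes within every 32-bit
+// word.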
+func flip_mask_DATA() Mem {
+ if flip_maskPtr != nil {
+ return *flip_maskPtr
+ }
+
+ flip_mask := GLOBL("flip_mask", RODATA)
+ flip_maskPtr = &flip_mask
+
+ DATA(0x00, U64(0x0405060700010203))
+ DATA(0x08, U64(0x0c0d0e0f08090a0b))
+ DATA(0x10, U64(0x0405060700010203))
+ DATA(0x18, U64(0x0c0d0e0f08090a0b))
+ return flip_mask
+}
+
+// shuffle xBxA -> 00BA
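+// The 0xFF control bytes have their high bit set, which makes PSHUFB write
+// zeros into those destination bytes.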
+func shuff_00BA_DATA() Mem {
+ if shuff_00BAPtr != nil {
+ return *shuff_00BAPtr
+ }
+
+ shuff_00BA := GLOBL("shuff_00BA", RODATA)
+ shuff_00BAPtr = &shuff_00BA
+
+ DATA(0x00, U64(0x0b0a090803020100))
+ DATA(0x08, U64(0xFFFFFFFFFFFFFFFF))
+ DATA(0x10, U64(0x0b0a090803020100))
+ DATA(0x18, U64(0xFFFFFFFFFFFFFFFF))
+ return shuff_00BA
+}
+
+// shuffle xDxC -> DC00
+func shuff_DC00_DATA() Mem {
+ if shuff_DC00Ptr != nil {
+ return *shuff_DC00Ptr
+ }
+
+ shuff_DC00 := GLOBL("shuff_DC00", RODATA)
+ shuff_DC00Ptr = &shuff_DC00
+
+ DATA(0x00, U64(0xFFFFFFFFFFFFFFFF))
+ DATA(0x08, U64(0x0b0a090803020100))
+ DATA(0x10, U64(0xFFFFFFFFFFFFFFFF))
+ DATA(0x18, U64(0x0b0a090803020100))
+ return shuff_DC00
+}
+
+var _K = []uint32{
+ 0x428a2f98,
+ 0x71374491,
+ 0xb5c0fbcf,
+ 0xe9b5dba5,
+ 0x3956c25b,
+ 0x59f111f1,
+ 0x923f82a4,
+ 0xab1c5ed5,
+ 0xd807aa98,
+ 0x12835b01,
+ 0x243185be,
+ 0x550c7dc3,
+ 0x72be5d74,
+ 0x80deb1fe,
+ 0x9bdc06a7,
+ 0xc19bf174,
+ 0xe49b69c1,
+ 0xefbe4786,
+ 0x0fc19dc6,
+ 0x240ca1cc,
+ 0x2de92c6f,
+ 0x4a7484aa,
+ 0x5cb0a9dc,
+ 0x76f988da,
+ 0x983e5152,
+ 0xa831c66d,
+ 0xb00327c8,
+ 0xbf597fc7,
+ 0xc6e00bf3,
+ 0xd5a79147,
+ 0x06ca6351,
+ 0x14292967,
+ 0x27b70a85,
+ 0x2e1b2138,
+ 0x4d2c6dfc,
+ 0x53380d13,
+ 0x650a7354,
+ 0x766a0abb,
+ 0x81c2c92e,
+ 0x92722c85,
+ 0xa2bfe8a1,
+ 0xa81a664b,
+ 0xc24b8b70,
+ 0xc76c51a3,
+ 0xd192e819,
+ 0xd6990624,
+ 0xf40e3585,
+ 0x106aa070,
+ 0x19a4c116,
+ 0x1e376c08,
+ 0x2748774c,
+ 0x34b0bcb5,
+ 0x391c0cb3,
+ 0x4ed8aa4a,
+ 0x5b9cca4f,
+ 0x682e6ff3,
+ 0x748f82ee,
+ 0x78a5636f,
+ 0x84c87814,
+ 0x8cc70208,
+ 0x90befffa,
+ 0xa4506ceb,
+ 0xbef9a3f7,
+ 0xc67178f2,
+}
+
+// Round specific constants
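+// Each group of four constants is written twice so that a 32-byte (YMM) load
+// sees the same four K values in both 128-bit lanes; the sha-ni path indexes
+// the table with a 32-byte stride and uses only the first copy of each group.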
+func K256_DATA() Mem {
+ if K256Ptr != nil {
+ return *K256Ptr
+ }
+
+ K256 := GLOBL("K256", NOPTR+RODATA)
+ K256Ptr = &K256
+
+ offset_idx := 0
+
+ for i := 0; i < len(_K); i += 4 {
+ DATA((offset_idx+0)*4, U32(_K[i+0])) // k1
+ DATA((offset_idx+1)*4, U32(_K[i+1])) // k2
+ DATA((offset_idx+2)*4, U32(_K[i+2])) // k3
+ DATA((offset_idx+3)*4, U32(_K[i+3])) // k4
+
+ DATA((offset_idx+4)*4, U32(_K[i+0])) // k1
+ DATA((offset_idx+5)*4, U32(_K[i+1])) // k2
+ DATA((offset_idx+6)*4, U32(_K[i+2])) // k3
+ DATA((offset_idx+7)*4, U32(_K[i+3])) // k4
+ offset_idx += 8
+ }
+ return K256
+}
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run sha256block_amd64_asm.go -out ../sha256block_amd64.s -pkg sha256. DO NOT EDIT.
//go:build !purego
#include "textflag.h"
-// SHA256 block routine. See sha256block.go for Go equivalent.
-//
-// The algorithm is detailed in FIPS 180-4:
-//
-// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
-
-// The avx2-version is described in an Intel White-Paper:
-// "Fast SHA-256 Implementations on Intel Architecture Processors"
-// To find it, surf to http://www.intel.com/p/en_US/embedded
-// and search for that title.
-// AVX2 version by Intel, same algorithm as code in Linux kernel:
-// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
-// by
-// James Guilford <james.guilford@intel.com>
-// Kirk Yap <kirk.s.yap@intel.com>
-// Tim Chen <tim.c.chen@linux.intel.com>
-
-// Wt = Mt; for 0 <= t <= 15
-// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
-//
-// a = H0
-// b = H1
-// c = H2
-// d = H3
-// e = H4
-// f = H5
-// g = H6
-// h = H7
-//
-// for t = 0 to 63 {
-// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
-// T2 = BIGSIGMA0(a) + Maj(a,b,c)
-// h = g
-// g = f
-// f = e
-// e = d + T1
-// d = c
-// c = b
-// b = a
-// a = T1 + T2
-// }
-//
-// H0 = a + H0
-// H1 = b + H1
-// H2 = c + H2
-// H3 = d + H3
-// H4 = e + H4
-// H5 = f + H5
-// H6 = g + H6
-// H7 = h + H7
-
-// Wt = Mt; for 0 <= t <= 15
-#define MSGSCHEDULE0(index) \
- MOVL (index*4)(SI), AX; \
- BSWAPL AX; \
- MOVL AX, (index*4)(BP)
-
-// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
-// SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
-// SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
-#define MSGSCHEDULE1(index) \
- MOVL ((index-2)*4)(BP), AX; \
- MOVL AX, CX; \
- RORL $17, AX; \
- MOVL CX, DX; \
- RORL $19, CX; \
- SHRL $10, DX; \
- MOVL ((index-15)*4)(BP), BX; \
- XORL CX, AX; \
- MOVL BX, CX; \
- XORL DX, AX; \
- RORL $7, BX; \
- MOVL CX, DX; \
- SHRL $3, DX; \
- RORL $18, CX; \
- ADDL ((index-7)*4)(BP), AX; \
- XORL CX, BX; \
- XORL DX, BX; \
- ADDL ((index-16)*4)(BP), BX; \
- ADDL BX, AX; \
- MOVL AX, ((index)*4)(BP)
-
-// Calculate T1 in AX - uses AX, CX and DX registers.
-// h is also used as an accumulator. Wt is passed in AX.
-// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
-// BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
-// Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
-#define SHA256T1(const, e, f, g, h) \
- ADDL AX, h; \
- MOVL e, AX; \
- ADDL $const, h; \
- MOVL e, CX; \
- RORL $6, AX; \
- MOVL e, DX; \
- RORL $11, CX; \
- XORL CX, AX; \
- MOVL e, CX; \
- RORL $25, DX; \
- ANDL f, CX; \
- XORL AX, DX; \
- MOVL e, AX; \
- NOTL AX; \
- ADDL DX, h; \
- ANDL g, AX; \
- XORL CX, AX; \
- ADDL h, AX
-
-// Calculate T2 in BX - uses BX, CX, DX and DI registers.
-// T2 = BIGSIGMA0(a) + Maj(a, b, c)
-// BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
-// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
-#define SHA256T2(a, b, c) \
- MOVL a, DI; \
- MOVL c, BX; \
- RORL $2, DI; \
- MOVL a, DX; \
- ANDL b, BX; \
- RORL $13, DX; \
- MOVL a, CX; \
- ANDL c, CX; \
- XORL DX, DI; \
- XORL CX, BX; \
- MOVL a, DX; \
- MOVL b, CX; \
- RORL $22, DX; \
- ANDL a, CX; \
- XORL CX, BX; \
- XORL DX, DI; \
- ADDL DI, BX
-
-// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
-// The values for e and a are stored in d and h, ready for rotation.
-#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
- SHA256T1(const, e, f, g, h); \
- SHA256T2(a, b, c); \
- MOVL BX, h; \
- ADDL AX, d; \
- ADDL AX, h
-
-#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
- MSGSCHEDULE0(index); \
- SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
-
-#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
- MSGSCHEDULE1(index); \
- SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
-
-
-// Definitions for AVX2 version
-
-// addm (mem), reg
-// Add reg to mem using reg-mem add and store
-#define addm(P1, P2) \
- ADDL P2, P1; \
- MOVL P1, P2
-
-#define XDWORD0 Y4
-#define XDWORD1 Y5
-#define XDWORD2 Y6
-#define XDWORD3 Y7
-
-#define XWORD0 X4
-#define XWORD1 X5
-#define XWORD2 X6
-#define XWORD3 X7
-
-#define XTMP0 Y0
-#define XTMP1 Y1
-#define XTMP2 Y2
-#define XTMP3 Y3
-#define XTMP4 Y8
-#define XTMP5 Y11
-
-#define XFER Y9
-
-#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
-#define X_BYTE_FLIP_MASK X13
-
-#define NUM_BYTES DX
-#define INP DI
-
-#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
-
-#define a AX
-#define b BX
-#define c CX
-#define d R8
-#define e DX
-#define f R9
-#define g R10
-#define h R11
-
-#define old_h R11
-
-#define TBL BP
-
-#define SRND SI // SRND is same register as CTX
-
-#define T1 R12
-
-#define y0 R13
-#define y1 R14
-#define y2 R15
-#define y3 DI
-
-// Offsets
-#define XFER_SIZE 2*64*4
-#define INP_END_SIZE 8
-#define INP_SIZE 8
-
-#define _XFER 0
-#define _INP_END _XFER + XFER_SIZE
-#define _INP _INP_END + INP_END_SIZE
-#define STACK_SIZE _INP + INP_SIZE
-
-#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
- ; \ // ############################# RND N + 0 ############################//
- MOVL a, y3; \ // y3 = a // MAJA
- RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
- RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
- ; \
- ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // disp = k + w
- ORL c, y3; \ // y3 = a|c // MAJA
- VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
- MOVL f, y2; \ // y2 = f // CH
- RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
- ; \
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
- XORL g, y2; \ // y2 = f^g // CH
- VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16] // y1 = (e >> 6) // S1
- RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
- ; \
- ANDL e, y2; \ // y2 = (f^g)&e // CH
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
- RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
- ADDL h, d; \ // d = k + w + h + d // --
- ; \
- ANDL b, y3; \ // y3 = (a|c)&b // MAJA
- VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
- RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
- ; \
- XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
- VPSRLD $7, XTMP1, XTMP2; \
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
- MOVL a, T1; \ // T1 = a // MAJB
- ANDL c, T1; \ // T1 = a&c // MAJB
- ; \
- ADDL y0, y2; \ // y2 = S1 + CH // --
- VPSLLD $(32-7), XTMP1, XTMP3; \
- ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
- ADDL y1, h; \ // h = k + w + h + S0 // --
- ; \
- ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
- VPOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7
- ; \
- VPSRLD $18, XTMP1, XTMP2; \
- ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
- ADDL y3, h // h = t1 + S0 + MAJ // --
-
-#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
- ; \ // ################################### RND N + 1 ############################
- ; \
- MOVL a, y3; \ // y3 = a // MAJA
- RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
- RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
- ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
- ORL c, y3; \ // y3 = a|c // MAJA
- ; \
- VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
- MOVL f, y2; \ // y2 = f // CH
- RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
- XORL g, y2; \ // y2 = f^g // CH
- ; \
- RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
- RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
- ANDL e, y2; \ // y2 = (f^g)&e // CH
- ADDL h, d; \ // d = k + w + h + d // --
- ; \
- VPSLLD $(32-18), XTMP1, XTMP1; \
- ANDL b, y3; \ // y3 = (a|c)&b // MAJA
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
- ; \
- VPXOR XTMP1, XTMP3, XTMP3; \
- RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
- XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
- ; \
- VPXOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
- MOVL a, T1; \ // T1 = a // MAJB
- ANDL c, T1; \ // T1 = a&c // MAJB
- ADDL y0, y2; \ // y2 = S1 + CH // --
- ; \
- VPXOR XTMP4, XTMP3, XTMP1; \ // XTMP1 = s0
- VPSHUFD $0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA}
- ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
- ADDL y1, h; \ // h = k + w + h + S0 // --
- ; \
- VPADDD XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0
- ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
- ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
- ADDL y3, h; \ // h = t1 + S0 + MAJ // --
- ; \
- VPSRLD $10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA}
-
-#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
- ; \ // ################################### RND N + 2 ############################
- ; \
- MOVL a, y3; \ // y3 = a // MAJA
- RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
- ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
- ; \
- VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
- RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
- ORL c, y3; \ // y3 = a|c // MAJA
- MOVL f, y2; \ // y2 = f // CH
- XORL g, y2; \ // y2 = f^g // CH
- ; \
- RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
- VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA}
- ANDL e, y2; \ // y2 = (f^g)&e // CH
- ; \
- RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
- VPXOR XTMP3, XTMP2, XTMP2; \
- ADDL h, d; \ // d = k + w + h + d // --
- ANDL b, y3; \ // y3 = (a|c)&b // MAJA
- ; \
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
- RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
- VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA}
- XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
- ; \
- VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA}
- ; \
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
- RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
- VPADDD XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]}
- ; \
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
- MOVL a, T1; \ // T1 = a // MAJB
- ANDL c, T1; \ // T1 = a&c // MAJB
- ADDL y0, y2; \ // y2 = S1 + CH // --
- VPSHUFD $80, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC}
- ; \
- ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
- ADDL y1, h; \ // h = k + w + h + S0 // --
- ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
- ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
- ; \
- ADDL y3, h // h = t1 + S0 + MAJ // --
-
-#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
- ; \ // ################################### RND N + 3 ############################
- ; \
- MOVL a, y3; \ // y3 = a // MAJA
- RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
- RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
- ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
- ORL c, y3; \ // y3 = a|c // MAJA
- ; \
- VPSRLD $10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC}
- MOVL f, y2; \ // y2 = f // CH
- RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
- XORL g, y2; \ // y2 = f^g // CH
- ; \
- VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC}
- RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
- ANDL e, y2; \ // y2 = (f^g)&e // CH
- ADDL h, d; \ // d = k + w + h + d // --
- ANDL b, y3; \ // y3 = (a|c)&b // MAJA
- ; \
- VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC}
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
- XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
- ; \
- VPXOR XTMP3, XTMP2, XTMP2; \
- RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
- ADDL y0, y2; \ // y2 = S1 + CH // --
- ; \
- VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC}
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
- ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
- ; \
- RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
- ; \
- VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00}
- ; \
- VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
- MOVL a, T1; \ // T1 = a // MAJB
- ANDL c, T1; \ // T1 = a&c // MAJB
- ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
- ; \
- ADDL y1, h; \ // h = k + w + h + S0 // --
- ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
- ADDL y3, h // h = t1 + S0 + MAJ // --
-
-#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
- ; \ // ################################### RND N + 0 ###########################
- MOVL f, y2; \ // y2 = f // CH
- RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
- RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
- XORL g, y2; \ // y2 = f^g // CH
- ; \
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
- RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
- ANDL e, y2; \ // y2 = (f^g)&e // CH
- ; \
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
- RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
- XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
- RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
- MOVL a, y3; \ // y3 = a // MAJA
- ; \
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
- RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
- ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
- ORL c, y3; \ // y3 = a|c // MAJA
- ; \
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
- MOVL a, T1; \ // T1 = a // MAJB
- ANDL b, y3; \ // y3 = (a|c)&b // MAJA
- ANDL c, T1; \ // T1 = a&c // MAJB
- ADDL y0, y2; \ // y2 = S1 + CH // --
- ; \
- ADDL h, d; \ // d = k + w + h + d // --
- ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
- ADDL y1, h; \ // h = k + w + h + S0 // --
- ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --
-
-#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
- ; \ // ################################### RND N + 1 ###########################
- ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
- MOVL f, y2; \ // y2 = f // CH
- RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
- RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
- XORL g, y2; \ // y2 = f^g // CH
- ; \
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
- RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
- ANDL e, y2; \ // y2 = (f^g)&e // CH
- ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
- ; \
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
- RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
- XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
- RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
- MOVL a, y3; \ // y3 = a // MAJA
- ; \
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
- RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
- ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
- ORL c, y3; \ // y3 = a|c // MAJA
- ; \
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
- MOVL a, T1; \ // T1 = a // MAJB
- ANDL b, y3; \ // y3 = (a|c)&b // MAJA
- ANDL c, T1; \ // T1 = a&c // MAJB
- ADDL y0, y2; \ // y2 = S1 + CH // --
- ; \
- ADDL h, d; \ // d = k + w + h + d // --
- ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
- ADDL y1, h; \ // h = k + w + h + S0 // --
- ; \
- ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --
-
-#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
- ; \ // ################################### RND N + 2 ##############################
- ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
- MOVL f, y2; \ // y2 = f // CH
- RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
- RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
- XORL g, y2; \ // y2 = f^g // CH
- ; \
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
- RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
- ANDL e, y2; \ // y2 = (f^g)&e // CH
- ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
- ; \
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
- RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
- XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
- RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
- MOVL a, y3; \ // y3 = a // MAJA
- ; \
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
- RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
- ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
- ORL c, y3; \ // y3 = a|c // MAJA
- ; \
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
- MOVL a, T1; \ // T1 = a // MAJB
- ANDL b, y3; \ // y3 = (a|c)&b // MAJA
- ANDL c, T1; \ // T1 = a&c // MAJB
- ADDL y0, y2; \ // y2 = S1 + CH // --
- ; \
- ADDL h, d; \ // d = k + w + h + d // --
- ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
- ADDL y1, h; \ // h = k + w + h + S0 // --
- ; \
- ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --
-
-#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
- ; \ // ################################### RND N + 3 ###########################
- ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
- MOVL f, y2; \ // y2 = f // CH
- RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
- RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
- XORL g, y2; \ // y2 = f^g // CH
- ; \
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
- RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
- ANDL e, y2; \ // y2 = (f^g)&e // CH
- ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
- ; \
- XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
- RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
- XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
- RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
- MOVL a, y3; \ // y3 = a // MAJA
- ; \
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
- RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
- ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
- ORL c, y3; \ // y3 = a|c // MAJA
- ; \
- XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
- MOVL a, T1; \ // T1 = a // MAJB
- ANDL b, y3; \ // y3 = (a|c)&b // MAJA
- ANDL c, T1; \ // T1 = a&c // MAJB
- ADDL y0, y2; \ // y2 = S1 + CH // --
- ; \
- ADDL h, d; \ // d = k + w + h + d // --
- ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
- ADDL y1, h; \ // h = k + w + h + S0 // --
- ; \
- ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
- ; \
- ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
- ; \
- ADDL y3, h // h = t1 + S0 + MAJ // --
-
-// Definitions for sha-ni version
-//
-// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2
-// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version
-//
-// Reference
-// S. Gulley, et al, "New Instructions Supporting the Secure Hash
-// Algorithm on Intel® Architecture Processors", July 2013
-// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
-//
-
-#define digestPtr DI // input/output, base pointer to digest hash vector H0, H1, ..., H7
-#define dataPtr SI // input, base pointer to first input data block
-#define numBytes DX // input, number of input bytes to be processed
-#define sha256Constants AX // round contents from K256 table, indexed by round number x 32
-#define msg X0 // input data
-#define state0 X1 // round intermediates and outputs
-#define state1 X2
-#define m0 X3 // m0, m1,... m4 -- round message temps
-#define m1 X4
-#define m2 X5
-#define m3 X6
-#define m4 X7
-#define shufMask X8 // input data endian conversion control mask
-#define abefSave X9 // digest hash vector inter-block buffer abef
-#define cdghSave X10 // digest hash vector inter-block buffer cdgh
-
-#define nop(m,a) // nop instead of final SHA256MSG1 for first and last few rounds
-
-#define sha256msg1(m,a) \ // final SHA256MSG1 for middle rounds that require it
- SHA256MSG1 m, a
-
-#define vmov(a,b) \ // msg copy for all but rounds 12-15
- VMOVDQA a, b
-
-#define vmovrev(a,b) \ // reverse copy for rounds 12-15
- VMOVDQA b, a
-
-// sha rounds 0 to 11
-// identical with the exception of the final msg op
-// which is replaced with a nop for rounds where it is not needed
-// refer to Gulley, et al for more information
-#define rounds0to11(m,a,c,sha256Msg1) \
- VMOVDQU c*16(dataPtr), msg \
- PSHUFB shufMask, msg \
- VMOVDQA msg, m \
- PADDD (c*32)(sha256Constants), msg \
- SHA256RNDS2 msg, state0, state1 \
- PSHUFD $0x0e, msg, msg \
- SHA256RNDS2 msg, state1, state0 \
- sha256Msg1 (m,a)
-
-// sha rounds 12 to 59
-// identical with the exception of the final msg op
-// and the reverse copy(m,msg) in round 12 which is required
-// after the last data load
-// refer to Gulley, et al for more information
-#define rounds12to59(m,c,a,t,sha256Msg1,movop) \
- movop (m,msg) \
- PADDD (c*32)(sha256Constants), msg \
- SHA256RNDS2 msg, state0, state1 \
- VMOVDQA m, m4 \
- PALIGNR $4, a, m4 \
- PADDD m4, t \
- SHA256MSG2 m, t \
- PSHUFD $0x0e, msg, msg \
- SHA256RNDS2 msg, state1, state0 \
- sha256Msg1 (m,a)
-
-TEXT ·block(SB), 0, $536-32
- CMPB ·useSHA(SB), $1
- JE sha_ni
- CMPB ·useAVX2(SB), $1
- JE avx2
-
+// func block(dig *digest, p []byte)
+// Requires: AVX, AVX2, BMI2, SHA, SSE2, SSE4.1, SSSE3
+TEXT ·block(SB), $536-32
+ CMPB ·useSHA+0(SB), $0x01
+ JE sha_ni
+ CMPB ·useAVX2+0(SB), $0x01
+ JE avx2
MOVQ p_base+8(FP), SI
MOVQ p_len+16(FP), DX
- SHRQ $6, DX
- SHLQ $6, DX
-
+ SHRQ $0x06, DX
+ SHLQ $0x06, DX
LEAQ (SI)(DX*1), DI
MOVQ DI, 256(SP)
CMPQ SI, DI
JEQ end
-
MOVQ dig+0(FP), BP
- MOVL (0*4)(BP), R8 // a = H0
- MOVL (1*4)(BP), R9 // b = H1
- MOVL (2*4)(BP), R10 // c = H2
- MOVL (3*4)(BP), R11 // d = H3
- MOVL (4*4)(BP), R12 // e = H4
- MOVL (5*4)(BP), R13 // f = H5
- MOVL (6*4)(BP), R14 // g = H6
- MOVL (7*4)(BP), R15 // h = H7
+ MOVL (BP), R8
+ MOVL 4(BP), R9
+ MOVL 8(BP), R10
+ MOVL 12(BP), R11
+ MOVL 16(BP), R12
+ MOVL 20(BP), R13
+ MOVL 24(BP), R14
+ MOVL 28(BP), R15
loop:
- MOVQ SP, BP
-
- SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
- SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
- SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
- SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
- SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
- SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
- SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
- SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
- SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
- SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
- SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
- SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
- SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
- SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
- SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
- SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)
-
- SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
- SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
- SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
- SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
- SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
- SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
- SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
- SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
- SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
- SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
- SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
- SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
- SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
- SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
- SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
- SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
- SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
- SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
- SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
- SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
- SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
- SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
- SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
- SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
- SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
- SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
- SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
- SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
- SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
- SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
- SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
- SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
- SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
- SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
- SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
- SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
- SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
- SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
- SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
- SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
- SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
- SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
- SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
- SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
- SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
- SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
- SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
- SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)
-
- MOVQ dig+0(FP), BP
- ADDL (0*4)(BP), R8 // H0 = a + H0
- MOVL R8, (0*4)(BP)
- ADDL (1*4)(BP), R9 // H1 = b + H1
- MOVL R9, (1*4)(BP)
- ADDL (2*4)(BP), R10 // H2 = c + H2
- MOVL R10, (2*4)(BP)
- ADDL (3*4)(BP), R11 // H3 = d + H3
- MOVL R11, (3*4)(BP)
- ADDL (4*4)(BP), R12 // H4 = e + H4
- MOVL R12, (4*4)(BP)
- ADDL (5*4)(BP), R13 // H5 = f + H5
- MOVL R13, (5*4)(BP)
- ADDL (6*4)(BP), R14 // H6 = g + H6
- MOVL R14, (6*4)(BP)
- ADDL (7*4)(BP), R15 // H7 = h + H7
- MOVL R15, (7*4)(BP)
-
- ADDQ $64, SI
- CMPQ SI, 256(SP)
- JB loop
+ MOVQ SP, BP
+ MOVL (SI), AX
+ BSWAPL AX
+ MOVL AX, (BP)
+ ADDL AX, R15
+ MOVL R12, AX
+ ADDL $0x428a2f98, R15
+ MOVL R12, CX
+ RORL $0x06, AX
+ MOVL R12, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R12, CX
+ RORL $0x19, DX
+ ANDL R13, CX
+ XORL AX, DX
+ MOVL R12, AX
+ NOTL AX
+ ADDL DX, R15
+ ANDL R14, AX
+ XORL CX, AX
+ ADDL R15, AX
+ MOVL R8, DI
+ MOVL R10, BX
+ RORL $0x02, DI
+ MOVL R8, DX
+ ANDL R9, BX
+ RORL $0x0d, DX
+ MOVL R8, CX
+ ANDL R10, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R8, DX
+ MOVL R9, CX
+ RORL $0x16, DX
+ ANDL R8, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R15
+ ADDL AX, R11
+ ADDL AX, R15
+ MOVL 4(SI), AX
+ BSWAPL AX
+ MOVL AX, 4(BP)
+ ADDL AX, R14
+ MOVL R11, AX
+ ADDL $0x71374491, R14
+ MOVL R11, CX
+ RORL $0x06, AX
+ MOVL R11, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R11, CX
+ RORL $0x19, DX
+ ANDL R12, CX
+ XORL AX, DX
+ MOVL R11, AX
+ NOTL AX
+ ADDL DX, R14
+ ANDL R13, AX
+ XORL CX, AX
+ ADDL R14, AX
+ MOVL R15, DI
+ MOVL R9, BX
+ RORL $0x02, DI
+ MOVL R15, DX
+ ANDL R8, BX
+ RORL $0x0d, DX
+ MOVL R15, CX
+ ANDL R9, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R15, DX
+ MOVL R8, CX
+ RORL $0x16, DX
+ ANDL R15, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R14
+ ADDL AX, R10
+ ADDL AX, R14
+ MOVL 8(SI), AX
+ BSWAPL AX
+ MOVL AX, 8(BP)
+ ADDL AX, R13
+ MOVL R10, AX
+ ADDL $0xb5c0fbcf, R13
+ MOVL R10, CX
+ RORL $0x06, AX
+ MOVL R10, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R10, CX
+ RORL $0x19, DX
+ ANDL R11, CX
+ XORL AX, DX
+ MOVL R10, AX
+ NOTL AX
+ ADDL DX, R13
+ ANDL R12, AX
+ XORL CX, AX
+ ADDL R13, AX
+ MOVL R14, DI
+ MOVL R8, BX
+ RORL $0x02, DI
+ MOVL R14, DX
+ ANDL R15, BX
+ RORL $0x0d, DX
+ MOVL R14, CX
+ ANDL R8, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R14, DX
+ MOVL R15, CX
+ RORL $0x16, DX
+ ANDL R14, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R13
+ ADDL AX, R9
+ ADDL AX, R13
+ MOVL 12(SI), AX
+ BSWAPL AX
+ MOVL AX, 12(BP)
+ ADDL AX, R12
+ MOVL R9, AX
+ ADDL $0xe9b5dba5, R12
+ MOVL R9, CX
+ RORL $0x06, AX
+ MOVL R9, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R9, CX
+ RORL $0x19, DX
+ ANDL R10, CX
+ XORL AX, DX
+ MOVL R9, AX
+ NOTL AX
+ ADDL DX, R12
+ ANDL R11, AX
+ XORL CX, AX
+ ADDL R12, AX
+ MOVL R13, DI
+ MOVL R15, BX
+ RORL $0x02, DI
+ MOVL R13, DX
+ ANDL R14, BX
+ RORL $0x0d, DX
+ MOVL R13, CX
+ ANDL R15, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R13, DX
+ MOVL R14, CX
+ RORL $0x16, DX
+ ANDL R13, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R12
+ ADDL AX, R8
+ ADDL AX, R12
+ MOVL 16(SI), AX
+ BSWAPL AX
+ MOVL AX, 16(BP)
+ ADDL AX, R11
+ MOVL R8, AX
+ ADDL $0x3956c25b, R11
+ MOVL R8, CX
+ RORL $0x06, AX
+ MOVL R8, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R8, CX
+ RORL $0x19, DX
+ ANDL R9, CX
+ XORL AX, DX
+ MOVL R8, AX
+ NOTL AX
+ ADDL DX, R11
+ ANDL R10, AX
+ XORL CX, AX
+ ADDL R11, AX
+ MOVL R12, DI
+ MOVL R14, BX
+ RORL $0x02, DI
+ MOVL R12, DX
+ ANDL R13, BX
+ RORL $0x0d, DX
+ MOVL R12, CX
+ ANDL R14, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R12, DX
+ MOVL R13, CX
+ RORL $0x16, DX
+ ANDL R12, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R11
+ ADDL AX, R15
+ ADDL AX, R11
+ MOVL 20(SI), AX
+ BSWAPL AX
+ MOVL AX, 20(BP)
+ ADDL AX, R10
+ MOVL R15, AX
+ ADDL $0x59f111f1, R10
+ MOVL R15, CX
+ RORL $0x06, AX
+ MOVL R15, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R15, CX
+ RORL $0x19, DX
+ ANDL R8, CX
+ XORL AX, DX
+ MOVL R15, AX
+ NOTL AX
+ ADDL DX, R10
+ ANDL R9, AX
+ XORL CX, AX
+ ADDL R10, AX
+ MOVL R11, DI
+ MOVL R13, BX
+ RORL $0x02, DI
+ MOVL R11, DX
+ ANDL R12, BX
+ RORL $0x0d, DX
+ MOVL R11, CX
+ ANDL R13, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R11, DX
+ MOVL R12, CX
+ RORL $0x16, DX
+ ANDL R11, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R10
+ ADDL AX, R14
+ ADDL AX, R10
+ MOVL 24(SI), AX
+ BSWAPL AX
+ MOVL AX, 24(BP)
+ ADDL AX, R9
+ MOVL R14, AX
+ ADDL $0x923f82a4, R9
+ MOVL R14, CX
+ RORL $0x06, AX
+ MOVL R14, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R14, CX
+ RORL $0x19, DX
+ ANDL R15, CX
+ XORL AX, DX
+ MOVL R14, AX
+ NOTL AX
+ ADDL DX, R9
+ ANDL R8, AX
+ XORL CX, AX
+ ADDL R9, AX
+ MOVL R10, DI
+ MOVL R12, BX
+ RORL $0x02, DI
+ MOVL R10, DX
+ ANDL R11, BX
+ RORL $0x0d, DX
+ MOVL R10, CX
+ ANDL R12, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R10, DX
+ MOVL R11, CX
+ RORL $0x16, DX
+ ANDL R10, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R9
+ ADDL AX, R13
+ ADDL AX, R9
+ MOVL 28(SI), AX
+ BSWAPL AX
+ MOVL AX, 28(BP)
+ ADDL AX, R8
+ MOVL R13, AX
+ ADDL $0xab1c5ed5, R8
+ MOVL R13, CX
+ RORL $0x06, AX
+ MOVL R13, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R13, CX
+ RORL $0x19, DX
+ ANDL R14, CX
+ XORL AX, DX
+ MOVL R13, AX
+ NOTL AX
+ ADDL DX, R8
+ ANDL R15, AX
+ XORL CX, AX
+ ADDL R8, AX
+ MOVL R9, DI
+ MOVL R11, BX
+ RORL $0x02, DI
+ MOVL R9, DX
+ ANDL R10, BX
+ RORL $0x0d, DX
+ MOVL R9, CX
+ ANDL R11, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R9, DX
+ MOVL R10, CX
+ RORL $0x16, DX
+ ANDL R9, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R8
+ ADDL AX, R12
+ ADDL AX, R8
+ MOVL 32(SI), AX
+ BSWAPL AX
+ MOVL AX, 32(BP)
+ ADDL AX, R15
+ MOVL R12, AX
+ ADDL $0xd807aa98, R15
+ MOVL R12, CX
+ RORL $0x06, AX
+ MOVL R12, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R12, CX
+ RORL $0x19, DX
+ ANDL R13, CX
+ XORL AX, DX
+ MOVL R12, AX
+ NOTL AX
+ ADDL DX, R15
+ ANDL R14, AX
+ XORL CX, AX
+ ADDL R15, AX
+ MOVL R8, DI
+ MOVL R10, BX
+ RORL $0x02, DI
+ MOVL R8, DX
+ ANDL R9, BX
+ RORL $0x0d, DX
+ MOVL R8, CX
+ ANDL R10, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R8, DX
+ MOVL R9, CX
+ RORL $0x16, DX
+ ANDL R8, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R15
+ ADDL AX, R11
+ ADDL AX, R15
+ MOVL 36(SI), AX
+ BSWAPL AX
+ MOVL AX, 36(BP)
+ ADDL AX, R14
+ MOVL R11, AX
+ ADDL $0x12835b01, R14
+ MOVL R11, CX
+ RORL $0x06, AX
+ MOVL R11, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R11, CX
+ RORL $0x19, DX
+ ANDL R12, CX
+ XORL AX, DX
+ MOVL R11, AX
+ NOTL AX
+ ADDL DX, R14
+ ANDL R13, AX
+ XORL CX, AX
+ ADDL R14, AX
+ MOVL R15, DI
+ MOVL R9, BX
+ RORL $0x02, DI
+ MOVL R15, DX
+ ANDL R8, BX
+ RORL $0x0d, DX
+ MOVL R15, CX
+ ANDL R9, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R15, DX
+ MOVL R8, CX
+ RORL $0x16, DX
+ ANDL R15, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R14
+ ADDL AX, R10
+ ADDL AX, R14
+ MOVL 40(SI), AX
+ BSWAPL AX
+ MOVL AX, 40(BP)
+ ADDL AX, R13
+ MOVL R10, AX
+ ADDL $0x243185be, R13
+ MOVL R10, CX
+ RORL $0x06, AX
+ MOVL R10, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R10, CX
+ RORL $0x19, DX
+ ANDL R11, CX
+ XORL AX, DX
+ MOVL R10, AX
+ NOTL AX
+ ADDL DX, R13
+ ANDL R12, AX
+ XORL CX, AX
+ ADDL R13, AX
+ MOVL R14, DI
+ MOVL R8, BX
+ RORL $0x02, DI
+ MOVL R14, DX
+ ANDL R15, BX
+ RORL $0x0d, DX
+ MOVL R14, CX
+ ANDL R8, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R14, DX
+ MOVL R15, CX
+ RORL $0x16, DX
+ ANDL R14, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R13
+ ADDL AX, R9
+ ADDL AX, R13
+ MOVL 44(SI), AX
+ BSWAPL AX
+ MOVL AX, 44(BP)
+ ADDL AX, R12
+ MOVL R9, AX
+ ADDL $0x550c7dc3, R12
+ MOVL R9, CX
+ RORL $0x06, AX
+ MOVL R9, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R9, CX
+ RORL $0x19, DX
+ ANDL R10, CX
+ XORL AX, DX
+ MOVL R9, AX
+ NOTL AX
+ ADDL DX, R12
+ ANDL R11, AX
+ XORL CX, AX
+ ADDL R12, AX
+ MOVL R13, DI
+ MOVL R15, BX
+ RORL $0x02, DI
+ MOVL R13, DX
+ ANDL R14, BX
+ RORL $0x0d, DX
+ MOVL R13, CX
+ ANDL R15, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R13, DX
+ MOVL R14, CX
+ RORL $0x16, DX
+ ANDL R13, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R12
+ ADDL AX, R8
+ ADDL AX, R12
+ MOVL 48(SI), AX
+ BSWAPL AX
+ MOVL AX, 48(BP)
+ ADDL AX, R11
+ MOVL R8, AX
+ ADDL $0x72be5d74, R11
+ MOVL R8, CX
+ RORL $0x06, AX
+ MOVL R8, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R8, CX
+ RORL $0x19, DX
+ ANDL R9, CX
+ XORL AX, DX
+ MOVL R8, AX
+ NOTL AX
+ ADDL DX, R11
+ ANDL R10, AX
+ XORL CX, AX
+ ADDL R11, AX
+ MOVL R12, DI
+ MOVL R14, BX
+ RORL $0x02, DI
+ MOVL R12, DX
+ ANDL R13, BX
+ RORL $0x0d, DX
+ MOVL R12, CX
+ ANDL R14, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R12, DX
+ MOVL R13, CX
+ RORL $0x16, DX
+ ANDL R12, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R11
+ ADDL AX, R15
+ ADDL AX, R11
+ MOVL 52(SI), AX
+ BSWAPL AX
+ MOVL AX, 52(BP)
+ ADDL AX, R10
+ MOVL R15, AX
+ ADDL $0x80deb1fe, R10
+ MOVL R15, CX
+ RORL $0x06, AX
+ MOVL R15, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R15, CX
+ RORL $0x19, DX
+ ANDL R8, CX
+ XORL AX, DX
+ MOVL R15, AX
+ NOTL AX
+ ADDL DX, R10
+ ANDL R9, AX
+ XORL CX, AX
+ ADDL R10, AX
+ MOVL R11, DI
+ MOVL R13, BX
+ RORL $0x02, DI
+ MOVL R11, DX
+ ANDL R12, BX
+ RORL $0x0d, DX
+ MOVL R11, CX
+ ANDL R13, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R11, DX
+ MOVL R12, CX
+ RORL $0x16, DX
+ ANDL R11, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R10
+ ADDL AX, R14
+ ADDL AX, R10
+ MOVL 56(SI), AX
+ BSWAPL AX
+ MOVL AX, 56(BP)
+ ADDL AX, R9
+ MOVL R14, AX
+ ADDL $0x9bdc06a7, R9
+ MOVL R14, CX
+ RORL $0x06, AX
+ MOVL R14, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R14, CX
+ RORL $0x19, DX
+ ANDL R15, CX
+ XORL AX, DX
+ MOVL R14, AX
+ NOTL AX
+ ADDL DX, R9
+ ANDL R8, AX
+ XORL CX, AX
+ ADDL R9, AX
+ MOVL R10, DI
+ MOVL R12, BX
+ RORL $0x02, DI
+ MOVL R10, DX
+ ANDL R11, BX
+ RORL $0x0d, DX
+ MOVL R10, CX
+ ANDL R12, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R10, DX
+ MOVL R11, CX
+ RORL $0x16, DX
+ ANDL R10, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R9
+ ADDL AX, R13
+ ADDL AX, R9
+ MOVL 60(SI), AX
+ BSWAPL AX
+ MOVL AX, 60(BP)
+ ADDL AX, R8
+ MOVL R13, AX
+ ADDL $0xc19bf174, R8
+ MOVL R13, CX
+ RORL $0x06, AX
+ MOVL R13, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R13, CX
+ RORL $0x19, DX
+ ANDL R14, CX
+ XORL AX, DX
+ MOVL R13, AX
+ NOTL AX
+ ADDL DX, R8
+ ANDL R15, AX
+ XORL CX, AX
+ ADDL R8, AX
+ MOVL R9, DI
+ MOVL R11, BX
+ RORL $0x02, DI
+ MOVL R9, DX
+ ANDL R10, BX
+ RORL $0x0d, DX
+ MOVL R9, CX
+ ANDL R11, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R9, DX
+ MOVL R10, CX
+ RORL $0x16, DX
+ ANDL R9, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R8
+ ADDL AX, R12
+ ADDL AX, R8
+ MOVL 56(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 4(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 36(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL (BP), BX
+ ADDL BX, AX
+ MOVL AX, 64(BP)
+ ADDL AX, R15
+ MOVL R12, AX
+ ADDL $0xe49b69c1, R15
+ MOVL R12, CX
+ RORL $0x06, AX
+ MOVL R12, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R12, CX
+ RORL $0x19, DX
+ ANDL R13, CX
+ XORL AX, DX
+ MOVL R12, AX
+ NOTL AX
+ ADDL DX, R15
+ ANDL R14, AX
+ XORL CX, AX
+ ADDL R15, AX
+ MOVL R8, DI
+ MOVL R10, BX
+ RORL $0x02, DI
+ MOVL R8, DX
+ ANDL R9, BX
+ RORL $0x0d, DX
+ MOVL R8, CX
+ ANDL R10, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R8, DX
+ MOVL R9, CX
+ RORL $0x16, DX
+ ANDL R8, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R15
+ ADDL AX, R11
+ ADDL AX, R15
+ MOVL 60(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 8(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 40(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 4(BP), BX
+ ADDL BX, AX
+ MOVL AX, 68(BP)
+ ADDL AX, R14
+ MOVL R11, AX
+ ADDL $0xefbe4786, R14
+ MOVL R11, CX
+ RORL $0x06, AX
+ MOVL R11, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R11, CX
+ RORL $0x19, DX
+ ANDL R12, CX
+ XORL AX, DX
+ MOVL R11, AX
+ NOTL AX
+ ADDL DX, R14
+ ANDL R13, AX
+ XORL CX, AX
+ ADDL R14, AX
+ MOVL R15, DI
+ MOVL R9, BX
+ RORL $0x02, DI
+ MOVL R15, DX
+ ANDL R8, BX
+ RORL $0x0d, DX
+ MOVL R15, CX
+ ANDL R9, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R15, DX
+ MOVL R8, CX
+ RORL $0x16, DX
+ ANDL R15, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R14
+ ADDL AX, R10
+ ADDL AX, R14
+ MOVL 64(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 12(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 44(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 8(BP), BX
+ ADDL BX, AX
+ MOVL AX, 72(BP)
+ ADDL AX, R13
+ MOVL R10, AX
+ ADDL $0x0fc19dc6, R13
+ MOVL R10, CX
+ RORL $0x06, AX
+ MOVL R10, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R10, CX
+ RORL $0x19, DX
+ ANDL R11, CX
+ XORL AX, DX
+ MOVL R10, AX
+ NOTL AX
+ ADDL DX, R13
+ ANDL R12, AX
+ XORL CX, AX
+ ADDL R13, AX
+ MOVL R14, DI
+ MOVL R8, BX
+ RORL $0x02, DI
+ MOVL R14, DX
+ ANDL R15, BX
+ RORL $0x0d, DX
+ MOVL R14, CX
+ ANDL R8, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R14, DX
+ MOVL R15, CX
+ RORL $0x16, DX
+ ANDL R14, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R13
+ ADDL AX, R9
+ ADDL AX, R13
+ MOVL 68(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 16(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 48(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 12(BP), BX
+ ADDL BX, AX
+ MOVL AX, 76(BP)
+ ADDL AX, R12
+ MOVL R9, AX
+ ADDL $0x240ca1cc, R12
+ MOVL R9, CX
+ RORL $0x06, AX
+ MOVL R9, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R9, CX
+ RORL $0x19, DX
+ ANDL R10, CX
+ XORL AX, DX
+ MOVL R9, AX
+ NOTL AX
+ ADDL DX, R12
+ ANDL R11, AX
+ XORL CX, AX
+ ADDL R12, AX
+ MOVL R13, DI
+ MOVL R15, BX
+ RORL $0x02, DI
+ MOVL R13, DX
+ ANDL R14, BX
+ RORL $0x0d, DX
+ MOVL R13, CX
+ ANDL R15, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R13, DX
+ MOVL R14, CX
+ RORL $0x16, DX
+ ANDL R13, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R12
+ ADDL AX, R8
+ ADDL AX, R12
+ MOVL 72(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 20(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 52(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 16(BP), BX
+ ADDL BX, AX
+ MOVL AX, 80(BP)
+ ADDL AX, R11
+ MOVL R8, AX
+ ADDL $0x2de92c6f, R11
+ MOVL R8, CX
+ RORL $0x06, AX
+ MOVL R8, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R8, CX
+ RORL $0x19, DX
+ ANDL R9, CX
+ XORL AX, DX
+ MOVL R8, AX
+ NOTL AX
+ ADDL DX, R11
+ ANDL R10, AX
+ XORL CX, AX
+ ADDL R11, AX
+ MOVL R12, DI
+ MOVL R14, BX
+ RORL $0x02, DI
+ MOVL R12, DX
+ ANDL R13, BX
+ RORL $0x0d, DX
+ MOVL R12, CX
+ ANDL R14, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R12, DX
+ MOVL R13, CX
+ RORL $0x16, DX
+ ANDL R12, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R11
+ ADDL AX, R15
+ ADDL AX, R11
+ MOVL 76(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 24(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 56(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 20(BP), BX
+ ADDL BX, AX
+ MOVL AX, 84(BP)
+ ADDL AX, R10
+ MOVL R15, AX
+ ADDL $0x4a7484aa, R10
+ MOVL R15, CX
+ RORL $0x06, AX
+ MOVL R15, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R15, CX
+ RORL $0x19, DX
+ ANDL R8, CX
+ XORL AX, DX
+ MOVL R15, AX
+ NOTL AX
+ ADDL DX, R10
+ ANDL R9, AX
+ XORL CX, AX
+ ADDL R10, AX
+ MOVL R11, DI
+ MOVL R13, BX
+ RORL $0x02, DI
+ MOVL R11, DX
+ ANDL R12, BX
+ RORL $0x0d, DX
+ MOVL R11, CX
+ ANDL R13, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R11, DX
+ MOVL R12, CX
+ RORL $0x16, DX
+ ANDL R11, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R10
+ ADDL AX, R14
+ ADDL AX, R10
+ MOVL 80(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 28(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 60(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 24(BP), BX
+ ADDL BX, AX
+ MOVL AX, 88(BP)
+ ADDL AX, R9
+ MOVL R14, AX
+ ADDL $0x5cb0a9dc, R9
+ MOVL R14, CX
+ RORL $0x06, AX
+ MOVL R14, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R14, CX
+ RORL $0x19, DX
+ ANDL R15, CX
+ XORL AX, DX
+ MOVL R14, AX
+ NOTL AX
+ ADDL DX, R9
+ ANDL R8, AX
+ XORL CX, AX
+ ADDL R9, AX
+ MOVL R10, DI
+ MOVL R12, BX
+ RORL $0x02, DI
+ MOVL R10, DX
+ ANDL R11, BX
+ RORL $0x0d, DX
+ MOVL R10, CX
+ ANDL R12, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R10, DX
+ MOVL R11, CX
+ RORL $0x16, DX
+ ANDL R10, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R9
+ ADDL AX, R13
+ ADDL AX, R9
+ MOVL 84(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 32(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 64(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 28(BP), BX
+ ADDL BX, AX
+ MOVL AX, 92(BP)
+ ADDL AX, R8
+ MOVL R13, AX
+ ADDL $0x76f988da, R8
+ MOVL R13, CX
+ RORL $0x06, AX
+ MOVL R13, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R13, CX
+ RORL $0x19, DX
+ ANDL R14, CX
+ XORL AX, DX
+ MOVL R13, AX
+ NOTL AX
+ ADDL DX, R8
+ ANDL R15, AX
+ XORL CX, AX
+ ADDL R8, AX
+ MOVL R9, DI
+ MOVL R11, BX
+ RORL $0x02, DI
+ MOVL R9, DX
+ ANDL R10, BX
+ RORL $0x0d, DX
+ MOVL R9, CX
+ ANDL R11, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R9, DX
+ MOVL R10, CX
+ RORL $0x16, DX
+ ANDL R9, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R8
+ ADDL AX, R12
+ ADDL AX, R8
+ MOVL 88(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 36(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 68(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 32(BP), BX
+ ADDL BX, AX
+ MOVL AX, 96(BP)
+ ADDL AX, R15
+ MOVL R12, AX
+ ADDL $0x983e5152, R15
+ MOVL R12, CX
+ RORL $0x06, AX
+ MOVL R12, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R12, CX
+ RORL $0x19, DX
+ ANDL R13, CX
+ XORL AX, DX
+ MOVL R12, AX
+ NOTL AX
+ ADDL DX, R15
+ ANDL R14, AX
+ XORL CX, AX
+ ADDL R15, AX
+ MOVL R8, DI
+ MOVL R10, BX
+ RORL $0x02, DI
+ MOVL R8, DX
+ ANDL R9, BX
+ RORL $0x0d, DX
+ MOVL R8, CX
+ ANDL R10, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R8, DX
+ MOVL R9, CX
+ RORL $0x16, DX
+ ANDL R8, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R15
+ ADDL AX, R11
+ ADDL AX, R15
+ MOVL 92(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 40(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 72(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 36(BP), BX
+ ADDL BX, AX
+ MOVL AX, 100(BP)
+ ADDL AX, R14
+ MOVL R11, AX
+ ADDL $0xa831c66d, R14
+ MOVL R11, CX
+ RORL $0x06, AX
+ MOVL R11, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R11, CX
+ RORL $0x19, DX
+ ANDL R12, CX
+ XORL AX, DX
+ MOVL R11, AX
+ NOTL AX
+ ADDL DX, R14
+ ANDL R13, AX
+ XORL CX, AX
+ ADDL R14, AX
+ MOVL R15, DI
+ MOVL R9, BX
+ RORL $0x02, DI
+ MOVL R15, DX
+ ANDL R8, BX
+ RORL $0x0d, DX
+ MOVL R15, CX
+ ANDL R9, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R15, DX
+ MOVL R8, CX
+ RORL $0x16, DX
+ ANDL R15, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R14
+ ADDL AX, R10
+ ADDL AX, R14
+ MOVL 96(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 44(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 76(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 40(BP), BX
+ ADDL BX, AX
+ MOVL AX, 104(BP)
+ ADDL AX, R13
+ MOVL R10, AX
+ ADDL $0xb00327c8, R13
+ MOVL R10, CX
+ RORL $0x06, AX
+ MOVL R10, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R10, CX
+ RORL $0x19, DX
+ ANDL R11, CX
+ XORL AX, DX
+ MOVL R10, AX
+ NOTL AX
+ ADDL DX, R13
+ ANDL R12, AX
+ XORL CX, AX
+ ADDL R13, AX
+ MOVL R14, DI
+ MOVL R8, BX
+ RORL $0x02, DI
+ MOVL R14, DX
+ ANDL R15, BX
+ RORL $0x0d, DX
+ MOVL R14, CX
+ ANDL R8, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R14, DX
+ MOVL R15, CX
+ RORL $0x16, DX
+ ANDL R14, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R13
+ ADDL AX, R9
+ ADDL AX, R13
+ MOVL 100(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 48(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 80(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 44(BP), BX
+ ADDL BX, AX
+ MOVL AX, 108(BP)
+ ADDL AX, R12
+ MOVL R9, AX
+ ADDL $0xbf597fc7, R12
+ MOVL R9, CX
+ RORL $0x06, AX
+ MOVL R9, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R9, CX
+ RORL $0x19, DX
+ ANDL R10, CX
+ XORL AX, DX
+ MOVL R9, AX
+ NOTL AX
+ ADDL DX, R12
+ ANDL R11, AX
+ XORL CX, AX
+ ADDL R12, AX
+ MOVL R13, DI
+ MOVL R15, BX
+ RORL $0x02, DI
+ MOVL R13, DX
+ ANDL R14, BX
+ RORL $0x0d, DX
+ MOVL R13, CX
+ ANDL R15, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R13, DX
+ MOVL R14, CX
+ RORL $0x16, DX
+ ANDL R13, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R12
+ ADDL AX, R8
+ ADDL AX, R12
+ MOVL 104(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 52(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 84(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 48(BP), BX
+ ADDL BX, AX
+ MOVL AX, 112(BP)
+ ADDL AX, R11
+ MOVL R8, AX
+ ADDL $0xc6e00bf3, R11
+ MOVL R8, CX
+ RORL $0x06, AX
+ MOVL R8, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R8, CX
+ RORL $0x19, DX
+ ANDL R9, CX
+ XORL AX, DX
+ MOVL R8, AX
+ NOTL AX
+ ADDL DX, R11
+ ANDL R10, AX
+ XORL CX, AX
+ ADDL R11, AX
+ MOVL R12, DI
+ MOVL R14, BX
+ RORL $0x02, DI
+ MOVL R12, DX
+ ANDL R13, BX
+ RORL $0x0d, DX
+ MOVL R12, CX
+ ANDL R14, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R12, DX
+ MOVL R13, CX
+ RORL $0x16, DX
+ ANDL R12, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R11
+ ADDL AX, R15
+ ADDL AX, R11
+ MOVL 108(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 56(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 88(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 52(BP), BX
+ ADDL BX, AX
+ MOVL AX, 116(BP)
+ ADDL AX, R10
+ MOVL R15, AX
+ ADDL $0xd5a79147, R10
+ MOVL R15, CX
+ RORL $0x06, AX
+ MOVL R15, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R15, CX
+ RORL $0x19, DX
+ ANDL R8, CX
+ XORL AX, DX
+ MOVL R15, AX
+ NOTL AX
+ ADDL DX, R10
+ ANDL R9, AX
+ XORL CX, AX
+ ADDL R10, AX
+ MOVL R11, DI
+ MOVL R13, BX
+ RORL $0x02, DI
+ MOVL R11, DX
+ ANDL R12, BX
+ RORL $0x0d, DX
+ MOVL R11, CX
+ ANDL R13, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R11, DX
+ MOVL R12, CX
+ RORL $0x16, DX
+ ANDL R11, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R10
+ ADDL AX, R14
+ ADDL AX, R10
+ MOVL 112(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 60(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 92(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 56(BP), BX
+ ADDL BX, AX
+ MOVL AX, 120(BP)
+ ADDL AX, R9
+ MOVL R14, AX
+ ADDL $0x06ca6351, R9
+ MOVL R14, CX
+ RORL $0x06, AX
+ MOVL R14, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R14, CX
+ RORL $0x19, DX
+ ANDL R15, CX
+ XORL AX, DX
+ MOVL R14, AX
+ NOTL AX
+ ADDL DX, R9
+ ANDL R8, AX
+ XORL CX, AX
+ ADDL R9, AX
+ MOVL R10, DI
+ MOVL R12, BX
+ RORL $0x02, DI
+ MOVL R10, DX
+ ANDL R11, BX
+ RORL $0x0d, DX
+ MOVL R10, CX
+ ANDL R12, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R10, DX
+ MOVL R11, CX
+ RORL $0x16, DX
+ ANDL R10, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R9
+ ADDL AX, R13
+ ADDL AX, R9
+ MOVL 116(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 64(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 96(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 60(BP), BX
+ ADDL BX, AX
+ MOVL AX, 124(BP)
+ ADDL AX, R8
+ MOVL R13, AX
+ ADDL $0x14292967, R8
+ MOVL R13, CX
+ RORL $0x06, AX
+ MOVL R13, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R13, CX
+ RORL $0x19, DX
+ ANDL R14, CX
+ XORL AX, DX
+ MOVL R13, AX
+ NOTL AX
+ ADDL DX, R8
+ ANDL R15, AX
+ XORL CX, AX
+ ADDL R8, AX
+ MOVL R9, DI
+ MOVL R11, BX
+ RORL $0x02, DI
+ MOVL R9, DX
+ ANDL R10, BX
+ RORL $0x0d, DX
+ MOVL R9, CX
+ ANDL R11, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R9, DX
+ MOVL R10, CX
+ RORL $0x16, DX
+ ANDL R9, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R8
+ ADDL AX, R12
+ ADDL AX, R8
+ MOVL 120(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 68(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 100(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 64(BP), BX
+ ADDL BX, AX
+ MOVL AX, 128(BP)
+ ADDL AX, R15
+ MOVL R12, AX
+ ADDL $0x27b70a85, R15
+ MOVL R12, CX
+ RORL $0x06, AX
+ MOVL R12, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R12, CX
+ RORL $0x19, DX
+ ANDL R13, CX
+ XORL AX, DX
+ MOVL R12, AX
+ NOTL AX
+ ADDL DX, R15
+ ANDL R14, AX
+ XORL CX, AX
+ ADDL R15, AX
+ MOVL R8, DI
+ MOVL R10, BX
+ RORL $0x02, DI
+ MOVL R8, DX
+ ANDL R9, BX
+ RORL $0x0d, DX
+ MOVL R8, CX
+ ANDL R10, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R8, DX
+ MOVL R9, CX
+ RORL $0x16, DX
+ ANDL R8, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R15
+ ADDL AX, R11
+ ADDL AX, R15
+ MOVL 124(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 72(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 104(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 68(BP), BX
+ ADDL BX, AX
+ MOVL AX, 132(BP)
+ ADDL AX, R14
+ MOVL R11, AX
+ ADDL $0x2e1b2138, R14
+ MOVL R11, CX
+ RORL $0x06, AX
+ MOVL R11, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R11, CX
+ RORL $0x19, DX
+ ANDL R12, CX
+ XORL AX, DX
+ MOVL R11, AX
+ NOTL AX
+ ADDL DX, R14
+ ANDL R13, AX
+ XORL CX, AX
+ ADDL R14, AX
+ MOVL R15, DI
+ MOVL R9, BX
+ RORL $0x02, DI
+ MOVL R15, DX
+ ANDL R8, BX
+ RORL $0x0d, DX
+ MOVL R15, CX
+ ANDL R9, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R15, DX
+ MOVL R8, CX
+ RORL $0x16, DX
+ ANDL R15, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R14
+ ADDL AX, R10
+ ADDL AX, R14
+ MOVL 128(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 76(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 108(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 72(BP), BX
+ ADDL BX, AX
+ MOVL AX, 136(BP)
+ ADDL AX, R13
+ MOVL R10, AX
+ ADDL $0x4d2c6dfc, R13
+ MOVL R10, CX
+ RORL $0x06, AX
+ MOVL R10, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R10, CX
+ RORL $0x19, DX
+ ANDL R11, CX
+ XORL AX, DX
+ MOVL R10, AX
+ NOTL AX
+ ADDL DX, R13
+ ANDL R12, AX
+ XORL CX, AX
+ ADDL R13, AX
+ MOVL R14, DI
+ MOVL R8, BX
+ RORL $0x02, DI
+ MOVL R14, DX
+ ANDL R15, BX
+ RORL $0x0d, DX
+ MOVL R14, CX
+ ANDL R8, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R14, DX
+ MOVL R15, CX
+ RORL $0x16, DX
+ ANDL R14, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R13
+ ADDL AX, R9
+ ADDL AX, R13
+ MOVL 132(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 80(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 112(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 76(BP), BX
+ ADDL BX, AX
+ MOVL AX, 140(BP)
+ ADDL AX, R12
+ MOVL R9, AX
+ ADDL $0x53380d13, R12
+ MOVL R9, CX
+ RORL $0x06, AX
+ MOVL R9, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R9, CX
+ RORL $0x19, DX
+ ANDL R10, CX
+ XORL AX, DX
+ MOVL R9, AX
+ NOTL AX
+ ADDL DX, R12
+ ANDL R11, AX
+ XORL CX, AX
+ ADDL R12, AX
+ MOVL R13, DI
+ MOVL R15, BX
+ RORL $0x02, DI
+ MOVL R13, DX
+ ANDL R14, BX
+ RORL $0x0d, DX
+ MOVL R13, CX
+ ANDL R15, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R13, DX
+ MOVL R14, CX
+ RORL $0x16, DX
+ ANDL R13, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R12
+ ADDL AX, R8
+ ADDL AX, R12
+ MOVL 136(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 84(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 116(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 80(BP), BX
+ ADDL BX, AX
+ MOVL AX, 144(BP)
+ ADDL AX, R11
+ MOVL R8, AX
+ ADDL $0x650a7354, R11
+ MOVL R8, CX
+ RORL $0x06, AX
+ MOVL R8, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R8, CX
+ RORL $0x19, DX
+ ANDL R9, CX
+ XORL AX, DX
+ MOVL R8, AX
+ NOTL AX
+ ADDL DX, R11
+ ANDL R10, AX
+ XORL CX, AX
+ ADDL R11, AX
+ MOVL R12, DI
+ MOVL R14, BX
+ RORL $0x02, DI
+ MOVL R12, DX
+ ANDL R13, BX
+ RORL $0x0d, DX
+ MOVL R12, CX
+ ANDL R14, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R12, DX
+ MOVL R13, CX
+ RORL $0x16, DX
+ ANDL R12, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R11
+ ADDL AX, R15
+ ADDL AX, R11
+ MOVL 140(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 88(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 120(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 84(BP), BX
+ ADDL BX, AX
+ MOVL AX, 148(BP)
+ ADDL AX, R10
+ MOVL R15, AX
+ ADDL $0x766a0abb, R10
+ MOVL R15, CX
+ RORL $0x06, AX
+ MOVL R15, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R15, CX
+ RORL $0x19, DX
+ ANDL R8, CX
+ XORL AX, DX
+ MOVL R15, AX
+ NOTL AX
+ ADDL DX, R10
+ ANDL R9, AX
+ XORL CX, AX
+ ADDL R10, AX
+ MOVL R11, DI
+ MOVL R13, BX
+ RORL $0x02, DI
+ MOVL R11, DX
+ ANDL R12, BX
+ RORL $0x0d, DX
+ MOVL R11, CX
+ ANDL R13, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R11, DX
+ MOVL R12, CX
+ RORL $0x16, DX
+ ANDL R11, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R10
+ ADDL AX, R14
+ ADDL AX, R10
+ MOVL 144(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 92(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 124(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 88(BP), BX
+ ADDL BX, AX
+ MOVL AX, 152(BP)
+ ADDL AX, R9
+ MOVL R14, AX
+ ADDL $0x81c2c92e, R9
+ MOVL R14, CX
+ RORL $0x06, AX
+ MOVL R14, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R14, CX
+ RORL $0x19, DX
+ ANDL R15, CX
+ XORL AX, DX
+ MOVL R14, AX
+ NOTL AX
+ ADDL DX, R9
+ ANDL R8, AX
+ XORL CX, AX
+ ADDL R9, AX
+ MOVL R10, DI
+ MOVL R12, BX
+ RORL $0x02, DI
+ MOVL R10, DX
+ ANDL R11, BX
+ RORL $0x0d, DX
+ MOVL R10, CX
+ ANDL R12, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R10, DX
+ MOVL R11, CX
+ RORL $0x16, DX
+ ANDL R10, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R9
+ ADDL AX, R13
+ ADDL AX, R9
+ MOVL 148(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 96(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 128(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 92(BP), BX
+ ADDL BX, AX
+ MOVL AX, 156(BP)
+ ADDL AX, R8
+ MOVL R13, AX
+ ADDL $0x92722c85, R8
+ MOVL R13, CX
+ RORL $0x06, AX
+ MOVL R13, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R13, CX
+ RORL $0x19, DX
+ ANDL R14, CX
+ XORL AX, DX
+ MOVL R13, AX
+ NOTL AX
+ ADDL DX, R8
+ ANDL R15, AX
+ XORL CX, AX
+ ADDL R8, AX
+ MOVL R9, DI
+ MOVL R11, BX
+ RORL $0x02, DI
+ MOVL R9, DX
+ ANDL R10, BX
+ RORL $0x0d, DX
+ MOVL R9, CX
+ ANDL R11, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R9, DX
+ MOVL R10, CX
+ RORL $0x16, DX
+ ANDL R9, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R8
+ ADDL AX, R12
+ ADDL AX, R8
+ MOVL 152(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 100(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 132(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 96(BP), BX
+ ADDL BX, AX
+ MOVL AX, 160(BP)
+ ADDL AX, R15
+ MOVL R12, AX
+ ADDL $0xa2bfe8a1, R15
+ MOVL R12, CX
+ RORL $0x06, AX
+ MOVL R12, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R12, CX
+ RORL $0x19, DX
+ ANDL R13, CX
+ XORL AX, DX
+ MOVL R12, AX
+ NOTL AX
+ ADDL DX, R15
+ ANDL R14, AX
+ XORL CX, AX
+ ADDL R15, AX
+ MOVL R8, DI
+ MOVL R10, BX
+ RORL $0x02, DI
+ MOVL R8, DX
+ ANDL R9, BX
+ RORL $0x0d, DX
+ MOVL R8, CX
+ ANDL R10, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R8, DX
+ MOVL R9, CX
+ RORL $0x16, DX
+ ANDL R8, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R15
+ ADDL AX, R11
+ ADDL AX, R15
+ MOVL 156(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 104(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 136(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 100(BP), BX
+ ADDL BX, AX
+ MOVL AX, 164(BP)
+ ADDL AX, R14
+ MOVL R11, AX
+ ADDL $0xa81a664b, R14
+ MOVL R11, CX
+ RORL $0x06, AX
+ MOVL R11, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R11, CX
+ RORL $0x19, DX
+ ANDL R12, CX
+ XORL AX, DX
+ MOVL R11, AX
+ NOTL AX
+ ADDL DX, R14
+ ANDL R13, AX
+ XORL CX, AX
+ ADDL R14, AX
+ MOVL R15, DI
+ MOVL R9, BX
+ RORL $0x02, DI
+ MOVL R15, DX
+ ANDL R8, BX
+ RORL $0x0d, DX
+ MOVL R15, CX
+ ANDL R9, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R15, DX
+ MOVL R8, CX
+ RORL $0x16, DX
+ ANDL R15, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R14
+ ADDL AX, R10
+ ADDL AX, R14
+ MOVL 160(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 108(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 140(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 104(BP), BX
+ ADDL BX, AX
+ MOVL AX, 168(BP)
+ ADDL AX, R13
+ MOVL R10, AX
+ ADDL $0xc24b8b70, R13
+ MOVL R10, CX
+ RORL $0x06, AX
+ MOVL R10, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R10, CX
+ RORL $0x19, DX
+ ANDL R11, CX
+ XORL AX, DX
+ MOVL R10, AX
+ NOTL AX
+ ADDL DX, R13
+ ANDL R12, AX
+ XORL CX, AX
+ ADDL R13, AX
+ MOVL R14, DI
+ MOVL R8, BX
+ RORL $0x02, DI
+ MOVL R14, DX
+ ANDL R15, BX
+ RORL $0x0d, DX
+ MOVL R14, CX
+ ANDL R8, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R14, DX
+ MOVL R15, CX
+ RORL $0x16, DX
+ ANDL R14, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R13
+ ADDL AX, R9
+ ADDL AX, R13
+ MOVL 164(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 112(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 144(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 108(BP), BX
+ ADDL BX, AX
+ MOVL AX, 172(BP)
+ ADDL AX, R12
+ MOVL R9, AX
+ ADDL $0xc76c51a3, R12
+ MOVL R9, CX
+ RORL $0x06, AX
+ MOVL R9, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R9, CX
+ RORL $0x19, DX
+ ANDL R10, CX
+ XORL AX, DX
+ MOVL R9, AX
+ NOTL AX
+ ADDL DX, R12
+ ANDL R11, AX
+ XORL CX, AX
+ ADDL R12, AX
+ MOVL R13, DI
+ MOVL R15, BX
+ RORL $0x02, DI
+ MOVL R13, DX
+ ANDL R14, BX
+ RORL $0x0d, DX
+ MOVL R13, CX
+ ANDL R15, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R13, DX
+ MOVL R14, CX
+ RORL $0x16, DX
+ ANDL R13, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R12
+ ADDL AX, R8
+ ADDL AX, R12
+ MOVL 168(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 116(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 148(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 112(BP), BX
+ ADDL BX, AX
+ MOVL AX, 176(BP)
+ ADDL AX, R11
+ MOVL R8, AX
+ ADDL $0xd192e819, R11
+ MOVL R8, CX
+ RORL $0x06, AX
+ MOVL R8, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R8, CX
+ RORL $0x19, DX
+ ANDL R9, CX
+ XORL AX, DX
+ MOVL R8, AX
+ NOTL AX
+ ADDL DX, R11
+ ANDL R10, AX
+ XORL CX, AX
+ ADDL R11, AX
+ MOVL R12, DI
+ MOVL R14, BX
+ RORL $0x02, DI
+ MOVL R12, DX
+ ANDL R13, BX
+ RORL $0x0d, DX
+ MOVL R12, CX
+ ANDL R14, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R12, DX
+ MOVL R13, CX
+ RORL $0x16, DX
+ ANDL R12, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R11
+ ADDL AX, R15
+ ADDL AX, R11
+ MOVL 172(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 120(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 152(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 116(BP), BX
+ ADDL BX, AX
+ MOVL AX, 180(BP)
+ ADDL AX, R10
+ MOVL R15, AX
+ ADDL $0xd6990624, R10
+ MOVL R15, CX
+ RORL $0x06, AX
+ MOVL R15, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R15, CX
+ RORL $0x19, DX
+ ANDL R8, CX
+ XORL AX, DX
+ MOVL R15, AX
+ NOTL AX
+ ADDL DX, R10
+ ANDL R9, AX
+ XORL CX, AX
+ ADDL R10, AX
+ MOVL R11, DI
+ MOVL R13, BX
+ RORL $0x02, DI
+ MOVL R11, DX
+ ANDL R12, BX
+ RORL $0x0d, DX
+ MOVL R11, CX
+ ANDL R13, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R11, DX
+ MOVL R12, CX
+ RORL $0x16, DX
+ ANDL R11, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R10
+ ADDL AX, R14
+ ADDL AX, R10
+ MOVL 176(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 124(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 156(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 120(BP), BX
+ ADDL BX, AX
+ MOVL AX, 184(BP)
+ ADDL AX, R9
+ MOVL R14, AX
+ ADDL $0xf40e3585, R9
+ MOVL R14, CX
+ RORL $0x06, AX
+ MOVL R14, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R14, CX
+ RORL $0x19, DX
+ ANDL R15, CX
+ XORL AX, DX
+ MOVL R14, AX
+ NOTL AX
+ ADDL DX, R9
+ ANDL R8, AX
+ XORL CX, AX
+ ADDL R9, AX
+ MOVL R10, DI
+ MOVL R12, BX
+ RORL $0x02, DI
+ MOVL R10, DX
+ ANDL R11, BX
+ RORL $0x0d, DX
+ MOVL R10, CX
+ ANDL R12, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R10, DX
+ MOVL R11, CX
+ RORL $0x16, DX
+ ANDL R10, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R9
+ ADDL AX, R13
+ ADDL AX, R9
+ MOVL 180(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 128(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 160(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 124(BP), BX
+ ADDL BX, AX
+ MOVL AX, 188(BP)
+ ADDL AX, R8
+ MOVL R13, AX
+ ADDL $0x106aa070, R8
+ MOVL R13, CX
+ RORL $0x06, AX
+ MOVL R13, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R13, CX
+ RORL $0x19, DX
+ ANDL R14, CX
+ XORL AX, DX
+ MOVL R13, AX
+ NOTL AX
+ ADDL DX, R8
+ ANDL R15, AX
+ XORL CX, AX
+ ADDL R8, AX
+ MOVL R9, DI
+ MOVL R11, BX
+ RORL $0x02, DI
+ MOVL R9, DX
+ ANDL R10, BX
+ RORL $0x0d, DX
+ MOVL R9, CX
+ ANDL R11, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R9, DX
+ MOVL R10, CX
+ RORL $0x16, DX
+ ANDL R9, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R8
+ ADDL AX, R12
+ ADDL AX, R8
+ MOVL 184(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 132(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 164(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 128(BP), BX
+ ADDL BX, AX
+ MOVL AX, 192(BP)
+ ADDL AX, R15
+ MOVL R12, AX
+ ADDL $0x19a4c116, R15
+ MOVL R12, CX
+ RORL $0x06, AX
+ MOVL R12, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R12, CX
+ RORL $0x19, DX
+ ANDL R13, CX
+ XORL AX, DX
+ MOVL R12, AX
+ NOTL AX
+ ADDL DX, R15
+ ANDL R14, AX
+ XORL CX, AX
+ ADDL R15, AX
+ MOVL R8, DI
+ MOVL R10, BX
+ RORL $0x02, DI
+ MOVL R8, DX
+ ANDL R9, BX
+ RORL $0x0d, DX
+ MOVL R8, CX
+ ANDL R10, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R8, DX
+ MOVL R9, CX
+ RORL $0x16, DX
+ ANDL R8, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R15
+ ADDL AX, R11
+ ADDL AX, R15
+ MOVL 188(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 136(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 168(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 132(BP), BX
+ ADDL BX, AX
+ MOVL AX, 196(BP)
+ ADDL AX, R14
+ MOVL R11, AX
+ ADDL $0x1e376c08, R14
+ MOVL R11, CX
+ RORL $0x06, AX
+ MOVL R11, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R11, CX
+ RORL $0x19, DX
+ ANDL R12, CX
+ XORL AX, DX
+ MOVL R11, AX
+ NOTL AX
+ ADDL DX, R14
+ ANDL R13, AX
+ XORL CX, AX
+ ADDL R14, AX
+ MOVL R15, DI
+ MOVL R9, BX
+ RORL $0x02, DI
+ MOVL R15, DX
+ ANDL R8, BX
+ RORL $0x0d, DX
+ MOVL R15, CX
+ ANDL R9, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R15, DX
+ MOVL R8, CX
+ RORL $0x16, DX
+ ANDL R15, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R14
+ ADDL AX, R10
+ ADDL AX, R14
+ MOVL 192(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 140(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 172(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 136(BP), BX
+ ADDL BX, AX
+ MOVL AX, 200(BP)
+ ADDL AX, R13
+ MOVL R10, AX
+ ADDL $0x2748774c, R13
+ MOVL R10, CX
+ RORL $0x06, AX
+ MOVL R10, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R10, CX
+ RORL $0x19, DX
+ ANDL R11, CX
+ XORL AX, DX
+ MOVL R10, AX
+ NOTL AX
+ ADDL DX, R13
+ ANDL R12, AX
+ XORL CX, AX
+ ADDL R13, AX
+ MOVL R14, DI
+ MOVL R8, BX
+ RORL $0x02, DI
+ MOVL R14, DX
+ ANDL R15, BX
+ RORL $0x0d, DX
+ MOVL R14, CX
+ ANDL R8, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R14, DX
+ MOVL R15, CX
+ RORL $0x16, DX
+ ANDL R14, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R13
+ ADDL AX, R9
+ ADDL AX, R13
+ MOVL 196(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 144(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 176(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 140(BP), BX
+ ADDL BX, AX
+ MOVL AX, 204(BP)
+ ADDL AX, R12
+ MOVL R9, AX
+ ADDL $0x34b0bcb5, R12
+ MOVL R9, CX
+ RORL $0x06, AX
+ MOVL R9, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R9, CX
+ RORL $0x19, DX
+ ANDL R10, CX
+ XORL AX, DX
+ MOVL R9, AX
+ NOTL AX
+ ADDL DX, R12
+ ANDL R11, AX
+ XORL CX, AX
+ ADDL R12, AX
+ MOVL R13, DI
+ MOVL R15, BX
+ RORL $0x02, DI
+ MOVL R13, DX
+ ANDL R14, BX
+ RORL $0x0d, DX
+ MOVL R13, CX
+ ANDL R15, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R13, DX
+ MOVL R14, CX
+ RORL $0x16, DX
+ ANDL R13, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R12
+ ADDL AX, R8
+ ADDL AX, R12
+ MOVL 200(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 148(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 180(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 144(BP), BX
+ ADDL BX, AX
+ MOVL AX, 208(BP)
+ ADDL AX, R11
+ MOVL R8, AX
+ ADDL $0x391c0cb3, R11
+ MOVL R8, CX
+ RORL $0x06, AX
+ MOVL R8, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R8, CX
+ RORL $0x19, DX
+ ANDL R9, CX
+ XORL AX, DX
+ MOVL R8, AX
+ NOTL AX
+ ADDL DX, R11
+ ANDL R10, AX
+ XORL CX, AX
+ ADDL R11, AX
+ MOVL R12, DI
+ MOVL R14, BX
+ RORL $0x02, DI
+ MOVL R12, DX
+ ANDL R13, BX
+ RORL $0x0d, DX
+ MOVL R12, CX
+ ANDL R14, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R12, DX
+ MOVL R13, CX
+ RORL $0x16, DX
+ ANDL R12, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R11
+ ADDL AX, R15
+ ADDL AX, R11
+ MOVL 204(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 152(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 184(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 148(BP), BX
+ ADDL BX, AX
+ MOVL AX, 212(BP)
+ ADDL AX, R10
+ MOVL R15, AX
+ ADDL $0x4ed8aa4a, R10
+ MOVL R15, CX
+ RORL $0x06, AX
+ MOVL R15, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R15, CX
+ RORL $0x19, DX
+ ANDL R8, CX
+ XORL AX, DX
+ MOVL R15, AX
+ NOTL AX
+ ADDL DX, R10
+ ANDL R9, AX
+ XORL CX, AX
+ ADDL R10, AX
+ MOVL R11, DI
+ MOVL R13, BX
+ RORL $0x02, DI
+ MOVL R11, DX
+ ANDL R12, BX
+ RORL $0x0d, DX
+ MOVL R11, CX
+ ANDL R13, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R11, DX
+ MOVL R12, CX
+ RORL $0x16, DX
+ ANDL R11, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R10
+ ADDL AX, R14
+ ADDL AX, R10
+ MOVL 208(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 156(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 188(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 152(BP), BX
+ ADDL BX, AX
+ MOVL AX, 216(BP)
+ ADDL AX, R9
+ MOVL R14, AX
+ ADDL $0x5b9cca4f, R9
+ MOVL R14, CX
+ RORL $0x06, AX
+ MOVL R14, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R14, CX
+ RORL $0x19, DX
+ ANDL R15, CX
+ XORL AX, DX
+ MOVL R14, AX
+ NOTL AX
+ ADDL DX, R9
+ ANDL R8, AX
+ XORL CX, AX
+ ADDL R9, AX
+ MOVL R10, DI
+ MOVL R12, BX
+ RORL $0x02, DI
+ MOVL R10, DX
+ ANDL R11, BX
+ RORL $0x0d, DX
+ MOVL R10, CX
+ ANDL R12, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R10, DX
+ MOVL R11, CX
+ RORL $0x16, DX
+ ANDL R10, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R9
+ ADDL AX, R13
+ ADDL AX, R9
+ MOVL 212(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 160(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 192(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 156(BP), BX
+ ADDL BX, AX
+ MOVL AX, 220(BP)
+ ADDL AX, R8
+ MOVL R13, AX
+ ADDL $0x682e6ff3, R8
+ MOVL R13, CX
+ RORL $0x06, AX
+ MOVL R13, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R13, CX
+ RORL $0x19, DX
+ ANDL R14, CX
+ XORL AX, DX
+ MOVL R13, AX
+ NOTL AX
+ ADDL DX, R8
+ ANDL R15, AX
+ XORL CX, AX
+ ADDL R8, AX
+ MOVL R9, DI
+ MOVL R11, BX
+ RORL $0x02, DI
+ MOVL R9, DX
+ ANDL R10, BX
+ RORL $0x0d, DX
+ MOVL R9, CX
+ ANDL R11, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R9, DX
+ MOVL R10, CX
+ RORL $0x16, DX
+ ANDL R9, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R8
+ ADDL AX, R12
+ ADDL AX, R8
+ MOVL 216(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 164(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 196(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 160(BP), BX
+ ADDL BX, AX
+ MOVL AX, 224(BP)
+ ADDL AX, R15
+ MOVL R12, AX
+ ADDL $0x748f82ee, R15
+ MOVL R12, CX
+ RORL $0x06, AX
+ MOVL R12, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R12, CX
+ RORL $0x19, DX
+ ANDL R13, CX
+ XORL AX, DX
+ MOVL R12, AX
+ NOTL AX
+ ADDL DX, R15
+ ANDL R14, AX
+ XORL CX, AX
+ ADDL R15, AX
+ MOVL R8, DI
+ MOVL R10, BX
+ RORL $0x02, DI
+ MOVL R8, DX
+ ANDL R9, BX
+ RORL $0x0d, DX
+ MOVL R8, CX
+ ANDL R10, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R8, DX
+ MOVL R9, CX
+ RORL $0x16, DX
+ ANDL R8, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R15
+ ADDL AX, R11
+ ADDL AX, R15
+ MOVL 220(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 168(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 200(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 164(BP), BX
+ ADDL BX, AX
+ MOVL AX, 228(BP)
+ ADDL AX, R14
+ MOVL R11, AX
+ ADDL $0x78a5636f, R14
+ MOVL R11, CX
+ RORL $0x06, AX
+ MOVL R11, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R11, CX
+ RORL $0x19, DX
+ ANDL R12, CX
+ XORL AX, DX
+ MOVL R11, AX
+ NOTL AX
+ ADDL DX, R14
+ ANDL R13, AX
+ XORL CX, AX
+ ADDL R14, AX
+ MOVL R15, DI
+ MOVL R9, BX
+ RORL $0x02, DI
+ MOVL R15, DX
+ ANDL R8, BX
+ RORL $0x0d, DX
+ MOVL R15, CX
+ ANDL R9, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R15, DX
+ MOVL R8, CX
+ RORL $0x16, DX
+ ANDL R15, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R14
+ ADDL AX, R10
+ ADDL AX, R14
+ MOVL 224(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 172(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 204(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 168(BP), BX
+ ADDL BX, AX
+ MOVL AX, 232(BP)
+ ADDL AX, R13
+ MOVL R10, AX
+ ADDL $0x84c87814, R13
+ MOVL R10, CX
+ RORL $0x06, AX
+ MOVL R10, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R10, CX
+ RORL $0x19, DX
+ ANDL R11, CX
+ XORL AX, DX
+ MOVL R10, AX
+ NOTL AX
+ ADDL DX, R13
+ ANDL R12, AX
+ XORL CX, AX
+ ADDL R13, AX
+ MOVL R14, DI
+ MOVL R8, BX
+ RORL $0x02, DI
+ MOVL R14, DX
+ ANDL R15, BX
+ RORL $0x0d, DX
+ MOVL R14, CX
+ ANDL R8, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R14, DX
+ MOVL R15, CX
+ RORL $0x16, DX
+ ANDL R14, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R13
+ ADDL AX, R9
+ ADDL AX, R13
+ MOVL 228(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 176(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 208(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 172(BP), BX
+ ADDL BX, AX
+ MOVL AX, 236(BP)
+ ADDL AX, R12
+ MOVL R9, AX
+ ADDL $0x8cc70208, R12
+ MOVL R9, CX
+ RORL $0x06, AX
+ MOVL R9, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R9, CX
+ RORL $0x19, DX
+ ANDL R10, CX
+ XORL AX, DX
+ MOVL R9, AX
+ NOTL AX
+ ADDL DX, R12
+ ANDL R11, AX
+ XORL CX, AX
+ ADDL R12, AX
+ MOVL R13, DI
+ MOVL R15, BX
+ RORL $0x02, DI
+ MOVL R13, DX
+ ANDL R14, BX
+ RORL $0x0d, DX
+ MOVL R13, CX
+ ANDL R15, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R13, DX
+ MOVL R14, CX
+ RORL $0x16, DX
+ ANDL R13, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R12
+ ADDL AX, R8
+ ADDL AX, R12
+ MOVL 232(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 180(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 212(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 176(BP), BX
+ ADDL BX, AX
+ MOVL AX, 240(BP)
+ ADDL AX, R11
+ MOVL R8, AX
+ ADDL $0x90befffa, R11
+ MOVL R8, CX
+ RORL $0x06, AX
+ MOVL R8, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R8, CX
+ RORL $0x19, DX
+ ANDL R9, CX
+ XORL AX, DX
+ MOVL R8, AX
+ NOTL AX
+ ADDL DX, R11
+ ANDL R10, AX
+ XORL CX, AX
+ ADDL R11, AX
+ MOVL R12, DI
+ MOVL R14, BX
+ RORL $0x02, DI
+ MOVL R12, DX
+ ANDL R13, BX
+ RORL $0x0d, DX
+ MOVL R12, CX
+ ANDL R14, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R12, DX
+ MOVL R13, CX
+ RORL $0x16, DX
+ ANDL R12, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R11
+ ADDL AX, R15
+ ADDL AX, R11
+ MOVL 236(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 184(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 216(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 180(BP), BX
+ ADDL BX, AX
+ MOVL AX, 244(BP)
+ ADDL AX, R10
+ MOVL R15, AX
+ ADDL $0xa4506ceb, R10
+ MOVL R15, CX
+ RORL $0x06, AX
+ MOVL R15, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R15, CX
+ RORL $0x19, DX
+ ANDL R8, CX
+ XORL AX, DX
+ MOVL R15, AX
+ NOTL AX
+ ADDL DX, R10
+ ANDL R9, AX
+ XORL CX, AX
+ ADDL R10, AX
+ MOVL R11, DI
+ MOVL R13, BX
+ RORL $0x02, DI
+ MOVL R11, DX
+ ANDL R12, BX
+ RORL $0x0d, DX
+ MOVL R11, CX
+ ANDL R13, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R11, DX
+ MOVL R12, CX
+ RORL $0x16, DX
+ ANDL R11, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R10
+ ADDL AX, R14
+ ADDL AX, R10
+ MOVL 240(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 188(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 220(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 184(BP), BX
+ ADDL BX, AX
+ MOVL AX, 248(BP)
+ ADDL AX, R9
+ MOVL R14, AX
+ ADDL $0xbef9a3f7, R9
+ MOVL R14, CX
+ RORL $0x06, AX
+ MOVL R14, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R14, CX
+ RORL $0x19, DX
+ ANDL R15, CX
+ XORL AX, DX
+ MOVL R14, AX
+ NOTL AX
+ ADDL DX, R9
+ ANDL R8, AX
+ XORL CX, AX
+ ADDL R9, AX
+ MOVL R10, DI
+ MOVL R12, BX
+ RORL $0x02, DI
+ MOVL R10, DX
+ ANDL R11, BX
+ RORL $0x0d, DX
+ MOVL R10, CX
+ ANDL R12, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R10, DX
+ MOVL R11, CX
+ RORL $0x16, DX
+ ANDL R10, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R9
+ ADDL AX, R13
+ ADDL AX, R9
+ MOVL 244(BP), AX
+ MOVL AX, CX
+ RORL $0x11, AX
+ MOVL CX, DX
+ RORL $0x13, CX
+ SHRL $0x0a, DX
+ MOVL 192(BP), BX
+ XORL CX, AX
+ MOVL BX, CX
+ XORL DX, AX
+ RORL $0x07, BX
+ MOVL CX, DX
+ SHRL $0x03, DX
+ RORL $0x12, CX
+ ADDL 224(BP), AX
+ XORL CX, BX
+ XORL DX, BX
+ ADDL 188(BP), BX
+ ADDL BX, AX
+ MOVL AX, 252(BP)
+ ADDL AX, R8
+ MOVL R13, AX
+ ADDL $0xc67178f2, R8
+ MOVL R13, CX
+ RORL $0x06, AX
+ MOVL R13, DX
+ RORL $0x0b, CX
+ XORL CX, AX
+ MOVL R13, CX
+ RORL $0x19, DX
+ ANDL R14, CX
+ XORL AX, DX
+ MOVL R13, AX
+ NOTL AX
+ ADDL DX, R8
+ ANDL R15, AX
+ XORL CX, AX
+ ADDL R8, AX
+ MOVL R9, DI
+ MOVL R11, BX
+ RORL $0x02, DI
+ MOVL R9, DX
+ ANDL R10, BX
+ RORL $0x0d, DX
+ MOVL R9, CX
+ ANDL R11, CX
+ XORL DX, DI
+ XORL CX, BX
+ MOVL R9, DX
+ MOVL R10, CX
+ RORL $0x16, DX
+ ANDL R9, CX
+ XORL CX, BX
+ XORL DX, DI
+ ADDL DI, BX
+ MOVL BX, R8
+ ADDL AX, R12
+ ADDL AX, R8
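+ // update the digest: H0..H7 += a..h, then advance to the next 64-byte block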
+ MOVQ dig+0(FP), BP
+ ADDL (BP), R8
+ MOVL R8, (BP)
+ ADDL 4(BP), R9
+ MOVL R9, 4(BP)
+ ADDL 8(BP), R10
+ MOVL R10, 8(BP)
+ ADDL 12(BP), R11
+ MOVL R11, 12(BP)
+ ADDL 16(BP), R12
+ MOVL R12, 16(BP)
+ ADDL 20(BP), R13
+ MOVL R13, 20(BP)
+ ADDL 24(BP), R14
+ MOVL R14, 24(BP)
+ ADDL 28(BP), R15
+ MOVL R15, 28(BP)
+ ADDQ $0x40, SI
+ CMPQ SI, 256(SP)
+ JB loop
end:
RET
avx2:
- MOVQ dig+0(FP), CTX // d.h[8]
- MOVQ p_base+8(FP), INP
- MOVQ p_len+16(FP), NUM_BYTES
-
- LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
- MOVQ NUM_BYTES, _INP_END(SP)
-
- CMPQ NUM_BYTES, INP
+ MOVQ dig+0(FP), SI
+ MOVQ p_base+8(FP), DI
+ MOVQ p_len+16(FP), DX
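+ // DX = pointer to the last 64-byte block, saved at 512(SP) as the end marker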
+ LEAQ -64(DI)(DX*1), DX
+ MOVQ DX, 512(SP)
+ CMPQ DX, DI
JE avx2_only_one_block
// Load initial digest
- MOVL 0(CTX), a // a = H0
- MOVL 4(CTX), b // b = H1
- MOVL 8(CTX), c // c = H2
- MOVL 12(CTX), d // d = H3
- MOVL 16(CTX), e // e = H4
- MOVL 20(CTX), f // f = H5
- MOVL 24(CTX), g // g = H6
- MOVL 28(CTX), h // h = H7
-
-avx2_loop0: // at each iteration works with one block (512 bit)
-
- VMOVDQU (0*32)(INP), XTMP0
- VMOVDQU (1*32)(INP), XTMP1
- VMOVDQU (2*32)(INP), XTMP2
- VMOVDQU (3*32)(INP), XTMP3
+ MOVL (SI), AX
+ MOVL 4(SI), BX
+ MOVL 8(SI), CX
+ MOVL 12(SI), R8
+ MOVL 16(SI), DX
+ MOVL 20(SI), R9
+ MOVL 24(SI), R10
+ MOVL 28(SI), R11
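+ // working variables: a=AX, b=BX, c=CX, d=R8, e=DX, f=R9, g=R10, h=R11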
- VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
+avx2_loop0:
+ // each iteration works on one block (512 bits)
+ VMOVDQU (DI), Y0
+ VMOVDQU 32(DI), Y1
+ VMOVDQU 64(DI), Y2
+ VMOVDQU 96(DI), Y3
+ VMOVDQU flip_mask<>+0(SB), Y13
// Apply Byte Flip Mask: LE -> BE
- VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
- VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
- VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
- VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
+ VPSHUFB Y13, Y0, Y0
+ VPSHUFB Y13, Y1, Y1
+ VPSHUFB Y13, Y2, Y2
+ VPSHUFB Y13, Y3, Y3
// Transpose data into high/low parts
- VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
- VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
- VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
- VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
-
- MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants
+ VPERM2I128 $0x20, Y2, Y0, Y4
+ VPERM2I128 $0x31, Y2, Y0, Y5
+ VPERM2I128 $0x20, Y3, Y1, Y6
+ VPERM2I128 $0x31, Y3, Y1, Y7
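+ // Y4 = w3..w0, Y5 = w7..w4, Y6 = w11..w8, Y7 = w15..w12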
+ LEAQ K256<>+0(SB), BP
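+ // BP = address of the table of round-specific constants (K256)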
avx2_last_block_enter:
- ADDQ $64, INP
- MOVQ INP, _INP(SP)
- XORQ SRND, SRND
+ ADDQ $0x40, DI
+ MOVQ DI, 520(SP)
+ XORQ SI, SI
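+ // SI now serves as the round offset into K256 and the W+K values staged on the stack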
-avx2_loop1: // for w0 - w47
+avx2_loop1:
// Do 4 rounds and scheduling
- VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
- VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
- ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
- ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
- ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
- ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+ VPADDD (BP)(SI*1), Y4, Y9
+ VMOVDQU Y9, (SP)(SI*1)
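+ // the low lane of Y9 feeds this block's rounds; the high lane stays on the stack for avx2_loop3 (second block)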
+ MOVL AX, DI
+ RORXL $0x19, DX, R13
+ RORXL $0x0b, DX, R14
+ ADDL (SP)(SI*1), R11
+ ORL CX, DI
+ VPALIGNR $0x04, Y6, Y7, Y0
+ MOVL R9, R15
+ RORXL $0x0d, AX, R12
+ XORL R14, R13
+ XORL R10, R15
+ VPADDD Y4, Y0, Y0
+ RORXL $0x06, DX, R14
+ ANDL DX, R15
+ XORL R14, R13
+ RORXL $0x16, AX, R14
+ ADDL R11, R8
+ ANDL BX, DI
+ VPALIGNR $0x04, Y4, Y5, Y1
+ XORL R12, R14
+ RORXL $0x02, AX, R12
+ XORL R10, R15
+ VPSRLD $0x07, Y1, Y2
+ XORL R12, R14
+ MOVL AX, R12
+ ANDL CX, R12
+ ADDL R13, R15
+ VPSLLD $0x19, Y1, Y3
+ ORL R12, DI
+ ADDL R14, R11
+ ADDL R15, R8
+ VPOR Y2, Y3, Y3
+ VPSRLD $0x12, Y1, Y2
+ ADDL R15, R11
+ ADDL DI, R11
+ MOVL R11, DI
+ RORXL $0x19, R8, R13
+ RORXL $0x0b, R8, R14
+ ADDL 4(SP)(SI*1), R10
+ ORL BX, DI
+ VPSRLD $0x03, Y1, Y8
+ MOVL DX, R15
+ RORXL $0x0d, R11, R12
+ XORL R14, R13
+ XORL R9, R15
+ RORXL $0x06, R8, R14
+ XORL R14, R13
+ RORXL $0x16, R11, R14
+ ANDL R8, R15
+ ADDL R10, CX
+ VPSLLD $0x0e, Y1, Y1
+ ANDL AX, DI
+ XORL R12, R14
+ VPXOR Y1, Y3, Y3
+ RORXL $0x02, R11, R12
+ XORL R9, R15
+ VPXOR Y2, Y3, Y3
+ XORL R12, R14
+ MOVL R11, R12
+ ANDL BX, R12
+ ADDL R13, R15
+ VPXOR Y8, Y3, Y1
+ VPSHUFD $0xfa, Y7, Y2
+ ORL R12, DI
+ ADDL R14, R10
+ VPADDD Y1, Y0, Y0
+ ADDL R15, CX
+ ADDL R15, R10
+ ADDL DI, R10
+ VPSRLD $0x0a, Y2, Y8
+ MOVL R10, DI
+ RORXL $0x19, CX, R13
+ ADDL 8(SP)(SI*1), R9
+ VPSRLQ $0x13, Y2, Y3
+ RORXL $0x0b, CX, R14
+ ORL AX, DI
+ MOVL R8, R15
+ XORL DX, R15
+ RORXL $0x0d, R10, R12
+ XORL R14, R13
+ VPSRLQ $0x11, Y2, Y2
+ ANDL CX, R15
+ RORXL $0x06, CX, R14
+ VPXOR Y3, Y2, Y2
+ ADDL R9, BX
+ ANDL R11, DI
+ XORL R14, R13
+ RORXL $0x16, R10, R14
+ VPXOR Y2, Y8, Y8
+ XORL DX, R15
+ VPSHUFB shuff_00BA<>+0(SB), Y8, Y8
+ XORL R12, R14
+ RORXL $0x02, R10, R12
+ VPADDD Y8, Y0, Y0
+ XORL R12, R14
+ MOVL R10, R12
+ ANDL AX, R12
+ ADDL R13, R15
+ VPSHUFD $0x50, Y0, Y2
+ ORL R12, DI
+ ADDL R14, R9
+ ADDL R15, BX
+ ADDL R15, R9
+ ADDL DI, R9
+ MOVL R9, DI
+ RORXL $0x19, BX, R13
+ RORXL $0x0b, BX, R14
+ ADDL 12(SP)(SI*1), DX
+ ORL R11, DI
+ VPSRLD $0x0a, Y2, Y11
+ MOVL CX, R15
+ RORXL $0x0d, R9, R12
+ XORL R14, R13
+ XORL R8, R15
+ VPSRLQ $0x13, Y2, Y3
+ RORXL $0x06, BX, R14
+ ANDL BX, R15
+ ADDL DX, AX
+ ANDL R10, DI
+ VPSRLQ $0x11, Y2, Y2
+ XORL R14, R13
+ XORL R8, R15
+ VPXOR Y3, Y2, Y2
+ RORXL $0x16, R9, R14
+ ADDL R13, R15
+ VPXOR Y2, Y11, Y11
+ XORL R12, R14
+ ADDL R15, AX
+ RORXL $0x02, R9, R12
+ VPSHUFB shuff_DC00<>+0(SB), Y11, Y11
+ VPADDD Y0, Y11, Y4
+ XORL R12, R14
+ MOVL R9, R12
+ ANDL R11, R12
+ ORL R12, DI
+ ADDL R14, DX
+ ADDL R15, DX
+ ADDL DI, DX
// Do 4 rounds and scheduling
- VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
- VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
- ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
- ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
- ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
- ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+ VPADDD 32(BP)(SI*1), Y5, Y9
+ VMOVDQU Y9, 32(SP)(SI*1)
+ MOVL DX, DI
+ RORXL $0x19, AX, R13
+ RORXL $0x0b, AX, R14
+ ADDL 32(SP)(SI*1), R8
+ ORL R10, DI
+ VPALIGNR $0x04, Y7, Y4, Y0
+ MOVL BX, R15
+ RORXL $0x0d, DX, R12
+ XORL R14, R13
+ XORL CX, R15
+ VPADDD Y5, Y0, Y0
+ RORXL $0x06, AX, R14
+ ANDL AX, R15
+ XORL R14, R13
+ RORXL $0x16, DX, R14
+ ADDL R8, R11
+ ANDL R9, DI
+ VPALIGNR $0x04, Y5, Y6, Y1
+ XORL R12, R14
+ RORXL $0x02, DX, R12
+ XORL CX, R15
+ VPSRLD $0x07, Y1, Y2
+ XORL R12, R14
+ MOVL DX, R12
+ ANDL R10, R12
+ ADDL R13, R15
+ VPSLLD $0x19, Y1, Y3
+ ORL R12, DI
+ ADDL R14, R8
+ ADDL R15, R11
+ VPOR Y2, Y3, Y3
+ VPSRLD $0x12, Y1, Y2
+ ADDL R15, R8
+ ADDL DI, R8
+ MOVL R8, DI
+ RORXL $0x19, R11, R13
+ RORXL $0x0b, R11, R14
+ ADDL 36(SP)(SI*1), CX
+ ORL R9, DI
+ VPSRLD $0x03, Y1, Y8
+ MOVL AX, R15
+ RORXL $0x0d, R8, R12
+ XORL R14, R13
+ XORL BX, R15
+ RORXL $0x06, R11, R14
+ XORL R14, R13
+ RORXL $0x16, R8, R14
+ ANDL R11, R15
+ ADDL CX, R10
+ VPSLLD $0x0e, Y1, Y1
+ ANDL DX, DI
+ XORL R12, R14
+ VPXOR Y1, Y3, Y3
+ RORXL $0x02, R8, R12
+ XORL BX, R15
+ VPXOR Y2, Y3, Y3
+ XORL R12, R14
+ MOVL R8, R12
+ ANDL R9, R12
+ ADDL R13, R15
+ VPXOR Y8, Y3, Y1
+ VPSHUFD $0xfa, Y4, Y2
+ ORL R12, DI
+ ADDL R14, CX
+ VPADDD Y1, Y0, Y0
+ ADDL R15, R10
+ ADDL R15, CX
+ ADDL DI, CX
+ VPSRLD $0x0a, Y2, Y8
+ MOVL CX, DI
+ RORXL $0x19, R10, R13
+ ADDL 40(SP)(SI*1), BX
+ VPSRLQ $0x13, Y2, Y3
+ RORXL $0x0b, R10, R14
+ ORL DX, DI
+ MOVL R11, R15
+ XORL AX, R15
+ RORXL $0x0d, CX, R12
+ XORL R14, R13
+ VPSRLQ $0x11, Y2, Y2
+ ANDL R10, R15
+ RORXL $0x06, R10, R14
+ VPXOR Y3, Y2, Y2
+ ADDL BX, R9
+ ANDL R8, DI
+ XORL R14, R13
+ RORXL $0x16, CX, R14
+ VPXOR Y2, Y8, Y8
+ XORL AX, R15
+ VPSHUFB shuff_00BA<>+0(SB), Y8, Y8
+ XORL R12, R14
+ RORXL $0x02, CX, R12
+ VPADDD Y8, Y0, Y0
+ XORL R12, R14
+ MOVL CX, R12
+ ANDL DX, R12
+ ADDL R13, R15
+ VPSHUFD $0x50, Y0, Y2
+ ORL R12, DI
+ ADDL R14, BX
+ ADDL R15, R9
+ ADDL R15, BX
+ ADDL DI, BX
+ MOVL BX, DI
+ RORXL $0x19, R9, R13
+ RORXL $0x0b, R9, R14
+ ADDL 44(SP)(SI*1), AX
+ ORL R8, DI
+ VPSRLD $0x0a, Y2, Y11
+ MOVL R10, R15
+ RORXL $0x0d, BX, R12
+ XORL R14, R13
+ XORL R11, R15
+ VPSRLQ $0x13, Y2, Y3
+ RORXL $0x06, R9, R14
+ ANDL R9, R15
+ ADDL AX, DX
+ ANDL CX, DI
+ VPSRLQ $0x11, Y2, Y2
+ XORL R14, R13
+ XORL R11, R15
+ VPXOR Y3, Y2, Y2
+ RORXL $0x16, BX, R14
+ ADDL R13, R15
+ VPXOR Y2, Y11, Y11
+ XORL R12, R14
+ ADDL R15, DX
+ RORXL $0x02, BX, R12
+ VPSHUFB shuff_DC00<>+0(SB), Y11, Y11
+ VPADDD Y0, Y11, Y5
+ XORL R12, R14
+ MOVL BX, R12
+ ANDL R8, R12
+ ORL R12, DI
+ ADDL R14, AX
+ ADDL R15, AX
+ ADDL DI, AX
// Do 4 rounds and scheduling
- VPADDD 2*32(TBL)(SRND*1), XDWORD2, XFER
- VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
- ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
- ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
- ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
- ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+ VPADDD 64(BP)(SI*1), Y6, Y9
+ VMOVDQU Y9, 64(SP)(SI*1)
+ MOVL AX, DI
+ RORXL $0x19, DX, R13
+ RORXL $0x0b, DX, R14
+ ADDL 64(SP)(SI*1), R11
+ ORL CX, DI
+ VPALIGNR $0x04, Y4, Y5, Y0
+ MOVL R9, R15
+ RORXL $0x0d, AX, R12
+ XORL R14, R13
+ XORL R10, R15
+ VPADDD Y6, Y0, Y0
+ RORXL $0x06, DX, R14
+ ANDL DX, R15
+ XORL R14, R13
+ RORXL $0x16, AX, R14
+ ADDL R11, R8
+ ANDL BX, DI
+ VPALIGNR $0x04, Y6, Y7, Y1
+ XORL R12, R14
+ RORXL $0x02, AX, R12
+ XORL R10, R15
+ VPSRLD $0x07, Y1, Y2
+ XORL R12, R14
+ MOVL AX, R12
+ ANDL CX, R12
+ ADDL R13, R15
+ VPSLLD $0x19, Y1, Y3
+ ORL R12, DI
+ ADDL R14, R11
+ ADDL R15, R8
+ VPOR Y2, Y3, Y3
+ VPSRLD $0x12, Y1, Y2
+ ADDL R15, R11
+ ADDL DI, R11
+ MOVL R11, DI
+ RORXL $0x19, R8, R13
+ RORXL $0x0b, R8, R14
+ ADDL 68(SP)(SI*1), R10
+ ORL BX, DI
+ VPSRLD $0x03, Y1, Y8
+ MOVL DX, R15
+ RORXL $0x0d, R11, R12
+ XORL R14, R13
+ XORL R9, R15
+ RORXL $0x06, R8, R14
+ XORL R14, R13
+ RORXL $0x16, R11, R14
+ ANDL R8, R15
+ ADDL R10, CX
+ VPSLLD $0x0e, Y1, Y1
+ ANDL AX, DI
+ XORL R12, R14
+ VPXOR Y1, Y3, Y3
+ RORXL $0x02, R11, R12
+ XORL R9, R15
+ VPXOR Y2, Y3, Y3
+ XORL R12, R14
+ MOVL R11, R12
+ ANDL BX, R12
+ ADDL R13, R15
+ VPXOR Y8, Y3, Y1
+ VPSHUFD $0xfa, Y5, Y2
+ ORL R12, DI
+ ADDL R14, R10
+ VPADDD Y1, Y0, Y0
+ ADDL R15, CX
+ ADDL R15, R10
+ ADDL DI, R10
+ VPSRLD $0x0a, Y2, Y8
+ MOVL R10, DI
+ RORXL $0x19, CX, R13
+ ADDL 72(SP)(SI*1), R9
+ VPSRLQ $0x13, Y2, Y3
+ RORXL $0x0b, CX, R14
+ ORL AX, DI
+ MOVL R8, R15
+ XORL DX, R15
+ RORXL $0x0d, R10, R12
+ XORL R14, R13
+ VPSRLQ $0x11, Y2, Y2
+ ANDL CX, R15
+ RORXL $0x06, CX, R14
+ VPXOR Y3, Y2, Y2
+ ADDL R9, BX
+ ANDL R11, DI
+ XORL R14, R13
+ RORXL $0x16, R10, R14
+ VPXOR Y2, Y8, Y8
+ XORL DX, R15
+ VPSHUFB shuff_00BA<>+0(SB), Y8, Y8
+ XORL R12, R14
+ RORXL $0x02, R10, R12
+ VPADDD Y8, Y0, Y0
+ XORL R12, R14
+ MOVL R10, R12
+ ANDL AX, R12
+ ADDL R13, R15
+ VPSHUFD $0x50, Y0, Y2
+ ORL R12, DI
+ ADDL R14, R9
+ ADDL R15, BX
+ ADDL R15, R9
+ ADDL DI, R9
+ MOVL R9, DI
+ RORXL $0x19, BX, R13
+ RORXL $0x0b, BX, R14
+ ADDL 76(SP)(SI*1), DX
+ ORL R11, DI
+ VPSRLD $0x0a, Y2, Y11
+ MOVL CX, R15
+ RORXL $0x0d, R9, R12
+ XORL R14, R13
+ XORL R8, R15
+ VPSRLQ $0x13, Y2, Y3
+ RORXL $0x06, BX, R14
+ ANDL BX, R15
+ ADDL DX, AX
+ ANDL R10, DI
+ VPSRLQ $0x11, Y2, Y2
+ XORL R14, R13
+ XORL R8, R15
+ VPXOR Y3, Y2, Y2
+ RORXL $0x16, R9, R14
+ ADDL R13, R15
+ VPXOR Y2, Y11, Y11
+ XORL R12, R14
+ ADDL R15, AX
+ RORXL $0x02, R9, R12
+ VPSHUFB shuff_DC00<>+0(SB), Y11, Y11
+ VPADDD Y0, Y11, Y6
+ XORL R12, R14
+ MOVL R9, R12
+ ANDL R11, R12
+ ORL R12, DI
+ ADDL R14, DX
+ ADDL R15, DX
+ ADDL DI, DX
// Do 4 rounds and scheduling
- VPADDD 3*32(TBL)(SRND*1), XDWORD3, XFER
- VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
- ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
- ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
- ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
- ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
-
- ADDQ $4*32, SRND
- CMPQ SRND, $3*4*32
- JB avx2_loop1
+ VPADDD 96(BP)(SI*1), Y7, Y9
+ VMOVDQU Y9, 96(SP)(SI*1)
+ MOVL DX, DI
+ RORXL $0x19, AX, R13
+ RORXL $0x0b, AX, R14
+ ADDL 96(SP)(SI*1), R8
+ ORL R10, DI
+ VPALIGNR $0x04, Y5, Y6, Y0
+ MOVL BX, R15
+ RORXL $0x0d, DX, R12
+ XORL R14, R13
+ XORL CX, R15
+ VPADDD Y7, Y0, Y0
+ RORXL $0x06, AX, R14
+ ANDL AX, R15
+ XORL R14, R13
+ RORXL $0x16, DX, R14
+ ADDL R8, R11
+ ANDL R9, DI
+ VPALIGNR $0x04, Y7, Y4, Y1
+ XORL R12, R14
+ RORXL $0x02, DX, R12
+ XORL CX, R15
+ VPSRLD $0x07, Y1, Y2
+ XORL R12, R14
+ MOVL DX, R12
+ ANDL R10, R12
+ ADDL R13, R15
+ VPSLLD $0x19, Y1, Y3
+ ORL R12, DI
+ ADDL R14, R8
+ ADDL R15, R11
+ VPOR Y2, Y3, Y3
+ VPSRLD $0x12, Y1, Y2
+ ADDL R15, R8
+ ADDL DI, R8
+ MOVL R8, DI
+ RORXL $0x19, R11, R13
+ RORXL $0x0b, R11, R14
+ ADDL 100(SP)(SI*1), CX
+ ORL R9, DI
+ VPSRLD $0x03, Y1, Y8
+ MOVL AX, R15
+ RORXL $0x0d, R8, R12
+ XORL R14, R13
+ XORL BX, R15
+ RORXL $0x06, R11, R14
+ XORL R14, R13
+ RORXL $0x16, R8, R14
+ ANDL R11, R15
+ ADDL CX, R10
+ VPSLLD $0x0e, Y1, Y1
+ ANDL DX, DI
+ XORL R12, R14
+ VPXOR Y1, Y3, Y3
+ RORXL $0x02, R8, R12
+ XORL BX, R15
+ VPXOR Y2, Y3, Y3
+ XORL R12, R14
+ MOVL R8, R12
+ ANDL R9, R12
+ ADDL R13, R15
+ VPXOR Y8, Y3, Y1
+ VPSHUFD $0xfa, Y6, Y2
+ ORL R12, DI
+ ADDL R14, CX
+ VPADDD Y1, Y0, Y0
+ ADDL R15, R10
+ ADDL R15, CX
+ ADDL DI, CX
+ VPSRLD $0x0a, Y2, Y8
+ MOVL CX, DI
+ RORXL $0x19, R10, R13
+ ADDL 104(SP)(SI*1), BX
+ VPSRLQ $0x13, Y2, Y3
+ RORXL $0x0b, R10, R14
+ ORL DX, DI
+ MOVL R11, R15
+ XORL AX, R15
+ RORXL $0x0d, CX, R12
+ XORL R14, R13
+ VPSRLQ $0x11, Y2, Y2
+ ANDL R10, R15
+ RORXL $0x06, R10, R14
+ VPXOR Y3, Y2, Y2
+ ADDL BX, R9
+ ANDL R8, DI
+ XORL R14, R13
+ RORXL $0x16, CX, R14
+ VPXOR Y2, Y8, Y8
+ XORL AX, R15
+ VPSHUFB shuff_00BA<>+0(SB), Y8, Y8
+ XORL R12, R14
+ RORXL $0x02, CX, R12
+ VPADDD Y8, Y0, Y0
+ XORL R12, R14
+ MOVL CX, R12
+ ANDL DX, R12
+ ADDL R13, R15
+ VPSHUFD $0x50, Y0, Y2
+ ORL R12, DI
+ ADDL R14, BX
+ ADDL R15, R9
+ ADDL R15, BX
+ ADDL DI, BX
+ MOVL BX, DI
+ RORXL $0x19, R9, R13
+ RORXL $0x0b, R9, R14
+ ADDL 108(SP)(SI*1), AX
+ ORL R8, DI
+ VPSRLD $0x0a, Y2, Y11
+ MOVL R10, R15
+ RORXL $0x0d, BX, R12
+ XORL R14, R13
+ XORL R11, R15
+ VPSRLQ $0x13, Y2, Y3
+ RORXL $0x06, R9, R14
+ ANDL R9, R15
+ ADDL AX, DX
+ ANDL CX, DI
+ VPSRLQ $0x11, Y2, Y2
+ XORL R14, R13
+ XORL R11, R15
+ VPXOR Y3, Y2, Y2
+ RORXL $0x16, BX, R14
+ ADDL R13, R15
+ VPXOR Y2, Y11, Y11
+ XORL R12, R14
+ ADDL R15, DX
+ RORXL $0x02, BX, R12
+ VPSHUFB shuff_DC00<>+0(SB), Y11, Y11
+ VPADDD Y0, Y11, Y7
+ XORL R12, R14
+ MOVL BX, R12
+ ANDL R8, R12
+ ORL R12, DI
+ ADDL R14, AX
+ ADDL R15, AX
+ ADDL DI, AX
+ ADDQ $0x80, SI
+ CMPQ SI, $0x00000180
+ JB avx2_loop1
avx2_loop2:
- // w48 - w63 processed with no scheduling (last 16 rounds)
- VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
- VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
- DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
- DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
- DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
- DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)
-
- VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
- VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
- DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
- DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
- DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
- DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)
-
- ADDQ $2*32, SRND
-
- VMOVDQU XDWORD2, XDWORD0
- VMOVDQU XDWORD3, XDWORD1
-
- CMPQ SRND, $4*4*32
- JB avx2_loop2
-
- MOVQ dig+0(FP), CTX // d.h[8]
- MOVQ _INP(SP), INP
-
- addm( 0(CTX), a)
- addm( 4(CTX), b)
- addm( 8(CTX), c)
- addm( 12(CTX), d)
- addm( 16(CTX), e)
- addm( 20(CTX), f)
- addm( 24(CTX), g)
- addm( 28(CTX), h)
-
- CMPQ _INP_END(SP), INP
- JB done_hash
-
- XORQ SRND, SRND
-
-avx2_loop3: // Do second block using previously scheduled results
- DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
- DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
- DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
- DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)
-
- DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
- DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
- DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
- DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)
-
- ADDQ $2*32, SRND
- CMPQ SRND, $4*4*32
- JB avx2_loop3
-
- MOVQ dig+0(FP), CTX // d.h[8]
- MOVQ _INP(SP), INP
- ADDQ $64, INP
-
- addm( 0(CTX), a)
- addm( 4(CTX), b)
- addm( 8(CTX), c)
- addm( 12(CTX), d)
- addm( 16(CTX), e)
- addm( 20(CTX), f)
- addm( 24(CTX), g)
- addm( 28(CTX), h)
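+ // w48 - w63 processed with no scheduling (last 16 rounds)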
+ VPADDD (BP)(SI*1), Y4, Y9
+ VMOVDQU Y9, (SP)(SI*1)
+ MOVL R9, R15
+ RORXL $0x19, DX, R13
+ RORXL $0x0b, DX, R14
+ XORL R10, R15
+ XORL R14, R13
+ RORXL $0x06, DX, R14
+ ANDL DX, R15
+ XORL R14, R13
+ RORXL $0x0d, AX, R12
+ XORL R10, R15
+ RORXL $0x16, AX, R14
+ MOVL AX, DI
+ XORL R12, R14
+ RORXL $0x02, AX, R12
+ ADDL (SP)(SI*1), R11
+ ORL CX, DI
+ XORL R12, R14
+ MOVL AX, R12
+ ANDL BX, DI
+ ANDL CX, R12
+ ADDL R13, R15
+ ADDL R11, R8
+ ORL R12, DI
+ ADDL R14, R11
+ ADDL R15, R8
+ ADDL R15, R11
+ MOVL DX, R15
+ RORXL $0x19, R8, R13
+ RORXL $0x0b, R8, R14
+ XORL R9, R15
+ XORL R14, R13
+ RORXL $0x06, R8, R14
+ ANDL R8, R15
+ ADDL DI, R11
+ XORL R14, R13
+ RORXL $0x0d, R11, R12
+ XORL R9, R15
+ RORXL $0x16, R11, R14
+ MOVL R11, DI
+ XORL R12, R14
+ RORXL $0x02, R11, R12
+ ADDL 4(SP)(SI*1), R10
+ ORL BX, DI
+ XORL R12, R14
+ MOVL R11, R12
+ ANDL AX, DI
+ ANDL BX, R12
+ ADDL R13, R15
+ ADDL R10, CX
+ ORL R12, DI
+ ADDL R14, R10
+ ADDL R15, CX
+ ADDL R15, R10
+ MOVL R8, R15
+ RORXL $0x19, CX, R13
+ RORXL $0x0b, CX, R14
+ XORL DX, R15
+ XORL R14, R13
+ RORXL $0x06, CX, R14
+ ANDL CX, R15
+ ADDL DI, R10
+ XORL R14, R13
+ RORXL $0x0d, R10, R12
+ XORL DX, R15
+ RORXL $0x16, R10, R14
+ MOVL R10, DI
+ XORL R12, R14
+ RORXL $0x02, R10, R12
+ ADDL 8(SP)(SI*1), R9
+ ORL AX, DI
+ XORL R12, R14
+ MOVL R10, R12
+ ANDL R11, DI
+ ANDL AX, R12
+ ADDL R13, R15
+ ADDL R9, BX
+ ORL R12, DI
+ ADDL R14, R9
+ ADDL R15, BX
+ ADDL R15, R9
+ MOVL CX, R15
+ RORXL $0x19, BX, R13
+ RORXL $0x0b, BX, R14
+ XORL R8, R15
+ XORL R14, R13
+ RORXL $0x06, BX, R14
+ ANDL BX, R15
+ ADDL DI, R9
+ XORL R14, R13
+ RORXL $0x0d, R9, R12
+ XORL R8, R15
+ RORXL $0x16, R9, R14
+ MOVL R9, DI
+ XORL R12, R14
+ RORXL $0x02, R9, R12
+ ADDL 12(SP)(SI*1), DX
+ ORL R11, DI
+ XORL R12, R14
+ MOVL R9, R12
+ ANDL R10, DI
+ ANDL R11, R12
+ ADDL R13, R15
+ ADDL DX, AX
+ ORL R12, DI
+ ADDL R14, DX
+ ADDL R15, AX
+ ADDL R15, DX
+ ADDL DI, DX
+ VPADDD 32(BP)(SI*1), Y5, Y9
+ VMOVDQU Y9, 32(SP)(SI*1)
+ MOVL BX, R15
+ RORXL $0x19, AX, R13
+ RORXL $0x0b, AX, R14
+ XORL CX, R15
+ XORL R14, R13
+ RORXL $0x06, AX, R14
+ ANDL AX, R15
+ XORL R14, R13
+ RORXL $0x0d, DX, R12
+ XORL CX, R15
+ RORXL $0x16, DX, R14
+ MOVL DX, DI
+ XORL R12, R14
+ RORXL $0x02, DX, R12
+ ADDL 32(SP)(SI*1), R8
+ ORL R10, DI
+ XORL R12, R14
+ MOVL DX, R12
+ ANDL R9, DI
+ ANDL R10, R12
+ ADDL R13, R15
+ ADDL R8, R11
+ ORL R12, DI
+ ADDL R14, R8
+ ADDL R15, R11
+ ADDL R15, R8
+ MOVL AX, R15
+ RORXL $0x19, R11, R13
+ RORXL $0x0b, R11, R14
+ XORL BX, R15
+ XORL R14, R13
+ RORXL $0x06, R11, R14
+ ANDL R11, R15
+ ADDL DI, R8
+ XORL R14, R13
+ RORXL $0x0d, R8, R12
+ XORL BX, R15
+ RORXL $0x16, R8, R14
+ MOVL R8, DI
+ XORL R12, R14
+ RORXL $0x02, R8, R12
+ ADDL 36(SP)(SI*1), CX
+ ORL R9, DI
+ XORL R12, R14
+ MOVL R8, R12
+ ANDL DX, DI
+ ANDL R9, R12
+ ADDL R13, R15
+ ADDL CX, R10
+ ORL R12, DI
+ ADDL R14, CX
+ ADDL R15, R10
+ ADDL R15, CX
+ MOVL R11, R15
+ RORXL $0x19, R10, R13
+ RORXL $0x0b, R10, R14
+ XORL AX, R15
+ XORL R14, R13
+ RORXL $0x06, R10, R14
+ ANDL R10, R15
+ ADDL DI, CX
+ XORL R14, R13
+ RORXL $0x0d, CX, R12
+ XORL AX, R15
+ RORXL $0x16, CX, R14
+ MOVL CX, DI
+ XORL R12, R14
+ RORXL $0x02, CX, R12
+ ADDL 40(SP)(SI*1), BX
+ ORL DX, DI
+ XORL R12, R14
+ MOVL CX, R12
+ ANDL R8, DI
+ ANDL DX, R12
+ ADDL R13, R15
+ ADDL BX, R9
+ ORL R12, DI
+ ADDL R14, BX
+ ADDL R15, R9
+ ADDL R15, BX
+ MOVL R10, R15
+ RORXL $0x19, R9, R13
+ RORXL $0x0b, R9, R14
+ XORL R11, R15
+ XORL R14, R13
+ RORXL $0x06, R9, R14
+ ANDL R9, R15
+ ADDL DI, BX
+ XORL R14, R13
+ RORXL $0x0d, BX, R12
+ XORL R11, R15
+ RORXL $0x16, BX, R14
+ MOVL BX, DI
+ XORL R12, R14
+ RORXL $0x02, BX, R12
+ ADDL 44(SP)(SI*1), AX
+ ORL R8, DI
+ XORL R12, R14
+ MOVL BX, R12
+ ANDL CX, DI
+ ANDL R8, R12
+ ADDL R13, R15
+ ADDL AX, DX
+ ORL R12, DI
+ ADDL R14, AX
+ ADDL R15, DX
+ ADDL R15, AX
+ ADDL DI, AX
+ ADDQ $0x40, SI
+ VMOVDQU Y6, Y4
+ VMOVDQU Y7, Y5
+ CMPQ SI, $0x00000200
+ JB avx2_loop2
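+ // add the working state into the digest and reload it for the second block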
+ MOVQ dig+0(FP), SI
+ MOVQ 520(SP), DI
+ ADDL AX, (SI)
+ MOVL (SI), AX
+ ADDL BX, 4(SI)
+ MOVL 4(SI), BX
+ ADDL CX, 8(SI)
+ MOVL 8(SI), CX
+ ADDL R8, 12(SI)
+ MOVL 12(SI), R8
+ ADDL DX, 16(SI)
+ MOVL 16(SI), DX
+ ADDL R9, 20(SI)
+ MOVL 20(SI), R9
+ ADDL R10, 24(SI)
+ MOVL 24(SI), R10
+ ADDL R11, 28(SI)
+ MOVL 28(SI), R11
+ CMPQ 512(SP), DI
+ JB done_hash
+ XORQ SI, SI
- CMPQ _INP_END(SP), INP
- JA avx2_loop0
- JB done_hash
+avx2_loop3:
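+ // Do second block using previously scheduled results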
+ MOVL R9, R15
+ RORXL $0x19, DX, R13
+ RORXL $0x0b, DX, R14
+ XORL R10, R15
+ XORL R14, R13
+ RORXL $0x06, DX, R14
+ ANDL DX, R15
+ XORL R14, R13
+ RORXL $0x0d, AX, R12
+ XORL R10, R15
+ RORXL $0x16, AX, R14
+ MOVL AX, DI
+ XORL R12, R14
+ RORXL $0x02, AX, R12
+ ADDL 16(SP)(SI*1), R11
+ ORL CX, DI
+ XORL R12, R14
+ MOVL AX, R12
+ ANDL BX, DI
+ ANDL CX, R12
+ ADDL R13, R15
+ ADDL R11, R8
+ ORL R12, DI
+ ADDL R14, R11
+ ADDL R15, R8
+ ADDL R15, R11
+ MOVL DX, R15
+ RORXL $0x19, R8, R13
+ RORXL $0x0b, R8, R14
+ XORL R9, R15
+ XORL R14, R13
+ RORXL $0x06, R8, R14
+ ANDL R8, R15
+ ADDL DI, R11
+ XORL R14, R13
+ RORXL $0x0d, R11, R12
+ XORL R9, R15
+ RORXL $0x16, R11, R14
+ MOVL R11, DI
+ XORL R12, R14
+ RORXL $0x02, R11, R12
+ ADDL 20(SP)(SI*1), R10
+ ORL BX, DI
+ XORL R12, R14
+ MOVL R11, R12
+ ANDL AX, DI
+ ANDL BX, R12
+ ADDL R13, R15
+ ADDL R10, CX
+ ORL R12, DI
+ ADDL R14, R10
+ ADDL R15, CX
+ ADDL R15, R10
+ MOVL R8, R15
+ RORXL $0x19, CX, R13
+ RORXL $0x0b, CX, R14
+ XORL DX, R15
+ XORL R14, R13
+ RORXL $0x06, CX, R14
+ ANDL CX, R15
+ ADDL DI, R10
+ XORL R14, R13
+ RORXL $0x0d, R10, R12
+ XORL DX, R15
+ RORXL $0x16, R10, R14
+ MOVL R10, DI
+ XORL R12, R14
+ RORXL $0x02, R10, R12
+ ADDL 24(SP)(SI*1), R9
+ ORL AX, DI
+ XORL R12, R14
+ MOVL R10, R12
+ ANDL R11, DI
+ ANDL AX, R12
+ ADDL R13, R15
+ ADDL R9, BX
+ ORL R12, DI
+ ADDL R14, R9
+ ADDL R15, BX
+ ADDL R15, R9
+ MOVL CX, R15
+ RORXL $0x19, BX, R13
+ RORXL $0x0b, BX, R14
+ XORL R8, R15
+ XORL R14, R13
+ RORXL $0x06, BX, R14
+ ANDL BX, R15
+ ADDL DI, R9
+ XORL R14, R13
+ RORXL $0x0d, R9, R12
+ XORL R8, R15
+ RORXL $0x16, R9, R14
+ MOVL R9, DI
+ XORL R12, R14
+ RORXL $0x02, R9, R12
+ ADDL 28(SP)(SI*1), DX
+ ORL R11, DI
+ XORL R12, R14
+ MOVL R9, R12
+ ANDL R10, DI
+ ANDL R11, R12
+ ADDL R13, R15
+ ADDL DX, AX
+ ORL R12, DI
+ ADDL R14, DX
+ ADDL R15, AX
+ ADDL R15, DX
+ ADDL DI, DX
+ MOVL BX, R15
+ RORXL $0x19, AX, R13
+ RORXL $0x0b, AX, R14
+ XORL CX, R15
+ XORL R14, R13
+ RORXL $0x06, AX, R14
+ ANDL AX, R15
+ XORL R14, R13
+ RORXL $0x0d, DX, R12
+ XORL CX, R15
+ RORXL $0x16, DX, R14
+ MOVL DX, DI
+ XORL R12, R14
+ RORXL $0x02, DX, R12
+ ADDL 48(SP)(SI*1), R8
+ ORL R10, DI
+ XORL R12, R14
+ MOVL DX, R12
+ ANDL R9, DI
+ ANDL R10, R12
+ ADDL R13, R15
+ ADDL R8, R11
+ ORL R12, DI
+ ADDL R14, R8
+ ADDL R15, R11
+ ADDL R15, R8
+ MOVL AX, R15
+ RORXL $0x19, R11, R13
+ RORXL $0x0b, R11, R14
+ XORL BX, R15
+ XORL R14, R13
+ RORXL $0x06, R11, R14
+ ANDL R11, R15
+ ADDL DI, R8
+ XORL R14, R13
+ RORXL $0x0d, R8, R12
+ XORL BX, R15
+ RORXL $0x16, R8, R14
+ MOVL R8, DI
+ XORL R12, R14
+ RORXL $0x02, R8, R12
+ ADDL 52(SP)(SI*1), CX
+ ORL R9, DI
+ XORL R12, R14
+ MOVL R8, R12
+ ANDL DX, DI
+ ANDL R9, R12
+ ADDL R13, R15
+ ADDL CX, R10
+ ORL R12, DI
+ ADDL R14, CX
+ ADDL R15, R10
+ ADDL R15, CX
+ MOVL R11, R15
+ RORXL $0x19, R10, R13
+ RORXL $0x0b, R10, R14
+ XORL AX, R15
+ XORL R14, R13
+ RORXL $0x06, R10, R14
+ ANDL R10, R15
+ ADDL DI, CX
+ XORL R14, R13
+ RORXL $0x0d, CX, R12
+ XORL AX, R15
+ RORXL $0x16, CX, R14
+ MOVL CX, DI
+ XORL R12, R14
+ RORXL $0x02, CX, R12
+ ADDL 56(SP)(SI*1), BX
+ ORL DX, DI
+ XORL R12, R14
+ MOVL CX, R12
+ ANDL R8, DI
+ ANDL DX, R12
+ ADDL R13, R15
+ ADDL BX, R9
+ ORL R12, DI
+ ADDL R14, BX
+ ADDL R15, R9
+ ADDL R15, BX
+ MOVL R10, R15
+ RORXL $0x19, R9, R13
+ RORXL $0x0b, R9, R14
+ XORL R11, R15
+ XORL R14, R13
+ RORXL $0x06, R9, R14
+ ANDL R9, R15
+ ADDL DI, BX
+ XORL R14, R13
+ RORXL $0x0d, BX, R12
+ XORL R11, R15
+ RORXL $0x16, BX, R14
+ MOVL BX, DI
+ XORL R12, R14
+ RORXL $0x02, BX, R12
+ ADDL 60(SP)(SI*1), AX
+ ORL R8, DI
+ XORL R12, R14
+ MOVL BX, R12
+ ANDL CX, DI
+ ANDL R8, R12
+ ADDL R13, R15
+ ADDL AX, DX
+ ORL R12, DI
+ ADDL R14, AX
+ ADDL R15, DX
+ ADDL R15, AX
+ ADDL DI, AX
+ ADDQ $0x40, SI
+ CMPQ SI, $0x00000200
+ JB avx2_loop3
+ MOVQ dig+0(FP), SI
+ MOVQ 520(SP), DI
+ ADDQ $0x40, DI
+ ADDL AX, (SI)
+ MOVL (SI), AX
+ ADDL BX, 4(SI)
+ MOVL 4(SI), BX
+ ADDL CX, 8(SI)
+ MOVL 8(SI), CX
+ ADDL R8, 12(SI)
+ MOVL 12(SI), R8
+ ADDL DX, 16(SI)
+ MOVL 16(SI), DX
+ ADDL R9, 20(SI)
+ MOVL 20(SI), R9
+ ADDL R10, 24(SI)
+ MOVL 24(SI), R10
+ ADDL R11, 28(SI)
+ MOVL 28(SI), R11
+ CMPQ 512(SP), DI
+ JA avx2_loop0
+ JB done_hash
avx2_do_last_block:
-
- VMOVDQU 0(INP), XWORD0
- VMOVDQU 16(INP), XWORD1
- VMOVDQU 32(INP), XWORD2
- VMOVDQU 48(INP), XWORD3
-
- VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
-
- VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
- VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
- VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
- VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
-
- MOVQ $K256<>(SB), TBL
-
- JMP avx2_last_block_enter
+ VMOVDQU (DI), X4
+ VMOVDQU 16(DI), X5
+ VMOVDQU 32(DI), X6
+ VMOVDQU 48(DI), X7
+ VMOVDQU flip_mask<>+0(SB), Y13
+ VPSHUFB X13, X4, X4
+ VPSHUFB X13, X5, X5
+ VPSHUFB X13, X6, X6
+ VPSHUFB X13, X7, X7
+ LEAQ K256<>+0(SB), BP
+ JMP avx2_last_block_enter
avx2_only_one_block:
- // Load initial digest
- MOVL 0(CTX), a // a = H0
- MOVL 4(CTX), b // b = H1
- MOVL 8(CTX), c // c = H2
- MOVL 12(CTX), d // d = H3
- MOVL 16(CTX), e // e = H4
- MOVL 20(CTX), f // f = H5
- MOVL 24(CTX), g // g = H6
- MOVL 28(CTX), h // h = H7
-
- JMP avx2_do_last_block
+ MOVL (SI), AX
+ MOVL 4(SI), BX
+ MOVL 8(SI), CX
+ MOVL 12(SI), R8
+ MOVL 16(SI), DX
+ MOVL 20(SI), R9
+ MOVL 24(SI), R10
+ MOVL 28(SI), R11
+ JMP avx2_do_last_block
done_hash:
VZEROUPPER
RET
sha_ni:
- MOVQ dig+0(FP), digestPtr // init digest hash vector H0, H1,..., H7 pointer
- MOVQ p_base+8(FP), dataPtr // init input data base pointer
- MOVQ p_len+16(FP), numBytes // get number of input bytes to hash
- SHRQ $6, numBytes // force modulo 64 input buffer length
- SHLQ $6, numBytes
- CMPQ numBytes, $0 // exit early for zero-length input buffer
- JEQ done
- ADDQ dataPtr, numBytes // point numBytes to end of input buffer
- VMOVDQU (0*16)(digestPtr), state0 // load initial hash values and reorder
- VMOVDQU (1*16)(digestPtr), state1 // DCBA, HGFE -> ABEF, CDGH
- PSHUFD $0xb1, state0, state0 // CDAB
- PSHUFD $0x1b, state1, state1 // EFGH
- VMOVDQA state0, m4
- PALIGNR $8, state1, state0 // ABEF
- PBLENDW $0xf0, m4, state1 // CDGH
- VMOVDQA flip_mask<>(SB), shufMask
- LEAQ K256<>(SB), sha256Constants
+ MOVQ dig+0(FP), DI
+ MOVQ p_base+8(FP), SI
+ MOVQ p_len+16(FP), DX
+ SHRQ $0x06, DX
+ SHLQ $0x06, DX
+ CMPQ DX, $0x00
+ JEQ done
+ ADDQ SI, DX
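+ // DX = end of the input buffer (length rounded down to a multiple of 64 bytes)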
+ VMOVDQU (DI), X1
+ VMOVDQU 16(DI), X2
+ PSHUFD $0xb1, X1, X1
+ PSHUFD $0x1b, X2, X2
+ VMOVDQA X1, X7
+ PALIGNR $0x08, X2, X1
+ PBLENDW $0xf0, X7, X2
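+ // reorder the digest for SHA-NI: X1 = ABEF, X2 = CDGH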
+ VMOVDQA flip_mask<>+0(SB), X8
+ LEAQ K256<>+0(SB), AX
roundLoop:
// save hash values for addition after rounds
- VMOVDQA state0, abefSave
- VMOVDQA state1, cdghSave
+ VMOVDQA X1, X9
+ VMOVDQA X2, X10
// do rounds 0-59
- rounds0to11 (m0,-,0,nop) // 0-3
- rounds0to11 (m1,m0,1,sha256msg1) // 4-7
- rounds0to11 (m2,m1,2,sha256msg1) // 8-11
- VMOVDQU (3*16)(dataPtr), msg
- PSHUFB shufMask, msg
- rounds12to59 (m3,3,m2,m0,sha256msg1,vmovrev) // 12-15
- rounds12to59 (m0,4,m3,m1,sha256msg1,vmov) // 16-19
- rounds12to59 (m1,5,m0,m2,sha256msg1,vmov) // 20-23
- rounds12to59 (m2,6,m1,m3,sha256msg1,vmov) // 24-27
- rounds12to59 (m3,7,m2,m0,sha256msg1,vmov) // 28-31
- rounds12to59 (m0,8,m3,m1,sha256msg1,vmov) // 32-35
- rounds12to59 (m1,9,m0,m2,sha256msg1,vmov) // 36-39
- rounds12to59 (m2,10,m1,m3,sha256msg1,vmov) // 40-43
- rounds12to59 (m3,11,m2,m0,sha256msg1,vmov) // 44-47
- rounds12to59 (m0,12,m3,m1,sha256msg1,vmov) // 48-51
- rounds12to59 (m1,13,m0,m2,nop,vmov) // 52-55
- rounds12to59 (m2,14,m1,m3,nop,vmov) // 56-59
+ VMOVDQU (SI), X0
+ PSHUFB X8, X0
+ VMOVDQA X0, X3
+ PADDD (AX), X0
+ SHA256RNDS2 X0, X1, X2
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ VMOVDQU 16(SI), X0
+ PSHUFB X8, X0
+ VMOVDQA X0, X4
+ PADDD 32(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ SHA256MSG1 X4, X3
+ VMOVDQU 32(SI), X0
+ PSHUFB X8, X0
+ VMOVDQA X0, X5
+ PADDD 64(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ SHA256MSG1 X5, X4
+ VMOVDQU 48(SI), X0
+ PSHUFB X8, X0
+ VMOVDQA X0, X6
+ PADDD 96(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ VMOVDQA X6, X7
+ PALIGNR $0x04, X5, X7
+ PADDD X7, X3
+ SHA256MSG2 X6, X3
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ SHA256MSG1 X6, X5
+ VMOVDQA X3, X0
+ PADDD 128(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ VMOVDQA X3, X7
+ PALIGNR $0x04, X6, X7
+ PADDD X7, X4
+ SHA256MSG2 X3, X4
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ SHA256MSG1 X3, X6
+ VMOVDQA X4, X0
+ PADDD 160(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ VMOVDQA X4, X7
+ PALIGNR $0x04, X3, X7
+ PADDD X7, X5
+ SHA256MSG2 X4, X5
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ SHA256MSG1 X4, X3
+ VMOVDQA X5, X0
+ PADDD 192(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ VMOVDQA X5, X7
+ PALIGNR $0x04, X4, X7
+ PADDD X7, X6
+ SHA256MSG2 X5, X6
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ SHA256MSG1 X5, X4
+ VMOVDQA X6, X0
+ PADDD 224(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ VMOVDQA X6, X7
+ PALIGNR $0x04, X5, X7
+ PADDD X7, X3
+ SHA256MSG2 X6, X3
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ SHA256MSG1 X6, X5
+ VMOVDQA X3, X0
+ PADDD 256(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ VMOVDQA X3, X7
+ PALIGNR $0x04, X6, X7
+ PADDD X7, X4
+ SHA256MSG2 X3, X4
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ SHA256MSG1 X3, X6
+ VMOVDQA X4, X0
+ PADDD 288(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ VMOVDQA X4, X7
+ PALIGNR $0x04, X3, X7
+ PADDD X7, X5
+ SHA256MSG2 X4, X5
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ SHA256MSG1 X4, X3
+ VMOVDQA X5, X0
+ PADDD 320(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ VMOVDQA X5, X7
+ PALIGNR $0x04, X4, X7
+ PADDD X7, X6
+ SHA256MSG2 X5, X6
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ SHA256MSG1 X5, X4
+ VMOVDQA X6, X0
+ PADDD 352(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ VMOVDQA X6, X7
+ PALIGNR $0x04, X5, X7
+ PADDD X7, X3
+ SHA256MSG2 X6, X3
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ SHA256MSG1 X6, X5
+ VMOVDQA X3, X0
+ PADDD 384(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ VMOVDQA X3, X7
+ PALIGNR $0x04, X6, X7
+ PADDD X7, X4
+ SHA256MSG2 X3, X4
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ SHA256MSG1 X3, X6
+ VMOVDQA X4, X0
+ PADDD 416(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ VMOVDQA X4, X7
+ PALIGNR $0x04, X3, X7
+ PADDD X7, X5
+ SHA256MSG2 X4, X5
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
+ VMOVDQA X5, X0
+ PADDD 448(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ VMOVDQA X5, X7
+ PALIGNR $0x04, X4, X7
+ PADDD X7, X6
+ SHA256MSG2 X5, X6
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
// do rounds 60-63
- VMOVDQA m3, msg
- PADDD (15*32)(sha256Constants), msg
- SHA256RNDS2 msg, state0, state1
- PSHUFD $0x0e, msg, msg
- SHA256RNDS2 msg, state1, state0
+ VMOVDQA X6, X0
+ PADDD 480(AX), X0
+ SHA256RNDS2 X0, X1, X2
+ PSHUFD $0x0e, X0, X0
+ SHA256RNDS2 X0, X2, X1
// add current hash values with previously saved
- PADDD abefSave, state0
- PADDD cdghSave, state1
+ PADDD X9, X1
+ PADDD X10, X2
// advance data pointer; loop until buffer empty
- ADDQ $64, dataPtr
- CMPQ numBytes, dataPtr
- JNE roundLoop
+ ADDQ $0x40, SI
+ CMPQ DX, SI
+ JNE roundLoop
// write hash values back in the correct order
- PSHUFD $0x1b, state0, state0 // FEBA
- PSHUFD $0xb1, state1, state1 // DCHG
- VMOVDQA state0, m4
- PBLENDW $0xf0, state1, state0 // DCBA
- PALIGNR $8, m4, state1 // HGFE
- VMOVDQU state0, (0*16)(digestPtr)
- VMOVDQU state1, (1*16)(digestPtr)
+ PSHUFD $0x1b, X1, X1
+ PSHUFD $0xb1, X2, X2
+ VMOVDQA X1, X7
+ PBLENDW $0xf0, X2, X1
+ PALIGNR $0x08, X7, X2
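+ // X1 = DCBA, X2 = HGFE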
+ VMOVDQU X1, (DI)
+ VMOVDQU X2, 16(DI)
done:
RET
-// shuffle byte order from LE to BE
-DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
-DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
-DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
-DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
-GLOBL flip_mask<>(SB), 8, $32
-
-// shuffle xBxA -> 00BA
-DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
-DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
-DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
-DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
-GLOBL shuff_00BA<>(SB), 8, $32
-
-// shuffle xDxC -> DC00
-DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
-DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
-DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
-DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
-GLOBL shuff_DC00<>(SB), 8, $32
-
-// Round specific constants
-DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
-DATA K256<>+0x04(SB)/4, $0x71374491 // k2
-DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
-DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
-DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
-DATA K256<>+0x14(SB)/4, $0x71374491 // k2
-DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
-DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4
-
-DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
-DATA K256<>+0x24(SB)/4, $0x59f111f1
-DATA K256<>+0x28(SB)/4, $0x923f82a4
-DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
-DATA K256<>+0x30(SB)/4, $0x3956c25b
-DATA K256<>+0x34(SB)/4, $0x59f111f1
-DATA K256<>+0x38(SB)/4, $0x923f82a4
-DATA K256<>+0x3c(SB)/4, $0xab1c5ed5
-
-DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
-DATA K256<>+0x44(SB)/4, $0x12835b01
-DATA K256<>+0x48(SB)/4, $0x243185be
-DATA K256<>+0x4c(SB)/4, $0x550c7dc3
-DATA K256<>+0x50(SB)/4, $0xd807aa98
-DATA K256<>+0x54(SB)/4, $0x12835b01
-DATA K256<>+0x58(SB)/4, $0x243185be
-DATA K256<>+0x5c(SB)/4, $0x550c7dc3
-
-DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
-DATA K256<>+0x64(SB)/4, $0x80deb1fe
-DATA K256<>+0x68(SB)/4, $0x9bdc06a7
-DATA K256<>+0x6c(SB)/4, $0xc19bf174
-DATA K256<>+0x70(SB)/4, $0x72be5d74
-DATA K256<>+0x74(SB)/4, $0x80deb1fe
-DATA K256<>+0x78(SB)/4, $0x9bdc06a7
-DATA K256<>+0x7c(SB)/4, $0xc19bf174
-
-DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
-DATA K256<>+0x84(SB)/4, $0xefbe4786
-DATA K256<>+0x88(SB)/4, $0x0fc19dc6
-DATA K256<>+0x8c(SB)/4, $0x240ca1cc
-DATA K256<>+0x90(SB)/4, $0xe49b69c1
-DATA K256<>+0x94(SB)/4, $0xefbe4786
-DATA K256<>+0x98(SB)/4, $0x0fc19dc6
-DATA K256<>+0x9c(SB)/4, $0x240ca1cc
-
-DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
-DATA K256<>+0xa4(SB)/4, $0x4a7484aa
-DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
-DATA K256<>+0xac(SB)/4, $0x76f988da
-DATA K256<>+0xb0(SB)/4, $0x2de92c6f
-DATA K256<>+0xb4(SB)/4, $0x4a7484aa
-DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
-DATA K256<>+0xbc(SB)/4, $0x76f988da
-
-DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
-DATA K256<>+0xc4(SB)/4, $0xa831c66d
-DATA K256<>+0xc8(SB)/4, $0xb00327c8
-DATA K256<>+0xcc(SB)/4, $0xbf597fc7
-DATA K256<>+0xd0(SB)/4, $0x983e5152
-DATA K256<>+0xd4(SB)/4, $0xa831c66d
-DATA K256<>+0xd8(SB)/4, $0xb00327c8
-DATA K256<>+0xdc(SB)/4, $0xbf597fc7
-
-DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
-DATA K256<>+0xe4(SB)/4, $0xd5a79147
-DATA K256<>+0xe8(SB)/4, $0x06ca6351
-DATA K256<>+0xec(SB)/4, $0x14292967
-DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
-DATA K256<>+0xf4(SB)/4, $0xd5a79147
-DATA K256<>+0xf8(SB)/4, $0x06ca6351
-DATA K256<>+0xfc(SB)/4, $0x14292967
-
-DATA K256<>+0x100(SB)/4, $0x27b70a85
-DATA K256<>+0x104(SB)/4, $0x2e1b2138
-DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
-DATA K256<>+0x10c(SB)/4, $0x53380d13
-DATA K256<>+0x110(SB)/4, $0x27b70a85
-DATA K256<>+0x114(SB)/4, $0x2e1b2138
-DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
-DATA K256<>+0x11c(SB)/4, $0x53380d13
-
-DATA K256<>+0x120(SB)/4, $0x650a7354
-DATA K256<>+0x124(SB)/4, $0x766a0abb
-DATA K256<>+0x128(SB)/4, $0x81c2c92e
-DATA K256<>+0x12c(SB)/4, $0x92722c85
-DATA K256<>+0x130(SB)/4, $0x650a7354
-DATA K256<>+0x134(SB)/4, $0x766a0abb
-DATA K256<>+0x138(SB)/4, $0x81c2c92e
-DATA K256<>+0x13c(SB)/4, $0x92722c85
-
-DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
-DATA K256<>+0x144(SB)/4, $0xa81a664b
-DATA K256<>+0x148(SB)/4, $0xc24b8b70
-DATA K256<>+0x14c(SB)/4, $0xc76c51a3
-DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
-DATA K256<>+0x154(SB)/4, $0xa81a664b
-DATA K256<>+0x158(SB)/4, $0xc24b8b70
-DATA K256<>+0x15c(SB)/4, $0xc76c51a3
-
-DATA K256<>+0x160(SB)/4, $0xd192e819
-DATA K256<>+0x164(SB)/4, $0xd6990624
-DATA K256<>+0x168(SB)/4, $0xf40e3585
-DATA K256<>+0x16c(SB)/4, $0x106aa070
-DATA K256<>+0x170(SB)/4, $0xd192e819
-DATA K256<>+0x174(SB)/4, $0xd6990624
-DATA K256<>+0x178(SB)/4, $0xf40e3585
-DATA K256<>+0x17c(SB)/4, $0x106aa070
-
-DATA K256<>+0x180(SB)/4, $0x19a4c116
-DATA K256<>+0x184(SB)/4, $0x1e376c08
-DATA K256<>+0x188(SB)/4, $0x2748774c
-DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
-DATA K256<>+0x190(SB)/4, $0x19a4c116
-DATA K256<>+0x194(SB)/4, $0x1e376c08
-DATA K256<>+0x198(SB)/4, $0x2748774c
-DATA K256<>+0x19c(SB)/4, $0x34b0bcb5
-
-DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
-DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
-DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
-DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
-DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
-DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
-DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
-DATA K256<>+0x1bc(SB)/4, $0x682e6ff3
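+// shuffle byte order from LE to BE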
+DATA flip_mask<>+0(SB)/8, $0x0405060700010203
+DATA flip_mask<>+8(SB)/8, $0x0c0d0e0f08090a0b
+DATA flip_mask<>+16(SB)/8, $0x0405060700010203
+DATA flip_mask<>+24(SB)/8, $0x0c0d0e0f08090a0b
+GLOBL flip_mask<>(SB), RODATA, $32
-DATA K256<>+0x1c0(SB)/4, $0x748f82ee
-DATA K256<>+0x1c4(SB)/4, $0x78a5636f
-DATA K256<>+0x1c8(SB)/4, $0x84c87814
-DATA K256<>+0x1cc(SB)/4, $0x8cc70208
-DATA K256<>+0x1d0(SB)/4, $0x748f82ee
-DATA K256<>+0x1d4(SB)/4, $0x78a5636f
-DATA K256<>+0x1d8(SB)/4, $0x84c87814
-DATA K256<>+0x1dc(SB)/4, $0x8cc70208
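+// round-specific constants; each group of four words is stored twice so both 128-bit lanes of a 256-bit load see the same constants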
+DATA K256<>+0(SB)/4, $0x428a2f98
+DATA K256<>+4(SB)/4, $0x71374491
+DATA K256<>+8(SB)/4, $0xb5c0fbcf
+DATA K256<>+12(SB)/4, $0xe9b5dba5
+DATA K256<>+16(SB)/4, $0x428a2f98
+DATA K256<>+20(SB)/4, $0x71374491
+DATA K256<>+24(SB)/4, $0xb5c0fbcf
+DATA K256<>+28(SB)/4, $0xe9b5dba5
+DATA K256<>+32(SB)/4, $0x3956c25b
+DATA K256<>+36(SB)/4, $0x59f111f1
+DATA K256<>+40(SB)/4, $0x923f82a4
+DATA K256<>+44(SB)/4, $0xab1c5ed5
+DATA K256<>+48(SB)/4, $0x3956c25b
+DATA K256<>+52(SB)/4, $0x59f111f1
+DATA K256<>+56(SB)/4, $0x923f82a4
+DATA K256<>+60(SB)/4, $0xab1c5ed5
+DATA K256<>+64(SB)/4, $0xd807aa98
+DATA K256<>+68(SB)/4, $0x12835b01
+DATA K256<>+72(SB)/4, $0x243185be
+DATA K256<>+76(SB)/4, $0x550c7dc3
+DATA K256<>+80(SB)/4, $0xd807aa98
+DATA K256<>+84(SB)/4, $0x12835b01
+DATA K256<>+88(SB)/4, $0x243185be
+DATA K256<>+92(SB)/4, $0x550c7dc3
+DATA K256<>+96(SB)/4, $0x72be5d74
+DATA K256<>+100(SB)/4, $0x80deb1fe
+DATA K256<>+104(SB)/4, $0x9bdc06a7
+DATA K256<>+108(SB)/4, $0xc19bf174
+DATA K256<>+112(SB)/4, $0x72be5d74
+DATA K256<>+116(SB)/4, $0x80deb1fe
+DATA K256<>+120(SB)/4, $0x9bdc06a7
+DATA K256<>+124(SB)/4, $0xc19bf174
+DATA K256<>+128(SB)/4, $0xe49b69c1
+DATA K256<>+132(SB)/4, $0xefbe4786
+DATA K256<>+136(SB)/4, $0x0fc19dc6
+DATA K256<>+140(SB)/4, $0x240ca1cc
+DATA K256<>+144(SB)/4, $0xe49b69c1
+DATA K256<>+148(SB)/4, $0xefbe4786
+DATA K256<>+152(SB)/4, $0x0fc19dc6
+DATA K256<>+156(SB)/4, $0x240ca1cc
+DATA K256<>+160(SB)/4, $0x2de92c6f
+DATA K256<>+164(SB)/4, $0x4a7484aa
+DATA K256<>+168(SB)/4, $0x5cb0a9dc
+DATA K256<>+172(SB)/4, $0x76f988da
+DATA K256<>+176(SB)/4, $0x2de92c6f
+DATA K256<>+180(SB)/4, $0x4a7484aa
+DATA K256<>+184(SB)/4, $0x5cb0a9dc
+DATA K256<>+188(SB)/4, $0x76f988da
+DATA K256<>+192(SB)/4, $0x983e5152
+DATA K256<>+196(SB)/4, $0xa831c66d
+DATA K256<>+200(SB)/4, $0xb00327c8
+DATA K256<>+204(SB)/4, $0xbf597fc7
+DATA K256<>+208(SB)/4, $0x983e5152
+DATA K256<>+212(SB)/4, $0xa831c66d
+DATA K256<>+216(SB)/4, $0xb00327c8
+DATA K256<>+220(SB)/4, $0xbf597fc7
+DATA K256<>+224(SB)/4, $0xc6e00bf3
+DATA K256<>+228(SB)/4, $0xd5a79147
+DATA K256<>+232(SB)/4, $0x06ca6351
+DATA K256<>+236(SB)/4, $0x14292967
+DATA K256<>+240(SB)/4, $0xc6e00bf3
+DATA K256<>+244(SB)/4, $0xd5a79147
+DATA K256<>+248(SB)/4, $0x06ca6351
+DATA K256<>+252(SB)/4, $0x14292967
+DATA K256<>+256(SB)/4, $0x27b70a85
+DATA K256<>+260(SB)/4, $0x2e1b2138
+DATA K256<>+264(SB)/4, $0x4d2c6dfc
+DATA K256<>+268(SB)/4, $0x53380d13
+DATA K256<>+272(SB)/4, $0x27b70a85
+DATA K256<>+276(SB)/4, $0x2e1b2138
+DATA K256<>+280(SB)/4, $0x4d2c6dfc
+DATA K256<>+284(SB)/4, $0x53380d13
+DATA K256<>+288(SB)/4, $0x650a7354
+DATA K256<>+292(SB)/4, $0x766a0abb
+DATA K256<>+296(SB)/4, $0x81c2c92e
+DATA K256<>+300(SB)/4, $0x92722c85
+DATA K256<>+304(SB)/4, $0x650a7354
+DATA K256<>+308(SB)/4, $0x766a0abb
+DATA K256<>+312(SB)/4, $0x81c2c92e
+DATA K256<>+316(SB)/4, $0x92722c85
+DATA K256<>+320(SB)/4, $0xa2bfe8a1
+DATA K256<>+324(SB)/4, $0xa81a664b
+DATA K256<>+328(SB)/4, $0xc24b8b70
+DATA K256<>+332(SB)/4, $0xc76c51a3
+DATA K256<>+336(SB)/4, $0xa2bfe8a1
+DATA K256<>+340(SB)/4, $0xa81a664b
+DATA K256<>+344(SB)/4, $0xc24b8b70
+DATA K256<>+348(SB)/4, $0xc76c51a3
+DATA K256<>+352(SB)/4, $0xd192e819
+DATA K256<>+356(SB)/4, $0xd6990624
+DATA K256<>+360(SB)/4, $0xf40e3585
+DATA K256<>+364(SB)/4, $0x106aa070
+DATA K256<>+368(SB)/4, $0xd192e819
+DATA K256<>+372(SB)/4, $0xd6990624
+DATA K256<>+376(SB)/4, $0xf40e3585
+DATA K256<>+380(SB)/4, $0x106aa070
+DATA K256<>+384(SB)/4, $0x19a4c116
+DATA K256<>+388(SB)/4, $0x1e376c08
+DATA K256<>+392(SB)/4, $0x2748774c
+DATA K256<>+396(SB)/4, $0x34b0bcb5
+DATA K256<>+400(SB)/4, $0x19a4c116
+DATA K256<>+404(SB)/4, $0x1e376c08
+DATA K256<>+408(SB)/4, $0x2748774c
+DATA K256<>+412(SB)/4, $0x34b0bcb5
+DATA K256<>+416(SB)/4, $0x391c0cb3
+DATA K256<>+420(SB)/4, $0x4ed8aa4a
+DATA K256<>+424(SB)/4, $0x5b9cca4f
+DATA K256<>+428(SB)/4, $0x682e6ff3
+DATA K256<>+432(SB)/4, $0x391c0cb3
+DATA K256<>+436(SB)/4, $0x4ed8aa4a
+DATA K256<>+440(SB)/4, $0x5b9cca4f
+DATA K256<>+444(SB)/4, $0x682e6ff3
+DATA K256<>+448(SB)/4, $0x748f82ee
+DATA K256<>+452(SB)/4, $0x78a5636f
+DATA K256<>+456(SB)/4, $0x84c87814
+DATA K256<>+460(SB)/4, $0x8cc70208
+DATA K256<>+464(SB)/4, $0x748f82ee
+DATA K256<>+468(SB)/4, $0x78a5636f
+DATA K256<>+472(SB)/4, $0x84c87814
+DATA K256<>+476(SB)/4, $0x8cc70208
+DATA K256<>+480(SB)/4, $0x90befffa
+DATA K256<>+484(SB)/4, $0xa4506ceb
+DATA K256<>+488(SB)/4, $0xbef9a3f7
+DATA K256<>+492(SB)/4, $0xc67178f2
+DATA K256<>+496(SB)/4, $0x90befffa
+DATA K256<>+500(SB)/4, $0xa4506ceb
+DATA K256<>+504(SB)/4, $0xbef9a3f7
+DATA K256<>+508(SB)/4, $0xc67178f2
+GLOBL K256<>(SB), RODATA|NOPTR, $512
-DATA K256<>+0x1e0(SB)/4, $0x90befffa
-DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
-DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
-DATA K256<>+0x1ec(SB)/4, $0xc67178f2
-DATA K256<>+0x1f0(SB)/4, $0x90befffa
-DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
-DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
-DATA K256<>+0x1fc(SB)/4, $0xc67178f2
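+// shuffle xBxA -> 00BA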
+DATA shuff_00BA<>+0(SB)/8, $0x0b0a090803020100
+DATA shuff_00BA<>+8(SB)/8, $0xffffffffffffffff
+DATA shuff_00BA<>+16(SB)/8, $0x0b0a090803020100
+DATA shuff_00BA<>+24(SB)/8, $0xffffffffffffffff
+GLOBL shuff_00BA<>(SB), RODATA, $32
-GLOBL K256<>(SB), (NOPTR + RODATA), $512
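+// shuffle xDxC -> DC00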
+DATA shuff_DC00<>+0(SB)/8, $0xffffffffffffffff
+DATA shuff_DC00<>+8(SB)/8, $0x0b0a090803020100
+DATA shuff_DC00<>+16(SB)/8, $0xffffffffffffffff
+DATA shuff_DC00<>+24(SB)/8, $0x0b0a090803020100
+GLOBL shuff_DC00<>(SB), RODATA, $32