From: Filippo Valsorda Date: Thu, 19 Sep 2024 18:04:30 +0000 (+0200) Subject: crypto/sha256,crypto/sha512: make assembly structure consistent X-Git-Tag: go1.24rc1~626 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=a7650080302173d2de33ca35da20fc592c5fc0b6;p=gostls13.git crypto/sha256,crypto/sha512: make assembly structure consistent Ensure separate implementations are implemented in different functions called from Go, and that they can be turned off from a GODEBUG. This will be necessary to test implementations separately for #69536. Change-Id: I3e081deb7abb01b0665265e39c72fd4037dd48b3 Cq-Include-Trybots: luci.golang.try:gotip-linux-arm64-longtest,gotip-linux-amd64-longtest,gotip-linux-ppc64le_power8,gotip-linux-ppc64_power8 Reviewed-on: https://go-review.googlesource.com/c/go/+/614495 Reviewed-by: Daniel McCarney Auto-Submit: Filippo Valsorda Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Pratt --- diff --git a/src/crypto/sha256/_asm/sha256block_amd64_asm.go b/src/crypto/sha256/_asm/sha256block_amd64_asm.go index 3c70e018ce..24256185bc 100644 --- a/src/crypto/sha256/_asm/sha256block_amd64_asm.go +++ b/src/crypto/sha256/_asm/sha256block_amd64_asm.go @@ -18,17 +18,6 @@ import ( // // https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf -// The avx2-version is described in an Intel White-Paper: -// "Fast SHA-256 Implementations on Intel Architecture Processors" -// To find it, surf to http://www.intel.com/p/en_US/embedded -// and search for that title. -// AVX2 version by Intel, same algorithm as code in Linux kernel: -// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S -// by -// James Guilford -// Kirk Yap -// Tim Chen - // Wt = Mt; for 0 <= t <= 15 // Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63 // @@ -66,7 +55,9 @@ import ( func main() { Package("crypto/sha256") ConstraintExpr("!purego") - block() + blockAMD64() + blockAVX2() + blockSHANI() Generate() } @@ -176,519 +167,10 @@ func sha256Round1(index int, konst uint32, a, b, c, d, e, f, g, h GPPhysical) { sha256Round(index, konst, a, b, c, d, e, f, g, h) } -// Definitions for AVX2 version - -// addm (mem), reg -// - Add reg to mem using reg-mem add and store -func addm(P1 Mem, P2 GPPhysical) { - ADDL(P2, P1) - MOVL(P1, P2) -} - -var ( - XDWORD0 VecPhysical = Y4 - XDWORD1 = Y5 - XDWORD2 = Y6 - XDWORD3 = Y7 - - XWORD0 = X4 - XWORD1 = X5 - XWORD2 = X6 - XWORD3 = X7 - - XTMP0 = Y0 - XTMP1 = Y1 - XTMP2 = Y2 - XTMP3 = Y3 - XTMP4 = Y8 - XTMP5 = Y11 - - XFER = Y9 - - BYTE_FLIP_MASK = Y13 // mask to convert LE -> BE - X_BYTE_FLIP_MASK = X13 - - NUM_BYTES GPPhysical = RDX - INP = RDI - - CTX = RSI // Beginning of digest in memory (a, b, c, ... 
, h) - - a = EAX - b = EBX - c = ECX - d = R8L - e = EDX - f = R9L - g = R10L - h = R11L - - old_h = R11L - - TBL = RBP - - SRND = RSI // SRND is same register as CTX - - T1 = R12L - - y0 = R13L - y1 = R14L - y2 = R15L - y3 = EDI - - // Offsets - XFER_SIZE = 2 * 64 * 4 - INP_END_SIZE = 8 - INP_SIZE = 8 - - _XFER = 0 - _INP_END = _XFER + XFER_SIZE - _INP = _INP_END + INP_END_SIZE - STACK_SIZE = _INP + INP_SIZE -) - -func roundAndSchedN0(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) { - // ############################# RND N + 0 ############################// - MOVL(a, y3) // y3 = a - RORXL(Imm(25), e, y0) // y0 = e >> 25 - RORXL(Imm(11), e, y1) // y1 = e >> 11 - - ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h - ORL(c, y3) // y3 = a|c - VPALIGNR(Imm(4), XDWORD2, XDWORD3, XTMP0) // XTMP0 = W[-7] - MOVL(f, y2) // y2 = f - RORXL(Imm(13), a, T1) // T1 = a >> 13 - - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) - XORL(g, y2) // y2 = f^g - VPADDD(XDWORD0, XTMP0, XTMP0) // XTMP0 = W[-7] + W[-16] - RORXL(Imm(6), e, y1) // y1 = (e >> 6) - - ANDL(e, y2) // y2 = (f^g)&e - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) - RORXL(Imm(22), a, y1) // y1 = a >> 22 - ADDL(h, d) // d = k + w + h + d - - ANDL(b, y3) // y3 = (a|c)&b - VPALIGNR(Imm(4), XDWORD0, XDWORD1, XTMP1) // XTMP1 = W[-15] - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) - RORXL(Imm(2), a, T1) // T1 = (a >> 2) - - XORL(g, y2) // y2 = CH = ((f^g)&e)^g - VPSRLD(Imm(7), XTMP1, XTMP2) // - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) - MOVL(a, T1) // T1 = a - ANDL(c, T1) // T1 = a&c - - ADDL(y0, y2) // y2 = S1 + CH - VPSLLD(Imm(32-7), XTMP1, XTMP3) // - ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) - ADDL(y1, h) // h = k + w + h + S0 - - ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 - VPOR(XTMP2, XTMP3, XTMP3) // XTMP3 = W[-15] ror 7 - - VPSRLD(Imm(18), XTMP1, XTMP2) - ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0 - ADDL(y3, h) // h = t1 + S0 + MAJ -} - -func roundAndSchedN1(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) { - // ################################### RND N + 1 ############################ - MOVL(a, y3) // y3 = a - RORXL(Imm(25), e, y0) // y0 = e >> 25 - RORXL(Imm(11), e, y1) // y1 = e >> 11 - ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h - ORL(c, y3) // y3 = a|c - - VPSRLD(Imm(3), XTMP1, XTMP4) // XTMP4 = W[-15] >> 3 - MOVL(f, y2) // y2 = f - RORXL(Imm(13), a, T1) // T1 = a >> 13 - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) - XORL(g, y2) // y2 = f^g - - RORXL(Imm(6), e, y1) // y1 = (e >> 6) - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) - RORXL(Imm(22), a, y1) // y1 = a >> 22 - ANDL(e, y2) // y2 = (f^g)&e - ADDL(h, d) // d = k + w + h + d - - VPSLLD(Imm(32-18), XTMP1, XTMP1) - ANDL(b, y3) // y3 = (a|c)&b - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) - - VPXOR(XTMP1, XTMP3, XTMP3) - RORXL(Imm(2), a, T1) // T1 = (a >> 2) - XORL(g, y2) // y2 = CH = ((f^g)&e)^g - - VPXOR(XTMP2, XTMP3, XTMP3) // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) - MOVL(a, T1) // T1 = a - ANDL(c, T1) // T1 = a&c - ADDL(y0, y2) // y2 = S1 + CH - - VPXOR(XTMP4, XTMP3, XTMP1) // XTMP1 = s0 - VPSHUFD(Imm(0xFA), XDWORD3, XTMP2) // XTMP2 = W[-2] {BBAA} - ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) - ADDL(y1, h) // h = k + w + h + S0 - - VPADDD(XTMP1, XTMP0, XTMP0) // XTMP0 = W[-16] + W[-7] + s0 - ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 - ADDL(y2, h) // h = k + w + h 
+ S0 + S1 + CH = t1 + S0 - ADDL(y3, h) // h = t1 + S0 + MAJ - - VPSRLD(Imm(10), XTMP2, XTMP4) // XTMP4 = W[-2] >> 10 {BBAA} -} - -func roundAndSchedN2(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) { - // ################################### RND N + 2 ############################ - var shuff_00BA Mem = shuff_00BA_DATA() - - MOVL(a, y3) // y3 = a - RORXL(Imm(25), e, y0) // y0 = e >> 25 - ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h - - VPSRLQ(Imm(19), XTMP2, XTMP3) // XTMP3 = W[-2] ror 19 {xBxA} - RORXL(Imm(11), e, y1) // y1 = e >> 11 - ORL(c, y3) // y3 = a|c - MOVL(f, y2) // y2 = f - XORL(g, y2) // y2 = f^g - - RORXL(Imm(13), a, T1) // T1 = a >> 13 - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) - VPSRLQ(Imm(17), XTMP2, XTMP2) // XTMP2 = W[-2] ror 17 {xBxA} - ANDL(e, y2) // y2 = (f^g)&e - - RORXL(Imm(6), e, y1) // y1 = (e >> 6) - VPXOR(XTMP3, XTMP2, XTMP2) - ADDL(h, d) // d = k + w + h + d - ANDL(b, y3) // y3 = (a|c)&b - - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) - RORXL(Imm(22), a, y1) // y1 = a >> 22 - VPXOR(XTMP2, XTMP4, XTMP4) // XTMP4 = s1 {xBxA} - XORL(g, y2) // y2 = CH = ((f^g)&e)^g - - VPSHUFB(shuff_00BA, XTMP4, XTMP4) // XTMP4 = s1 {00BA} - - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) - RORXL(Imm(2), a, T1) // T1 = (a >> 2) - VPADDD(XTMP4, XTMP0, XTMP0) // XTMP0 = {..., ..., W[1], W[0]} - - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) - MOVL(a, T1) // T1 = a - ANDL(c, T1) // T1 = a&c - ADDL(y0, y2) // y2 = S1 + CH - VPSHUFD(Imm(80), XTMP0, XTMP2) // XTMP2 = W[-2] {DDCC} - - ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) - ADDL(y1, h) // h = k + w + h + S0 - ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 - ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0 - - ADDL(y3, h) // h = t1 + S0 + MAJ -} - -func roundAndSchedN3(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) { - // ################################### RND N + 3 ############################ - var shuff_DC00 Mem = shuff_DC00_DATA() - - MOVL(a, y3) // y3 = a - RORXL(Imm(25), e, y0) // y0 = e >> 25 - RORXL(Imm(11), e, y1) // y1 = e >> 11 - ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h - ORL(c, y3) // y3 = a|c - - VPSRLD(Imm(10), XTMP2, XTMP5) // XTMP5 = W[-2] >> 10 {DDCC} - MOVL(f, y2) // y2 = f - RORXL(Imm(13), a, T1) // T1 = a >> 13 - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) - XORL(g, y2) // y2 = f^g +func blockAMD64() { + Implement("blockAMD64") + AllocLocal(256 + 8) - VPSRLQ(Imm(19), XTMP2, XTMP3) // XTMP3 = W[-2] ror 19 {xDxC} - RORXL(Imm(6), e, y1) // y1 = (e >> 6) - ANDL(e, y2) // y2 = (f^g)&e - ADDL(h, d) // d = k + w + h + d - ANDL(b, y3) // y3 = (a|c)&b - - VPSRLQ(Imm(17), XTMP2, XTMP2) // XTMP2 = W[-2] ror 17 {xDxC} - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) - XORL(g, y2) // y2 = CH = ((f^g)&e)^g - - VPXOR(XTMP3, XTMP2, XTMP2) - RORXL(Imm(22), a, y1) // y1 = a >> 22 - ADDL(y0, y2) // y2 = S1 + CH - - VPXOR(XTMP2, XTMP5, XTMP5) // XTMP5 = s1 {xDxC} - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) - ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 - - RORXL(Imm(2), a, T1) // T1 = (a >> 2) - - VPSHUFB(shuff_DC00, XTMP5, XTMP5) // XTMP5 = s1 {DC00} - - VPADDD(XTMP0, XTMP5, XDWORD0) // XDWORD0 = {W[3], W[2], W[1], W[0]} - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) - MOVL(a, T1) // T1 = a - ANDL(c, T1) // T1 = a&c - ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) - - ADDL(y1, h) // h = k + w + h + S0 - ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0 - ADDL(y3, 
h) // h = t1 + S0 + MAJ -} - -func doRoundN0(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) { - // ################################### RND N + 0 ########################### - MOVL(f, y2) // y2 = f - RORXL(Imm(25), e, y0) // y0 = e >> 25 - RORXL(Imm(11), e, y1) // y1 = e >> 11 - XORL(g, y2) // y2 = f^g - - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) - RORXL(Imm(6), e, y1) // y1 = (e >> 6) - ANDL(e, y2) // y2 = (f^g)&e - - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) - RORXL(Imm(13), a, T1) // T1 = a >> 13 - XORL(g, y2) // y2 = CH = ((f^g)&e)^g - RORXL(Imm(22), a, y1) // y1 = a >> 22 - MOVL(a, y3) // y3 = a - - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) - RORXL(Imm(2), a, T1) // T1 = (a >> 2) - ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h - ORL(c, y3) // y3 = a|c - - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) - MOVL(a, T1) // T1 = a - ANDL(b, y3) // y3 = (a|c)&b - ANDL(c, T1) // T1 = a&c - ADDL(y0, y2) // y2 = S1 + CH - - ADDL(h, d) // d = k + w + h + d - ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) - ADDL(y1, h) // h = k + w + h + S0 - ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 -} - -func doRoundN1(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) { - // ################################### RND N + 1 ########################### - ADDL(y2, old_h) // h = k + w + h + S0 + S1 + CH = t1 + S0 - MOVL(f, y2) // y2 = f - RORXL(Imm(25), e, y0) // y0 = e >> 25 - RORXL(Imm(11), e, y1) // y1 = e >> 11 - XORL(g, y2) // y2 = f^g - - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) - RORXL(Imm(6), e, y1) // y1 = (e >> 6) - ANDL(e, y2) // y2 = (f^g)&e - ADDL(y3, old_h) // h = t1 + S0 + MAJ - - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) - RORXL(Imm(13), a, T1) // T1 = a >> 13 - XORL(g, y2) // y2 = CH = ((f^g)&e)^g - RORXL(Imm(22), a, y1) // y1 = a >> 22 - MOVL(a, y3) // y3 = a - - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) - RORXL(Imm(2), a, T1) // T1 = (a >> 2) - ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h - ORL(c, y3) // y3 = a|c - - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) - MOVL(a, T1) // T1 = a - ANDL(b, y3) // y3 = (a|c)&b - ANDL(c, T1) // T1 = a&c - ADDL(y0, y2) // y2 = S1 + CH - - ADDL(h, d) // d = k + w + h + d - ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) - ADDL(y1, h) // h = k + w + h + S0 - - ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 -} - -func doRoundN2(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) { - // ################################### RND N + 2 ############################## - ADDL(y2, old_h) // h = k + w + h + S0 + S1 + CH = t1 + S0 - MOVL(f, y2) // y2 = f - RORXL(Imm(25), e, y0) // y0 = e >> 25 - RORXL(Imm(11), e, y1) // y1 = e >> 11 - XORL(g, y2) // y2 = f^g - - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) - RORXL(Imm(6), e, y1) // y1 = (e >> 6) - ANDL(e, y2) // y2 = (f^g)&e - ADDL(y3, old_h) // h = t1 + S0 + MAJ - - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) - RORXL(Imm(13), a, T1) // T1 = a >> 13 - XORL(g, y2) // y2 = CH = ((f^g)&e)^g - RORXL(Imm(22), a, y1) // y1 = a >> 22 - MOVL(a, y3) // y3 = a - - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) - RORXL(Imm(2), a, T1) // T1 = (a >> 2) - ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h - ORL(c, y3) // y3 = a|c - - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) - MOVL(a, T1) // T1 = a - ANDL(b, y3) // y3 = (a|c)&b - ANDL(c, T1) // T1 = a&c - ADDL(y0, y2) // y2 = S1 + CH - - ADDL(h, d) // d = k + w + h + d - ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) - ADDL(y1, h) // h = k + w + h + S0 - - ADDL(y2, d) // d = k + w + h + d 
+ S1 + CH = d + t1 -} - -func doRoundN3(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) { - // ################################### RND N + 3 ########################### - ADDL(y2, old_h) // h = k + w + h + S0 + S1 + CH = t1 + S0 - MOVL(f, y2) // y2 = f - RORXL(Imm(25), e, y0) // y0 = e >> 25 - RORXL(Imm(11), e, y1) // y1 = e >> 11 - XORL(g, y2) // y2 = f^g - - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) - RORXL(Imm(6), e, y1) // y1 = (e >> 6) - ANDL(e, y2) // y2 = (f^g)&e - ADDL(y3, old_h) // h = t1 + S0 + MAJ - - XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) - RORXL(Imm(13), a, T1) // T1 = a >> 13 - XORL(g, y2) // y2 = CH = ((f^g)&e)^g - RORXL(Imm(22), a, y1) // y1 = a >> 22 - MOVL(a, y3) // y3 = a - - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) - RORXL(Imm(2), a, T1) // T1 = (a >> 2) - ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h - ORL(c, y3) // y3 = a|c - - XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) - MOVL(a, T1) // T1 = a - ANDL(b, y3) // y3 = (a|c)&b - ANDL(c, T1) // T1 = a&c - ADDL(y0, y2) // y2 = S1 + CH - - ADDL(h, d) // d = k + w + h + d - ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) - ADDL(y1, h) // h = k + w + h + S0 - - ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 - - ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0 - - ADDL(y3, h) // h = t1 + S0 + MAJ -} - -// Definitions for sha-ni version -// -// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2 -// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version -// -// Reference -// S. Gulley, et al, "New Instructions Supporting the Secure Hash -// Algorithm on Intel® Architecture Processors", July 2013 -// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html -// - -var ( - digestPtr GPPhysical = RDI // input/output, base pointer to digest hash vector H0, H1, ..., H7 - dataPtr = RSI // input, base pointer to first input data block - numBytes = RDX // input, number of input bytes to be processed - sha256Constants = RAX // round contents from K256 table, indexed by round number x 32 - msg VecPhysical = X0 // input data - state0 = X1 // round intermediates and outputs - state1 = X2 - m0 = X3 // m0, m1,... 
m4 -- round message temps - m1 = X4 - m2 = X5 - m3 = X6 - m4 = X7 - shufMask = X8 // input data endian conversion control mask - abefSave = X9 // digest hash vector inter-block buffer abef - cdghSave = X10 // digest hash vector inter-block buffer cdgh -) - -// nop instead of final SHA256MSG1 for first and last few rounds -func nop(m, a VecPhysical) { -} - -// final SHA256MSG1 for middle rounds that require it -func sha256msg1(m, a VecPhysical) { - SHA256MSG1(m, a) -} - -// msg copy for all but rounds 12-15 -func vmov(a, b VecPhysical) { - VMOVDQA(a, b) -} - -// reverse copy for rounds 12-15 -func vmovrev(a, b VecPhysical) { - VMOVDQA(b, a) -} - -type VecFunc func(a, b VecPhysical) - -// sha rounds 0 to 11 -// -// identical with the exception of the final msg op -// which is replaced with a nop for rounds where it is not needed -// refer to Gulley, et al for more information -func rounds0to11(m, a VecPhysical, c int, sha256msg1 VecFunc) { - VMOVDQU(Mem{Base: dataPtr}.Offset(c*16), msg) - PSHUFB(shufMask, msg) - VMOVDQA(msg, m) - PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg) - SHA256RNDS2(msg, state0, state1) - PSHUFD(U8(0x0e), msg, msg) - SHA256RNDS2(msg, state1, state0) - sha256msg1(m, a) -} - -// sha rounds 12 to 59 -// -// identical with the exception of the final msg op -// and the reverse copy(m,msg) in round 12 which is required -// after the last data load -// refer to Gulley, et al for more information -func rounds12to59(m VecPhysical, c int, a, t VecPhysical, sha256msg1, movop VecFunc) { - movop(m, msg) - PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg) - SHA256RNDS2(msg, state0, state1) - VMOVDQA(m, m4) - PALIGNR(Imm(4), a, m4) - PADDD(m4, t) - SHA256MSG2(m, t) - PSHUFD(Imm(0x0e), msg, msg) - SHA256RNDS2(msg, state1, state0) - sha256msg1(m, a) -} - -func block() { - Implement("block") - AllocLocal(536) - - checkArchFlags() - sha256() - avx2() - sha_ni() -} - -func checkArchFlags() { - CMPB(Mem{Symbol: Symbol{Name: "·useSHA"}, Base: StaticBase}, Imm(1)) - JE(LabelRef("sha_ni")) - CMPB(Mem{Symbol: Symbol{Name: "·useAVX2"}, Base: StaticBase}, Imm(1)) - JE(LabelRef("avx2")) -} - -func sha256() { Load(Param("p").Base(), RSI) Load(Param("p").Len(), RDX) SHRQ(Imm(6), RDX) @@ -770,356 +252,6 @@ func end() { RET() } -func avx2() { - Label("avx2") - Load(Param("dig"), CTX) // d.h[8] - Load(Param("p").Base(), INP) - Load(Param("p").Len(), NUM_BYTES) - - LEAQ(Mem{Base: INP, Index: NUM_BYTES, Scale: 1, Disp: -64}, NUM_BYTES) // Pointer to the last block - MOVQ(NUM_BYTES, Mem{Base: SP}.Offset(_INP_END)) - - CMPQ(NUM_BYTES, INP) - JE(LabelRef("avx2_only_one_block")) - - Comment("Load initial digest") - CTX := Mem{Base: CTX} - MOVL(CTX.Offset(0), a) // a = H0 - MOVL(CTX.Offset(4), b) // b = H1 - MOVL(CTX.Offset(8), c) // c = H2 - MOVL(CTX.Offset(12), d) // d = H3 - MOVL(CTX.Offset(16), e) // e = H4 - MOVL(CTX.Offset(20), f) // f = H5 - MOVL(CTX.Offset(24), g) // g = H6 - MOVL(CTX.Offset(28), h) // h = H7 - - avx2_loop0() - avx2_last_block_enter() - avx2_loop1() - avx2_loop2() - avx2_loop3() - avx2_do_last_block() - avx2_only_one_block() - done_hash() -} - -func avx2_loop0() { - Label("avx2_loop0") - Comment("at each iteration works with one block (512 bit)") - VMOVDQU(Mem{Base: INP}.Offset(0*32), XTMP0) - VMOVDQU(Mem{Base: INP}.Offset(1*32), XTMP1) - VMOVDQU(Mem{Base: INP}.Offset(2*32), XTMP2) - VMOVDQU(Mem{Base: INP}.Offset(3*32), XTMP3) - - flip_mask := flip_mask_DATA() - - VMOVDQU(flip_mask, BYTE_FLIP_MASK) - - Comment("Apply Byte Flip Mask: LE -> BE") - VPSHUFB(BYTE_FLIP_MASK, XTMP0, 
XTMP0) - VPSHUFB(BYTE_FLIP_MASK, XTMP1, XTMP1) - VPSHUFB(BYTE_FLIP_MASK, XTMP2, XTMP2) - VPSHUFB(BYTE_FLIP_MASK, XTMP3, XTMP3) - - Comment("Transpose data into high/low parts") - VPERM2I128(Imm(0x20), XTMP2, XTMP0, XDWORD0) // w3, w2, w1, w0 - VPERM2I128(Imm(0x31), XTMP2, XTMP0, XDWORD1) // w7, w6, w5, w4 - VPERM2I128(Imm(0x20), XTMP3, XTMP1, XDWORD2) // w11, w10, w9, w8 - VPERM2I128(Imm(0x31), XTMP3, XTMP1, XDWORD3) // w15, w14, w13, w12 - - K256 := K256_DATA() - LEAQ(K256, TBL) // Loading address of table with round-specific constants -} - -func avx2_last_block_enter() { - Label("avx2_last_block_enter") - ADDQ(Imm(64), INP) - MOVQ(INP, Mem{Base: SP}.Offset(_INP)) - XORQ(SRND, SRND) -} - -// for w0 - w47 -func avx2_loop1() { - Label("avx2_loop1") - - Comment("Do 4 rounds and scheduling") - VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((0 * 32)), XDWORD0, XFER) - VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32)) - roundAndSchedN0(_XFER+0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - roundAndSchedN1(_XFER+0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - roundAndSchedN2(_XFER+0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - roundAndSchedN3(_XFER+0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3) - - Comment("Do 4 rounds and scheduling") - VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER) - VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32)) - roundAndSchedN0(_XFER+1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - roundAndSchedN1(_XFER+1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - roundAndSchedN2(_XFER+1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - roundAndSchedN3(_XFER+1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0) - - Comment("Do 4 rounds and scheduling") - VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((2 * 32)), XDWORD2, XFER) - VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+2*32)) - roundAndSchedN0(_XFER+2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - roundAndSchedN1(_XFER+2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - roundAndSchedN2(_XFER+2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - roundAndSchedN3(_XFER+2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1) - - Comment("Do 4 rounds and scheduling") - VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((3 * 32)), XDWORD3, XFER) - VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+3*32)) - roundAndSchedN0(_XFER+3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - roundAndSchedN1(_XFER+3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - roundAndSchedN2(_XFER+3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - roundAndSchedN3(_XFER+3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2) - - ADDQ(Imm(4*32), SRND) - CMPQ(SRND, U32(3*4*32)) - JB(LabelRef("avx2_loop1")) -} - -// w48 - w63 processed with no scheduling (last 16 rounds) -func avx2_loop2() { - Label("avx2_loop2") - VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(0*32), XDWORD0, XFER) - VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32)) - doRoundN0(_XFER+0*32, a, b, c, d, e, f, g, h, h) - doRoundN1(_XFER+0*32, h, a, b, c, d, e, f, g, h) - doRoundN2(_XFER+0*32, g, h, a, b, c, d, e, f, g) - doRoundN3(_XFER+0*32, f, g, h, a, b, c, d, e, f) - - VPADDD(Mem{Base: TBL, Scale: 1, 
Index: SRND}.Offset(1*32), XDWORD1, XFER) - VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32)) - doRoundN0(_XFER+1*32, e, f, g, h, a, b, c, d, e) - doRoundN1(_XFER+1*32, d, e, f, g, h, a, b, c, d) - doRoundN2(_XFER+1*32, c, d, e, f, g, h, a, b, c) - doRoundN3(_XFER+1*32, b, c, d, e, f, g, h, a, b) - - ADDQ(Imm(2*32), SRND) - - VMOVDQU(XDWORD2, XDWORD0) - VMOVDQU(XDWORD3, XDWORD1) - - CMPQ(SRND, U32(4*4*32)) - JB(LabelRef("avx2_loop2")) - - Load(Param("dig"), CTX) // d.h[8] - MOVQ(Mem{Base: SP}.Offset(_INP), INP) - - registers := []GPPhysical{a, b, c, d, e, f, g, h} - for i, reg := range registers { - addm(Mem{Base: CTX}.Offset(i*4), reg) - } - - CMPQ(Mem{Base: SP}.Offset(_INP_END), INP) - JB(LabelRef("done_hash")) - - XORQ(SRND, SRND) -} - -// Do second block using previously scheduled results -func avx2_loop3() { - Label("avx2_loop3") - doRoundN0(_XFER+0*32+16, a, b, c, d, e, f, g, h, a) - doRoundN1(_XFER+0*32+16, h, a, b, c, d, e, f, g, h) - doRoundN2(_XFER+0*32+16, g, h, a, b, c, d, e, f, g) - doRoundN3(_XFER+0*32+16, f, g, h, a, b, c, d, e, f) - - doRoundN0(_XFER+1*32+16, e, f, g, h, a, b, c, d, e) - doRoundN1(_XFER+1*32+16, d, e, f, g, h, a, b, c, d) - doRoundN2(_XFER+1*32+16, c, d, e, f, g, h, a, b, c) - doRoundN3(_XFER+1*32+16, b, c, d, e, f, g, h, a, b) - - ADDQ(Imm(2*32), SRND) - CMPQ(SRND, U32(4*4*32)) - JB(LabelRef("avx2_loop3")) - - Load(Param("dig"), CTX) // d.h[8] - MOVQ(Mem{Base: SP}.Offset(_INP), INP) - ADDQ(Imm(64), INP) - - registers := []GPPhysical{a, b, c, d, e, f, g, h} - for i, reg := range registers { - addm(Mem{Base: CTX}.Offset(i*4), reg) - } - - CMPQ(Mem{Base: SP}.Offset(_INP_END), INP) - JA(LabelRef("avx2_loop0")) - JB(LabelRef("done_hash")) -} - -func avx2_do_last_block() { - Label("avx2_do_last_block") - VMOVDQU(Mem{Base: INP}.Offset(0), XWORD0) - VMOVDQU(Mem{Base: INP}.Offset(16), XWORD1) - VMOVDQU(Mem{Base: INP}.Offset(32), XWORD2) - VMOVDQU(Mem{Base: INP}.Offset(48), XWORD3) - - flip_mask := flip_mask_DATA() - VMOVDQU(flip_mask, BYTE_FLIP_MASK) - - VPSHUFB(X_BYTE_FLIP_MASK, XWORD0, XWORD0) - VPSHUFB(X_BYTE_FLIP_MASK, XWORD1, XWORD1) - VPSHUFB(X_BYTE_FLIP_MASK, XWORD2, XWORD2) - VPSHUFB(X_BYTE_FLIP_MASK, XWORD3, XWORD3) - - K256 := K256_DATA() - LEAQ(K256, TBL) - - JMP(LabelRef("avx2_last_block_enter")) -} - -// Load initial digest -func avx2_only_one_block() { - Label("avx2_only_one_block") - registers := []GPPhysical{a, b, c, d, e, f, g, h} - for i, reg := range registers { - MOVL(Mem{Base: CTX}.Offset(i*4), reg) - } - JMP(LabelRef("avx2_do_last_block")) -} - -func done_hash() { - Label("done_hash") - VZEROUPPER() - RET() -} - -func sha_ni() { - Label("sha_ni") - Load(Param("dig"), digestPtr) // init digest hash vector H0, H1,..., H7 pointer - Load(Param("p").Base(), dataPtr) // init input data base pointer - Load(Param("p").Len(), numBytes) // get number of input bytes to hash - SHRQ(Imm(6), numBytes) // force modulo 64 input buffer length - SHLQ(Imm(6), numBytes) - CMPQ(numBytes, Imm(0)) // exit early for zero-length input buffer - JEQ(LabelRef("done")) - ADDQ(dataPtr, numBytes) // point numBytes to end of input buffer - VMOVDQU(Mem{Base: digestPtr}.Offset(0*16), state0) // load initial hash values and reorder - VMOVDQU(Mem{Base: digestPtr}.Offset(1*16), state1) // DCBA, HGFE -> ABEF, CDGH - PSHUFD(Imm(0xb1), state0, state0) // CDAB - PSHUFD(Imm(0x1b), state1, state1) // EFGH - VMOVDQA(state0, m4) - PALIGNR(Imm(8), state1, state0) // ABEF - PBLENDW(Imm(0xf0), m4, state1) // CDGH - flip_mask := flip_mask_DATA() - VMOVDQA(flip_mask, 
shufMask) - LEAQ(K256_DATA(), sha256Constants) - - roundLoop() - done() -} - -func roundLoop() { - Label("roundLoop") - Comment("save hash values for addition after rounds") - VMOVDQA(state0, abefSave) - VMOVDQA(state1, cdghSave) - - Comment("do rounds 0-59") - rounds0to11(m0, nil, 0, nop) // 0-3 - rounds0to11(m1, m0, 1, sha256msg1) // 4-7 - rounds0to11(m2, m1, 2, sha256msg1) // 8-11 - VMOVDQU(Mem{Base: dataPtr}.Offset(3*16), msg) - PSHUFB(shufMask, msg) - rounds12to59(m3, 3, m2, m0, sha256msg1, vmovrev) // 12-15 - rounds12to59(m0, 4, m3, m1, sha256msg1, vmov) // 16-19 - rounds12to59(m1, 5, m0, m2, sha256msg1, vmov) // 20-23 - rounds12to59(m2, 6, m1, m3, sha256msg1, vmov) // 24-27 - rounds12to59(m3, 7, m2, m0, sha256msg1, vmov) // 28-31 - rounds12to59(m0, 8, m3, m1, sha256msg1, vmov) // 32-35 - rounds12to59(m1, 9, m0, m2, sha256msg1, vmov) // 36-39 - rounds12to59(m2, 10, m1, m3, sha256msg1, vmov) // 40-43 - rounds12to59(m3, 11, m2, m0, sha256msg1, vmov) // 44-47 - rounds12to59(m0, 12, m3, m1, sha256msg1, vmov) // 48-51 - rounds12to59(m1, 13, m0, m2, nop, vmov) // 52-55 - rounds12to59(m2, 14, m1, m3, nop, vmov) // 56-59 - - Comment("do rounds 60-63") - VMOVDQA(m3, msg) - PADDD(Mem{Base: sha256Constants}.Offset(15*32), msg) - SHA256RNDS2(msg, state0, state1) - PSHUFD(Imm(0x0e), msg, msg) - SHA256RNDS2(msg, state1, state0) - - Comment("add current hash values with previously saved") - PADDD(abefSave, state0) - PADDD(cdghSave, state1) - - Comment("advance data pointer; loop until buffer empty") - ADDQ(Imm(64), dataPtr) - CMPQ(numBytes, dataPtr) - JNE(LabelRef("roundLoop")) - - Comment("write hash values back in the correct order") - PSHUFD(Imm(0x1b), state0, state0) - PSHUFD(Imm(0xb1), state1, state1) - VMOVDQA(state0, m4) - PBLENDW(Imm(0xf0), state1, state0) - PALIGNR(Imm(8), m4, state1) - VMOVDQU(state0, Mem{Base: digestPtr}.Offset(0*16)) - VMOVDQU(state1, Mem{Base: digestPtr}.Offset(1*16)) -} - -func done() { - Label("done") - RET() -} - -/**~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~**/ - -// Pointers for memoizing Data section symbols -var flip_maskPtr, shuff_00BAPtr, shuff_DC00Ptr, K256Ptr *Mem - -// shuffle byte order from LE to BE -func flip_mask_DATA() Mem { - if flip_maskPtr != nil { - return *flip_maskPtr - } - - flip_mask := GLOBL("flip_mask", RODATA) - flip_maskPtr = &flip_mask - - DATA(0x00, U64(0x0405060700010203)) - DATA(0x08, U64(0x0c0d0e0f08090a0b)) - DATA(0x10, U64(0x0405060700010203)) - DATA(0x18, U64(0x0c0d0e0f08090a0b)) - return flip_mask -} - -// shuffle xBxA -> 00BA -func shuff_00BA_DATA() Mem { - if shuff_00BAPtr != nil { - return *shuff_00BAPtr - } - - shuff_00BA := GLOBL("shuff_00BA", RODATA) - shuff_00BAPtr = &shuff_00BA - - DATA(0x00, U64(0x0b0a090803020100)) - DATA(0x08, U64(0xFFFFFFFFFFFFFFFF)) - DATA(0x10, U64(0x0b0a090803020100)) - DATA(0x18, U64(0xFFFFFFFFFFFFFFFF)) - return shuff_00BA -} - -// shuffle xDxC -> DC00 -func shuff_DC00_DATA() Mem { - if shuff_DC00Ptr != nil { - return *shuff_DC00Ptr - } - - shuff_DC00 := GLOBL("shuff_DC00", RODATA) - shuff_DC00Ptr = &shuff_DC00 - - DATA(0x00, U64(0xFFFFFFFFFFFFFFFF)) - DATA(0x08, U64(0x0b0a090803020100)) - DATA(0x10, U64(0xFFFFFFFFFFFFFFFF)) - DATA(0x18, U64(0x0b0a090803020100)) - return shuff_DC00 -} - var _K = []uint32{ 0x428a2f98, 0x71374491, @@ -1186,29 +318,3 @@ var _K = []uint32{ 0xbef9a3f7, 0xc67178f2, } - -// Round specific constants -func K256_DATA() Mem { - if K256Ptr != nil { - return *K256Ptr - } - - K256 := GLOBL("K256", NOPTR+RODATA) - 
K256Ptr = &K256 - - offset_idx := 0 - - for i := 0; i < len(_K); i += 4 { - DATA((offset_idx+0)*4, U32(_K[i+0])) // k1 - DATA((offset_idx+1)*4, U32(_K[i+1])) // k2 - DATA((offset_idx+2)*4, U32(_K[i+2])) // k3 - DATA((offset_idx+3)*4, U32(_K[i+3])) // k4 - - DATA((offset_idx+4)*4, U32(_K[i+0])) // k1 - DATA((offset_idx+5)*4, U32(_K[i+1])) // k2 - DATA((offset_idx+6)*4, U32(_K[i+2])) // k3 - DATA((offset_idx+7)*4, U32(_K[i+3])) // k4 - offset_idx += 8 - } - return K256 -} diff --git a/src/crypto/sha256/_asm/sha256block_amd64_avx2.go b/src/crypto/sha256/_asm/sha256block_amd64_avx2.go new file mode 100644 index 0000000000..0e6f1c74cf --- /dev/null +++ b/src/crypto/sha256/_asm/sha256block_amd64_avx2.go @@ -0,0 +1,725 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" +) + +// The avx2-version is described in an Intel White-Paper: +// "Fast SHA-256 Implementations on Intel Architecture Processors" +// To find it, surf to http://www.intel.com/p/en_US/embedded +// and search for that title. +// AVX2 version by Intel, same algorithm as code in Linux kernel: +// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S +// by +// James Guilford +// Kirk Yap +// Tim Chen + +func blockAVX2() { + Implement("blockAVX2") + AllocLocal(536) + + Load(Param("dig"), CTX) // d.h[8] + Load(Param("p").Base(), INP) + Load(Param("p").Len(), NUM_BYTES) + + LEAQ(Mem{Base: INP, Index: NUM_BYTES, Scale: 1, Disp: -64}, NUM_BYTES) // Pointer to the last block + MOVQ(NUM_BYTES, Mem{Base: SP}.Offset(_INP_END)) + + CMPQ(NUM_BYTES, INP) + JE(LabelRef("avx2_only_one_block")) + + Comment("Load initial digest") + CTX := Mem{Base: CTX} + MOVL(CTX.Offset(0), a) // a = H0 + MOVL(CTX.Offset(4), b) // b = H1 + MOVL(CTX.Offset(8), c) // c = H2 + MOVL(CTX.Offset(12), d) // d = H3 + MOVL(CTX.Offset(16), e) // e = H4 + MOVL(CTX.Offset(20), f) // f = H5 + MOVL(CTX.Offset(24), g) // g = H6 + MOVL(CTX.Offset(28), h) // h = H7 + + avx2_loop0() + avx2_last_block_enter() + avx2_loop1() + avx2_loop2() + avx2_loop3() + avx2_do_last_block() + avx2_only_one_block() + done_hash() +} + +func avx2_loop0() { + Label("avx2_loop0") + Comment("at each iteration works with one block (512 bit)") + VMOVDQU(Mem{Base: INP}.Offset(0*32), XTMP0) + VMOVDQU(Mem{Base: INP}.Offset(1*32), XTMP1) + VMOVDQU(Mem{Base: INP}.Offset(2*32), XTMP2) + VMOVDQU(Mem{Base: INP}.Offset(3*32), XTMP3) + + flip_mask := flip_mask_DATA() + + VMOVDQU(flip_mask, BYTE_FLIP_MASK) + + Comment("Apply Byte Flip Mask: LE -> BE") + VPSHUFB(BYTE_FLIP_MASK, XTMP0, XTMP0) + VPSHUFB(BYTE_FLIP_MASK, XTMP1, XTMP1) + VPSHUFB(BYTE_FLIP_MASK, XTMP2, XTMP2) + VPSHUFB(BYTE_FLIP_MASK, XTMP3, XTMP3) + + Comment("Transpose data into high/low parts") + VPERM2I128(Imm(0x20), XTMP2, XTMP0, XDWORD0) // w3, w2, w1, w0 + VPERM2I128(Imm(0x31), XTMP2, XTMP0, XDWORD1) // w7, w6, w5, w4 + VPERM2I128(Imm(0x20), XTMP3, XTMP1, XDWORD2) // w11, w10, w9, w8 + VPERM2I128(Imm(0x31), XTMP3, XTMP1, XDWORD3) // w15, w14, w13, w12 + + K256 := K256_DATA() + LEAQ(K256, TBL) // Loading address of table with round-specific constants +} + +func avx2_last_block_enter() { + Label("avx2_last_block_enter") + ADDQ(Imm(64), INP) + MOVQ(INP, Mem{Base: SP}.Offset(_INP)) + XORQ(SRND, SRND) +} + +// for w0 - w47 +func avx2_loop1() { + Label("avx2_loop1") + + 
Comment("Do 4 rounds and scheduling") + VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((0 * 32)), XDWORD0, XFER) + VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32)) + roundAndSchedN0(_XFER+0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + roundAndSchedN1(_XFER+0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + roundAndSchedN2(_XFER+0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + roundAndSchedN3(_XFER+0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3) + + Comment("Do 4 rounds and scheduling") + VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER) + VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32)) + roundAndSchedN0(_XFER+1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + roundAndSchedN1(_XFER+1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + roundAndSchedN2(_XFER+1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + roundAndSchedN3(_XFER+1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0) + + Comment("Do 4 rounds and scheduling") + VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((2 * 32)), XDWORD2, XFER) + VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+2*32)) + roundAndSchedN0(_XFER+2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + roundAndSchedN1(_XFER+2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + roundAndSchedN2(_XFER+2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + roundAndSchedN3(_XFER+2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1) + + Comment("Do 4 rounds and scheduling") + VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((3 * 32)), XDWORD3, XFER) + VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+3*32)) + roundAndSchedN0(_XFER+3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + roundAndSchedN1(_XFER+3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + roundAndSchedN2(_XFER+3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + roundAndSchedN3(_XFER+3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2) + + ADDQ(Imm(4*32), SRND) + CMPQ(SRND, U32(3*4*32)) + JB(LabelRef("avx2_loop1")) +} + +// w48 - w63 processed with no scheduling (last 16 rounds) +func avx2_loop2() { + Label("avx2_loop2") + VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(0*32), XDWORD0, XFER) + VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32)) + doRoundN0(_XFER+0*32, a, b, c, d, e, f, g, h, h) + doRoundN1(_XFER+0*32, h, a, b, c, d, e, f, g, h) + doRoundN2(_XFER+0*32, g, h, a, b, c, d, e, f, g) + doRoundN3(_XFER+0*32, f, g, h, a, b, c, d, e, f) + + VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER) + VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32)) + doRoundN0(_XFER+1*32, e, f, g, h, a, b, c, d, e) + doRoundN1(_XFER+1*32, d, e, f, g, h, a, b, c, d) + doRoundN2(_XFER+1*32, c, d, e, f, g, h, a, b, c) + doRoundN3(_XFER+1*32, b, c, d, e, f, g, h, a, b) + + ADDQ(Imm(2*32), SRND) + + VMOVDQU(XDWORD2, XDWORD0) + VMOVDQU(XDWORD3, XDWORD1) + + CMPQ(SRND, U32(4*4*32)) + JB(LabelRef("avx2_loop2")) + + Load(Param("dig"), CTX) // d.h[8] + MOVQ(Mem{Base: SP}.Offset(_INP), INP) + + registers := []GPPhysical{a, b, c, d, e, f, g, h} + for i, reg := range registers { + addm(Mem{Base: CTX}.Offset(i*4), reg) + } + + CMPQ(Mem{Base: SP}.Offset(_INP_END), INP) + JB(LabelRef("done_hash")) + + XORQ(SRND, SRND) 
+} + +// Do second block using previously scheduled results +func avx2_loop3() { + Label("avx2_loop3") + doRoundN0(_XFER+0*32+16, a, b, c, d, e, f, g, h, a) + doRoundN1(_XFER+0*32+16, h, a, b, c, d, e, f, g, h) + doRoundN2(_XFER+0*32+16, g, h, a, b, c, d, e, f, g) + doRoundN3(_XFER+0*32+16, f, g, h, a, b, c, d, e, f) + + doRoundN0(_XFER+1*32+16, e, f, g, h, a, b, c, d, e) + doRoundN1(_XFER+1*32+16, d, e, f, g, h, a, b, c, d) + doRoundN2(_XFER+1*32+16, c, d, e, f, g, h, a, b, c) + doRoundN3(_XFER+1*32+16, b, c, d, e, f, g, h, a, b) + + ADDQ(Imm(2*32), SRND) + CMPQ(SRND, U32(4*4*32)) + JB(LabelRef("avx2_loop3")) + + Load(Param("dig"), CTX) // d.h[8] + MOVQ(Mem{Base: SP}.Offset(_INP), INP) + ADDQ(Imm(64), INP) + + registers := []GPPhysical{a, b, c, d, e, f, g, h} + for i, reg := range registers { + addm(Mem{Base: CTX}.Offset(i*4), reg) + } + + CMPQ(Mem{Base: SP}.Offset(_INP_END), INP) + JA(LabelRef("avx2_loop0")) + JB(LabelRef("done_hash")) +} + +func avx2_do_last_block() { + Label("avx2_do_last_block") + VMOVDQU(Mem{Base: INP}.Offset(0), XWORD0) + VMOVDQU(Mem{Base: INP}.Offset(16), XWORD1) + VMOVDQU(Mem{Base: INP}.Offset(32), XWORD2) + VMOVDQU(Mem{Base: INP}.Offset(48), XWORD3) + + flip_mask := flip_mask_DATA() + VMOVDQU(flip_mask, BYTE_FLIP_MASK) + + VPSHUFB(X_BYTE_FLIP_MASK, XWORD0, XWORD0) + VPSHUFB(X_BYTE_FLIP_MASK, XWORD1, XWORD1) + VPSHUFB(X_BYTE_FLIP_MASK, XWORD2, XWORD2) + VPSHUFB(X_BYTE_FLIP_MASK, XWORD3, XWORD3) + + K256 := K256_DATA() + LEAQ(K256, TBL) + + JMP(LabelRef("avx2_last_block_enter")) +} + +// Load initial digest +func avx2_only_one_block() { + Label("avx2_only_one_block") + registers := []GPPhysical{a, b, c, d, e, f, g, h} + for i, reg := range registers { + MOVL(Mem{Base: CTX}.Offset(i*4), reg) + } + JMP(LabelRef("avx2_do_last_block")) +} + +func done_hash() { + Label("done_hash") + VZEROUPPER() + RET() +} + +// addm (mem), reg +// - Add reg to mem using reg-mem add and store +func addm(P1 Mem, P2 GPPhysical) { + ADDL(P2, P1) + MOVL(P1, P2) +} + +var ( + XDWORD0 VecPhysical = Y4 + XDWORD1 = Y5 + XDWORD2 = Y6 + XDWORD3 = Y7 + + XWORD0 = X4 + XWORD1 = X5 + XWORD2 = X6 + XWORD3 = X7 + + XTMP0 = Y0 + XTMP1 = Y1 + XTMP2 = Y2 + XTMP3 = Y3 + XTMP4 = Y8 + XTMP5 = Y11 + + XFER = Y9 + + BYTE_FLIP_MASK = Y13 // mask to convert LE -> BE + X_BYTE_FLIP_MASK = X13 + + NUM_BYTES GPPhysical = RDX + INP = RDI + + CTX = RSI // Beginning of digest in memory (a, b, c, ... 
, h) + + a = EAX + b = EBX + c = ECX + d = R8L + e = EDX + f = R9L + g = R10L + h = R11L + + old_h = R11L + + TBL = RBP + + SRND = RSI // SRND is same register as CTX + + T1 = R12L + + y0 = R13L + y1 = R14L + y2 = R15L + y3 = EDI + + // Offsets + XFER_SIZE = 2 * 64 * 4 + INP_END_SIZE = 8 + INP_SIZE = 8 + + _XFER = 0 + _INP_END = _XFER + XFER_SIZE + _INP = _INP_END + INP_END_SIZE + STACK_SIZE = _INP + INP_SIZE +) + +func roundAndSchedN0(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) { + // ############################# RND N + 0 ############################// + MOVL(a, y3) // y3 = a + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + + ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + VPALIGNR(Imm(4), XDWORD2, XDWORD3, XTMP0) // XTMP0 = W[-7] + MOVL(f, y2) // y2 = f + RORXL(Imm(13), a, T1) // T1 = a >> 13 + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + XORL(g, y2) // y2 = f^g + VPADDD(XDWORD0, XTMP0, XTMP0) // XTMP0 = W[-7] + W[-16] + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + + ANDL(e, y2) // y2 = (f^g)&e + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(22), a, y1) // y1 = a >> 22 + ADDL(h, d) // d = k + w + h + d + + ANDL(b, y3) // y3 = (a|c)&b + VPALIGNR(Imm(4), XDWORD0, XDWORD1, XTMP1) // XTMP1 = W[-15] + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + VPSRLD(Imm(7), XTMP1, XTMP2) // + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(c, T1) // T1 = a&c + + ADDL(y0, y2) // y2 = S1 + CH + VPSLLD(Imm(32-7), XTMP1, XTMP3) // + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 + VPOR(XTMP2, XTMP3, XTMP3) // XTMP3 = W[-15] ror 7 + + VPSRLD(Imm(18), XTMP1, XTMP2) + ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + ADDL(y3, h) // h = t1 + S0 + MAJ +} + +func roundAndSchedN1(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) { + // ################################### RND N + 1 ############################ + MOVL(a, y3) // y3 = a + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + + VPSRLD(Imm(3), XTMP1, XTMP4) // XTMP4 = W[-15] >> 3 + MOVL(f, y2) // y2 = f + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + XORL(g, y2) // y2 = f^g + + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(22), a, y1) // y1 = a >> 22 + ANDL(e, y2) // y2 = (f^g)&e + ADDL(h, d) // d = k + w + h + d + + VPSLLD(Imm(32-18), XTMP1, XTMP1) + ANDL(b, y3) // y3 = (a|c)&b + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + + VPXOR(XTMP1, XTMP3, XTMP3) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + + VPXOR(XTMP2, XTMP3, XTMP3) // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(c, T1) // T1 = a&c + ADDL(y0, y2) // y2 = S1 + CH + + VPXOR(XTMP4, XTMP3, XTMP1) // XTMP1 = s0 + VPSHUFD(Imm(0xFA), XDWORD3, XTMP2) // XTMP2 = W[-2] {BBAA} + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + + VPADDD(XTMP1, XTMP0, XTMP0) // XTMP0 = W[-16] + W[-7] + s0 + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 + ADDL(y2, h) // h = k + w + h 
+ S0 + S1 + CH = t1 + S0 + ADDL(y3, h) // h = t1 + S0 + MAJ + + VPSRLD(Imm(10), XTMP2, XTMP4) // XTMP4 = W[-2] >> 10 {BBAA} +} + +func roundAndSchedN2(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) { + // ################################### RND N + 2 ############################ + var shuff_00BA Mem = shuff_00BA_DATA() + + MOVL(a, y3) // y3 = a + RORXL(Imm(25), e, y0) // y0 = e >> 25 + ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h + + VPSRLQ(Imm(19), XTMP2, XTMP3) // XTMP3 = W[-2] ror 19 {xBxA} + RORXL(Imm(11), e, y1) // y1 = e >> 11 + ORL(c, y3) // y3 = a|c + MOVL(f, y2) // y2 = f + XORL(g, y2) // y2 = f^g + + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + VPSRLQ(Imm(17), XTMP2, XTMP2) // XTMP2 = W[-2] ror 17 {xBxA} + ANDL(e, y2) // y2 = (f^g)&e + + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + VPXOR(XTMP3, XTMP2, XTMP2) + ADDL(h, d) // d = k + w + h + d + ANDL(b, y3) // y3 = (a|c)&b + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(22), a, y1) // y1 = a >> 22 + VPXOR(XTMP2, XTMP4, XTMP4) // XTMP4 = s1 {xBxA} + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + + VPSHUFB(shuff_00BA, XTMP4, XTMP4) // XTMP4 = s1 {00BA} + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + VPADDD(XTMP4, XTMP0, XTMP0) // XTMP0 = {..., ..., W[1], W[0]} + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(c, T1) // T1 = a&c + ADDL(y0, y2) // y2 = S1 + CH + VPSHUFD(Imm(80), XTMP0, XTMP2) // XTMP2 = W[-2] {DDCC} + + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 + ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + + ADDL(y3, h) // h = t1 + S0 + MAJ +} + +func roundAndSchedN3(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) { + // ################################### RND N + 3 ############################ + var shuff_DC00 Mem = shuff_DC00_DATA() + + MOVL(a, y3) // y3 = a + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + + VPSRLD(Imm(10), XTMP2, XTMP5) // XTMP5 = W[-2] >> 10 {DDCC} + MOVL(f, y2) // y2 = f + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + XORL(g, y2) // y2 = f^g + + VPSRLQ(Imm(19), XTMP2, XTMP3) // XTMP3 = W[-2] ror 19 {xDxC} + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + ANDL(e, y2) // y2 = (f^g)&e + ADDL(h, d) // d = k + w + h + d + ANDL(b, y3) // y3 = (a|c)&b + + VPSRLQ(Imm(17), XTMP2, XTMP2) // XTMP2 = W[-2] ror 17 {xDxC} + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + + VPXOR(XTMP3, XTMP2, XTMP2) + RORXL(Imm(22), a, y1) // y1 = a >> 22 + ADDL(y0, y2) // y2 = S1 + CH + + VPXOR(XTMP2, XTMP5, XTMP5) // XTMP5 = s1 {xDxC} + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 + + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + + VPSHUFB(shuff_DC00, XTMP5, XTMP5) // XTMP5 = s1 {DC00} + + VPADDD(XTMP0, XTMP5, XDWORD0) // XDWORD0 = {W[3], W[2], W[1], W[0]} + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(c, T1) // T1 = a&c + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + + ADDL(y1, h) // h = k + w + h + S0 + ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + ADDL(y3, h) // h = t1 + S0 + MAJ +} + +func doRoundN0(disp int, a, b, c, d, 
e, f, g, h, old_h GPPhysical) { + // ################################### RND N + 0 ########################### + MOVL(f, y2) // y2 = f + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + XORL(g, y2) // y2 = f^g + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + ANDL(e, y2) // y2 = (f^g)&e + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + RORXL(Imm(22), a, y1) // y1 = a >> 22 + MOVL(a, y3) // y3 = a + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(b, y3) // y3 = (a|c)&b + ANDL(c, T1) // T1 = a&c + ADDL(y0, y2) // y2 = S1 + CH + + ADDL(h, d) // d = k + w + h + d + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 +} + +func doRoundN1(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) { + // ################################### RND N + 1 ########################### + ADDL(y2, old_h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + MOVL(f, y2) // y2 = f + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + XORL(g, y2) // y2 = f^g + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + ANDL(e, y2) // y2 = (f^g)&e + ADDL(y3, old_h) // h = t1 + S0 + MAJ + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + RORXL(Imm(22), a, y1) // y1 = a >> 22 + MOVL(a, y3) // y3 = a + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(b, y3) // y3 = (a|c)&b + ANDL(c, T1) // T1 = a&c + ADDL(y0, y2) // y2 = S1 + CH + + ADDL(h, d) // d = k + w + h + d + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 +} + +func doRoundN2(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) { + // ################################### RND N + 2 ############################## + ADDL(y2, old_h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + MOVL(f, y2) // y2 = f + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + XORL(g, y2) // y2 = f^g + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + ANDL(e, y2) // y2 = (f^g)&e + ADDL(y3, old_h) // h = t1 + S0 + MAJ + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + RORXL(Imm(22), a, y1) // y1 = a >> 22 + MOVL(a, y3) // y3 = a + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(b, y3) // y3 = (a|c)&b + ANDL(c, T1) // T1 = a&c + ADDL(y0, y2) // y2 = S1 + CH + + ADDL(h, d) // d = k + w + h + d + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 +} + +func doRoundN3(disp int, a, b, c, d, e, 
f, g, h, old_h GPPhysical) { + // ################################### RND N + 3 ########################### + ADDL(y2, old_h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + MOVL(f, y2) // y2 = f + RORXL(Imm(25), e, y0) // y0 = e >> 25 + RORXL(Imm(11), e, y1) // y1 = e >> 11 + XORL(g, y2) // y2 = f^g + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) + RORXL(Imm(6), e, y1) // y1 = (e >> 6) + ANDL(e, y2) // y2 = (f^g)&e + ADDL(y3, old_h) // h = t1 + S0 + MAJ + + XORL(y1, y0) // y0 = (e>>25) ^ (e>>11) ^ (e>>6) + RORXL(Imm(13), a, T1) // T1 = a >> 13 + XORL(g, y2) // y2 = CH = ((f^g)&e)^g + RORXL(Imm(22), a, y1) // y1 = a >> 22 + MOVL(a, y3) // y3 = a + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) + RORXL(Imm(2), a, T1) // T1 = (a >> 2) + ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h + ORL(c, y3) // y3 = a|c + + XORL(T1, y1) // y1 = (a>>22) ^ (a>>13) ^ (a>>2) + MOVL(a, T1) // T1 = a + ANDL(b, y3) // y3 = (a|c)&b + ANDL(c, T1) // T1 = a&c + ADDL(y0, y2) // y2 = S1 + CH + + ADDL(h, d) // d = k + w + h + d + ORL(T1, y3) // y3 = MAJ = (a|c)&b)|(a&c) + ADDL(y1, h) // h = k + w + h + S0 + + ADDL(y2, d) // d = k + w + h + d + S1 + CH = d + t1 + + ADDL(y2, h) // h = k + w + h + S0 + S1 + CH = t1 + S0 + + ADDL(y3, h) // h = t1 + S0 + MAJ +} + +// Pointers for memoizing Data section symbols +var flip_maskPtr, shuff_00BAPtr, shuff_DC00Ptr, K256Ptr *Mem + +// shuffle byte order from LE to BE +func flip_mask_DATA() Mem { + if flip_maskPtr != nil { + return *flip_maskPtr + } + + flip_mask := GLOBL("flip_mask", RODATA) + flip_maskPtr = &flip_mask + + DATA(0x00, U64(0x0405060700010203)) + DATA(0x08, U64(0x0c0d0e0f08090a0b)) + DATA(0x10, U64(0x0405060700010203)) + DATA(0x18, U64(0x0c0d0e0f08090a0b)) + return flip_mask +} + +// shuffle xBxA -> 00BA +func shuff_00BA_DATA() Mem { + if shuff_00BAPtr != nil { + return *shuff_00BAPtr + } + + shuff_00BA := GLOBL("shuff_00BA", RODATA) + shuff_00BAPtr = &shuff_00BA + + DATA(0x00, U64(0x0b0a090803020100)) + DATA(0x08, U64(0xFFFFFFFFFFFFFFFF)) + DATA(0x10, U64(0x0b0a090803020100)) + DATA(0x18, U64(0xFFFFFFFFFFFFFFFF)) + return shuff_00BA +} + +// shuffle xDxC -> DC00 +func shuff_DC00_DATA() Mem { + if shuff_DC00Ptr != nil { + return *shuff_DC00Ptr + } + + shuff_DC00 := GLOBL("shuff_DC00", RODATA) + shuff_DC00Ptr = &shuff_DC00 + + DATA(0x00, U64(0xFFFFFFFFFFFFFFFF)) + DATA(0x08, U64(0x0b0a090803020100)) + DATA(0x10, U64(0xFFFFFFFFFFFFFFFF)) + DATA(0x18, U64(0x0b0a090803020100)) + return shuff_DC00 +} + +// Round specific constants +func K256_DATA() Mem { + if K256Ptr != nil { + return *K256Ptr + } + + K256 := GLOBL("K256", NOPTR+RODATA) + K256Ptr = &K256 + + offset_idx := 0 + + for i := 0; i < len(_K); i += 4 { + DATA((offset_idx+0)*4, U32(_K[i+0])) // k1 + DATA((offset_idx+1)*4, U32(_K[i+1])) // k2 + DATA((offset_idx+2)*4, U32(_K[i+2])) // k3 + DATA((offset_idx+3)*4, U32(_K[i+3])) // k4 + + DATA((offset_idx+4)*4, U32(_K[i+0])) // k1 + DATA((offset_idx+5)*4, U32(_K[i+1])) // k2 + DATA((offset_idx+6)*4, U32(_K[i+2])) // k3 + DATA((offset_idx+7)*4, U32(_K[i+3])) // k4 + offset_idx += 8 + } + return K256 +} diff --git a/src/crypto/sha256/_asm/sha256block_amd64_shani.go b/src/crypto/sha256/_asm/sha256block_amd64_shani.go new file mode 100644 index 0000000000..423e86206f --- /dev/null +++ b/src/crypto/sha256/_asm/sha256block_amd64_shani.go @@ -0,0 +1,174 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . 
"github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" +) + +// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2 +// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version +// +// Reference +// S. Gulley, et al, "New Instructions Supporting the Secure Hash +// Algorithm on Intel® Architecture Processors", July 2013 +// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html + +func blockSHANI() { + Implement("blockSHANI") + Load(Param("dig"), digestPtr) // init digest hash vector H0, H1,..., H7 pointer + Load(Param("p").Base(), dataPtr) // init input data base pointer + Load(Param("p").Len(), numBytes) // get number of input bytes to hash + SHRQ(Imm(6), numBytes) // force modulo 64 input buffer length + SHLQ(Imm(6), numBytes) + CMPQ(numBytes, Imm(0)) // exit early for zero-length input buffer + JEQ(LabelRef("done")) + ADDQ(dataPtr, numBytes) // point numBytes to end of input buffer + VMOVDQU(Mem{Base: digestPtr}.Offset(0*16), state0) // load initial hash values and reorder + VMOVDQU(Mem{Base: digestPtr}.Offset(1*16), state1) // DCBA, HGFE -> ABEF, CDGH + PSHUFD(Imm(0xb1), state0, state0) // CDAB + PSHUFD(Imm(0x1b), state1, state1) // EFGH + VMOVDQA(state0, m4) + PALIGNR(Imm(8), state1, state0) // ABEF + PBLENDW(Imm(0xf0), m4, state1) // CDGH + flip_mask := flip_mask_DATA() + VMOVDQA(flip_mask, shufMask) + LEAQ(K256_DATA(), sha256Constants) + + roundLoop() + done() +} + +func roundLoop() { + Label("roundLoop") + Comment("save hash values for addition after rounds") + VMOVDQA(state0, abefSave) + VMOVDQA(state1, cdghSave) + + Comment("do rounds 0-59") + rounds0to11(m0, nil, 0, nop) // 0-3 + rounds0to11(m1, m0, 1, sha256msg1) // 4-7 + rounds0to11(m2, m1, 2, sha256msg1) // 8-11 + VMOVDQU(Mem{Base: dataPtr}.Offset(3*16), msg) + PSHUFB(shufMask, msg) + rounds12to59(m3, 3, m2, m0, sha256msg1, vmovrev) // 12-15 + rounds12to59(m0, 4, m3, m1, sha256msg1, vmov) // 16-19 + rounds12to59(m1, 5, m0, m2, sha256msg1, vmov) // 20-23 + rounds12to59(m2, 6, m1, m3, sha256msg1, vmov) // 24-27 + rounds12to59(m3, 7, m2, m0, sha256msg1, vmov) // 28-31 + rounds12to59(m0, 8, m3, m1, sha256msg1, vmov) // 32-35 + rounds12to59(m1, 9, m0, m2, sha256msg1, vmov) // 36-39 + rounds12to59(m2, 10, m1, m3, sha256msg1, vmov) // 40-43 + rounds12to59(m3, 11, m2, m0, sha256msg1, vmov) // 44-47 + rounds12to59(m0, 12, m3, m1, sha256msg1, vmov) // 48-51 + rounds12to59(m1, 13, m0, m2, nop, vmov) // 52-55 + rounds12to59(m2, 14, m1, m3, nop, vmov) // 56-59 + + Comment("do rounds 60-63") + VMOVDQA(m3, msg) + PADDD(Mem{Base: sha256Constants}.Offset(15*32), msg) + SHA256RNDS2(msg, state0, state1) + PSHUFD(Imm(0x0e), msg, msg) + SHA256RNDS2(msg, state1, state0) + + Comment("add current hash values with previously saved") + PADDD(abefSave, state0) + PADDD(cdghSave, state1) + + Comment("advance data pointer; loop until buffer empty") + ADDQ(Imm(64), dataPtr) + CMPQ(numBytes, dataPtr) + JNE(LabelRef("roundLoop")) + + Comment("write hash values back in the correct order") + PSHUFD(Imm(0x1b), state0, state0) + PSHUFD(Imm(0xb1), state1, state1) + VMOVDQA(state0, m4) + PBLENDW(Imm(0xf0), state1, state0) + PALIGNR(Imm(8), m4, state1) + VMOVDQU(state0, Mem{Base: digestPtr}.Offset(0*16)) + VMOVDQU(state1, Mem{Base: digestPtr}.Offset(1*16)) +} + +func done() { + Label("done") + RET() +} + +var ( + digestPtr GPPhysical = RDI // input/output, base pointer to digest hash vector 
H0, H1, ..., H7 + dataPtr = RSI // input, base pointer to first input data block + numBytes = RDX // input, number of input bytes to be processed + sha256Constants = RAX // round contents from K256 table, indexed by round number x 32 + msg VecPhysical = X0 // input data + state0 = X1 // round intermediates and outputs + state1 = X2 + m0 = X3 // m0, m1,... m4 -- round message temps + m1 = X4 + m2 = X5 + m3 = X6 + m4 = X7 + shufMask = X8 // input data endian conversion control mask + abefSave = X9 // digest hash vector inter-block buffer abef + cdghSave = X10 // digest hash vector inter-block buffer cdgh +) + +// nop instead of final SHA256MSG1 for first and last few rounds +func nop(m, a VecPhysical) { +} + +// final SHA256MSG1 for middle rounds that require it +func sha256msg1(m, a VecPhysical) { + SHA256MSG1(m, a) +} + +// msg copy for all but rounds 12-15 +func vmov(a, b VecPhysical) { + VMOVDQA(a, b) +} + +// reverse copy for rounds 12-15 +func vmovrev(a, b VecPhysical) { + VMOVDQA(b, a) +} + +type VecFunc func(a, b VecPhysical) + +// sha rounds 0 to 11 +// +// identical with the exception of the final msg op +// which is replaced with a nop for rounds where it is not needed +// refer to Gulley, et al for more information +func rounds0to11(m, a VecPhysical, c int, sha256msg1 VecFunc) { + VMOVDQU(Mem{Base: dataPtr}.Offset(c*16), msg) + PSHUFB(shufMask, msg) + VMOVDQA(msg, m) + PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg) + SHA256RNDS2(msg, state0, state1) + PSHUFD(U8(0x0e), msg, msg) + SHA256RNDS2(msg, state1, state0) + sha256msg1(m, a) +} + +// sha rounds 12 to 59 +// +// identical with the exception of the final msg op +// and the reverse copy(m,msg) in round 12 which is required +// after the last data load +// refer to Gulley, et al for more information +func rounds12to59(m VecPhysical, c int, a, t VecPhysical, sha256msg1, movop VecFunc) { + movop(m, msg) + PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg) + SHA256RNDS2(msg, state0, state1) + VMOVDQA(m, m4) + PALIGNR(Imm(4), a, m4) + PADDD(m4, t) + SHA256MSG2(m, t) + PSHUFD(Imm(0x0e), msg, msg) + SHA256RNDS2(msg, state1, state0) + sha256msg1(m, a) +} diff --git a/src/crypto/sha256/fallback_test.go b/src/crypto/sha256/fallback_test.go deleted file mode 100644 index ceef3cc922..0000000000 --- a/src/crypto/sha256/fallback_test.go +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build s390x && !purego - -package sha256 - -import ( - "fmt" - "io" - "testing" -) - -// Tests the fallback code path in case the optimized asm -// implementation cannot be used. -// See also TestBlockGeneric. 
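An aside, not part of this change: the "DCBA, HGFE -> ABEF, CDGH" comment in the blockSHANI generator above describes how the eight 32-bit hash words are regrouped before entering the round loop, since SHA256RNDS2 consumes the state as one {a, b, e, f} register and one {c, d, g, h} register. Ignoring the lane order inside each register (the PSHUFD/PALIGNR/PBLENDW sequence takes care of that as well), the regrouping amounts to the sketch below; shaniStateSplit is an invented name used only for illustration.

func shaniStateSplit(h [8]uint32) (abef, cdgh [4]uint32) {
	// h holds the working variables a..h in order, as in the digest state.
	abef = [4]uint32{h[0], h[1], h[4], h[5]} // a, b, e, f
	cdgh = [4]uint32{h[2], h[3], h[6], h[7]} // c, d, g, h
	return abef, cdgh
}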
-func TestGenericPath(t *testing.T) { - if useAsm == false { - t.Skipf("assembly implementation unavailable") - } - useAsm = false - defer func() { useAsm = true }() - c := New() - in := "ΑΒΓΔΕϜΖΗΘΙΚΛΜΝΞΟΠϺϘΡΣΤΥΦΧΨΩ" - gold := "e93d84ec2b22383123be9f713697fb25" + - "338c86e2f7d8d1ddc2d89d332dd9d76c" - if _, err := io.WriteString(c, in); err != nil { - t.Fatalf("could not write to c: %v", err) - } - out := fmt.Sprintf("%x", c.Sum(nil)) - if out != gold { - t.Fatalf("mismatch: got %s, wanted %s", out, gold) - } -} diff --git a/src/crypto/sha256/sha256block_amd64.go b/src/crypto/sha256/sha256block_amd64.go index fdd75a3f3e..411f5ebf02 100644 --- a/src/crypto/sha256/sha256block_amd64.go +++ b/src/crypto/sha256/sha256block_amd64.go @@ -8,5 +8,25 @@ package sha256 import "internal/cpu" +//go:noescape +func blockAMD64(dig *digest, p []byte) + var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2 -var useSHA = useAVX2 && cpu.X86.HasSHA + +//go:noescape +func blockAVX2(dig *digest, p []byte) + +var useSHANI = useAVX2 && cpu.X86.HasSHA + +//go:noescape +func blockSHANI(dig *digest, p []byte) + +func block(dig *digest, p []byte) { + if useSHANI { + blockSHANI(dig, p) + } else if useAVX2 { + blockAVX2(dig, p) + } else { + blockAMD64(dig, p) + } +} diff --git a/src/crypto/sha256/sha256block_amd64.s b/src/crypto/sha256/sha256block_amd64.s index 700a4eff97..d073c5fe30 100644 --- a/src/crypto/sha256/sha256block_amd64.s +++ b/src/crypto/sha256/sha256block_amd64.s @@ -4,13 +4,8 @@ #include "textflag.h" -// func block(dig *digest, p []byte) -// Requires: AVX, AVX2, BMI2, SHA, SSE2, SSE4.1, SSSE3 -TEXT ·block(SB), $536-32 - CMPB ·useSHA+0(SB), $0x01 - JE sha_ni - CMPB ·useAVX2+0(SB), $0x01 - JE avx2 +// func blockAMD64(dig *digest, p []byte) +TEXT ·blockAMD64(SB), $264-32 MOVQ p_base+8(FP), SI MOVQ p_len+16(FP), DX SHRQ $0x06, DX @@ -3495,7 +3490,9 @@ loop: end: RET -avx2: +// func blockAVX2(dig *digest, p []byte) +// Requires: AVX, AVX2, BMI2 +TEXT ·blockAVX2(SB), $536-32 MOVQ dig+0(FP), SI MOVQ p_base+8(FP), DI MOVQ p_len+16(FP), DX @@ -4627,7 +4624,157 @@ done_hash: VZEROUPPER RET -sha_ni: +DATA flip_mask<>+0(SB)/8, $0x0405060700010203 +DATA flip_mask<>+8(SB)/8, $0x0c0d0e0f08090a0b +DATA flip_mask<>+16(SB)/8, $0x0405060700010203 +DATA flip_mask<>+24(SB)/8, $0x0c0d0e0f08090a0b +GLOBL flip_mask<>(SB), RODATA, $32 + +DATA K256<>+0(SB)/4, $0x428a2f98 +DATA K256<>+4(SB)/4, $0x71374491 +DATA K256<>+8(SB)/4, $0xb5c0fbcf +DATA K256<>+12(SB)/4, $0xe9b5dba5 +DATA K256<>+16(SB)/4, $0x428a2f98 +DATA K256<>+20(SB)/4, $0x71374491 +DATA K256<>+24(SB)/4, $0xb5c0fbcf +DATA K256<>+28(SB)/4, $0xe9b5dba5 +DATA K256<>+32(SB)/4, $0x3956c25b +DATA K256<>+36(SB)/4, $0x59f111f1 +DATA K256<>+40(SB)/4, $0x923f82a4 +DATA K256<>+44(SB)/4, $0xab1c5ed5 +DATA K256<>+48(SB)/4, $0x3956c25b +DATA K256<>+52(SB)/4, $0x59f111f1 +DATA K256<>+56(SB)/4, $0x923f82a4 +DATA K256<>+60(SB)/4, $0xab1c5ed5 +DATA K256<>+64(SB)/4, $0xd807aa98 +DATA K256<>+68(SB)/4, $0x12835b01 +DATA K256<>+72(SB)/4, $0x243185be +DATA K256<>+76(SB)/4, $0x550c7dc3 +DATA K256<>+80(SB)/4, $0xd807aa98 +DATA K256<>+84(SB)/4, $0x12835b01 +DATA K256<>+88(SB)/4, $0x243185be +DATA K256<>+92(SB)/4, $0x550c7dc3 +DATA K256<>+96(SB)/4, $0x72be5d74 +DATA K256<>+100(SB)/4, $0x80deb1fe +DATA K256<>+104(SB)/4, $0x9bdc06a7 +DATA K256<>+108(SB)/4, $0xc19bf174 +DATA K256<>+112(SB)/4, $0x72be5d74 +DATA K256<>+116(SB)/4, $0x80deb1fe +DATA K256<>+120(SB)/4, $0x9bdc06a7 +DATA K256<>+124(SB)/4, $0xc19bf174 +DATA K256<>+128(SB)/4, $0xe49b69c1 +DATA K256<>+132(SB)/4, $0xefbe4786 +DATA K256<>+136(SB)/4, 
$0x0fc19dc6 +DATA K256<>+140(SB)/4, $0x240ca1cc +DATA K256<>+144(SB)/4, $0xe49b69c1 +DATA K256<>+148(SB)/4, $0xefbe4786 +DATA K256<>+152(SB)/4, $0x0fc19dc6 +DATA K256<>+156(SB)/4, $0x240ca1cc +DATA K256<>+160(SB)/4, $0x2de92c6f +DATA K256<>+164(SB)/4, $0x4a7484aa +DATA K256<>+168(SB)/4, $0x5cb0a9dc +DATA K256<>+172(SB)/4, $0x76f988da +DATA K256<>+176(SB)/4, $0x2de92c6f +DATA K256<>+180(SB)/4, $0x4a7484aa +DATA K256<>+184(SB)/4, $0x5cb0a9dc +DATA K256<>+188(SB)/4, $0x76f988da +DATA K256<>+192(SB)/4, $0x983e5152 +DATA K256<>+196(SB)/4, $0xa831c66d +DATA K256<>+200(SB)/4, $0xb00327c8 +DATA K256<>+204(SB)/4, $0xbf597fc7 +DATA K256<>+208(SB)/4, $0x983e5152 +DATA K256<>+212(SB)/4, $0xa831c66d +DATA K256<>+216(SB)/4, $0xb00327c8 +DATA K256<>+220(SB)/4, $0xbf597fc7 +DATA K256<>+224(SB)/4, $0xc6e00bf3 +DATA K256<>+228(SB)/4, $0xd5a79147 +DATA K256<>+232(SB)/4, $0x06ca6351 +DATA K256<>+236(SB)/4, $0x14292967 +DATA K256<>+240(SB)/4, $0xc6e00bf3 +DATA K256<>+244(SB)/4, $0xd5a79147 +DATA K256<>+248(SB)/4, $0x06ca6351 +DATA K256<>+252(SB)/4, $0x14292967 +DATA K256<>+256(SB)/4, $0x27b70a85 +DATA K256<>+260(SB)/4, $0x2e1b2138 +DATA K256<>+264(SB)/4, $0x4d2c6dfc +DATA K256<>+268(SB)/4, $0x53380d13 +DATA K256<>+272(SB)/4, $0x27b70a85 +DATA K256<>+276(SB)/4, $0x2e1b2138 +DATA K256<>+280(SB)/4, $0x4d2c6dfc +DATA K256<>+284(SB)/4, $0x53380d13 +DATA K256<>+288(SB)/4, $0x650a7354 +DATA K256<>+292(SB)/4, $0x766a0abb +DATA K256<>+296(SB)/4, $0x81c2c92e +DATA K256<>+300(SB)/4, $0x92722c85 +DATA K256<>+304(SB)/4, $0x650a7354 +DATA K256<>+308(SB)/4, $0x766a0abb +DATA K256<>+312(SB)/4, $0x81c2c92e +DATA K256<>+316(SB)/4, $0x92722c85 +DATA K256<>+320(SB)/4, $0xa2bfe8a1 +DATA K256<>+324(SB)/4, $0xa81a664b +DATA K256<>+328(SB)/4, $0xc24b8b70 +DATA K256<>+332(SB)/4, $0xc76c51a3 +DATA K256<>+336(SB)/4, $0xa2bfe8a1 +DATA K256<>+340(SB)/4, $0xa81a664b +DATA K256<>+344(SB)/4, $0xc24b8b70 +DATA K256<>+348(SB)/4, $0xc76c51a3 +DATA K256<>+352(SB)/4, $0xd192e819 +DATA K256<>+356(SB)/4, $0xd6990624 +DATA K256<>+360(SB)/4, $0xf40e3585 +DATA K256<>+364(SB)/4, $0x106aa070 +DATA K256<>+368(SB)/4, $0xd192e819 +DATA K256<>+372(SB)/4, $0xd6990624 +DATA K256<>+376(SB)/4, $0xf40e3585 +DATA K256<>+380(SB)/4, $0x106aa070 +DATA K256<>+384(SB)/4, $0x19a4c116 +DATA K256<>+388(SB)/4, $0x1e376c08 +DATA K256<>+392(SB)/4, $0x2748774c +DATA K256<>+396(SB)/4, $0x34b0bcb5 +DATA K256<>+400(SB)/4, $0x19a4c116 +DATA K256<>+404(SB)/4, $0x1e376c08 +DATA K256<>+408(SB)/4, $0x2748774c +DATA K256<>+412(SB)/4, $0x34b0bcb5 +DATA K256<>+416(SB)/4, $0x391c0cb3 +DATA K256<>+420(SB)/4, $0x4ed8aa4a +DATA K256<>+424(SB)/4, $0x5b9cca4f +DATA K256<>+428(SB)/4, $0x682e6ff3 +DATA K256<>+432(SB)/4, $0x391c0cb3 +DATA K256<>+436(SB)/4, $0x4ed8aa4a +DATA K256<>+440(SB)/4, $0x5b9cca4f +DATA K256<>+444(SB)/4, $0x682e6ff3 +DATA K256<>+448(SB)/4, $0x748f82ee +DATA K256<>+452(SB)/4, $0x78a5636f +DATA K256<>+456(SB)/4, $0x84c87814 +DATA K256<>+460(SB)/4, $0x8cc70208 +DATA K256<>+464(SB)/4, $0x748f82ee +DATA K256<>+468(SB)/4, $0x78a5636f +DATA K256<>+472(SB)/4, $0x84c87814 +DATA K256<>+476(SB)/4, $0x8cc70208 +DATA K256<>+480(SB)/4, $0x90befffa +DATA K256<>+484(SB)/4, $0xa4506ceb +DATA K256<>+488(SB)/4, $0xbef9a3f7 +DATA K256<>+492(SB)/4, $0xc67178f2 +DATA K256<>+496(SB)/4, $0x90befffa +DATA K256<>+500(SB)/4, $0xa4506ceb +DATA K256<>+504(SB)/4, $0xbef9a3f7 +DATA K256<>+508(SB)/4, $0xc67178f2 +GLOBL K256<>(SB), RODATA|NOPTR, $512 + +DATA shuff_00BA<>+0(SB)/8, $0x0b0a090803020100 +DATA shuff_00BA<>+8(SB)/8, $0xffffffffffffffff +DATA shuff_00BA<>+16(SB)/8, $0x0b0a090803020100 +DATA 
shuff_00BA<>+24(SB)/8, $0xffffffffffffffff +GLOBL shuff_00BA<>(SB), RODATA, $32 + +DATA shuff_DC00<>+0(SB)/8, $0xffffffffffffffff +DATA shuff_DC00<>+8(SB)/8, $0x0b0a090803020100 +DATA shuff_DC00<>+16(SB)/8, $0xffffffffffffffff +DATA shuff_DC00<>+24(SB)/8, $0x0b0a090803020100 +GLOBL shuff_DC00<>(SB), RODATA, $32 + +// func blockSHANI(dig *digest, p []byte) +// Requires: AVX, SHA, SSE2, SSE4.1, SSSE3 +TEXT ·blockSHANI(SB), $0-32 MOVQ dig+0(FP), DI MOVQ p_base+8(FP), SI MOVQ p_len+16(FP), DX @@ -4823,151 +4970,3 @@ roundLoop: done: RET - -DATA flip_mask<>+0(SB)/8, $0x0405060700010203 -DATA flip_mask<>+8(SB)/8, $0x0c0d0e0f08090a0b -DATA flip_mask<>+16(SB)/8, $0x0405060700010203 -DATA flip_mask<>+24(SB)/8, $0x0c0d0e0f08090a0b -GLOBL flip_mask<>(SB), RODATA, $32 - -DATA K256<>+0(SB)/4, $0x428a2f98 -DATA K256<>+4(SB)/4, $0x71374491 -DATA K256<>+8(SB)/4, $0xb5c0fbcf -DATA K256<>+12(SB)/4, $0xe9b5dba5 -DATA K256<>+16(SB)/4, $0x428a2f98 -DATA K256<>+20(SB)/4, $0x71374491 -DATA K256<>+24(SB)/4, $0xb5c0fbcf -DATA K256<>+28(SB)/4, $0xe9b5dba5 -DATA K256<>+32(SB)/4, $0x3956c25b -DATA K256<>+36(SB)/4, $0x59f111f1 -DATA K256<>+40(SB)/4, $0x923f82a4 -DATA K256<>+44(SB)/4, $0xab1c5ed5 -DATA K256<>+48(SB)/4, $0x3956c25b -DATA K256<>+52(SB)/4, $0x59f111f1 -DATA K256<>+56(SB)/4, $0x923f82a4 -DATA K256<>+60(SB)/4, $0xab1c5ed5 -DATA K256<>+64(SB)/4, $0xd807aa98 -DATA K256<>+68(SB)/4, $0x12835b01 -DATA K256<>+72(SB)/4, $0x243185be -DATA K256<>+76(SB)/4, $0x550c7dc3 -DATA K256<>+80(SB)/4, $0xd807aa98 -DATA K256<>+84(SB)/4, $0x12835b01 -DATA K256<>+88(SB)/4, $0x243185be -DATA K256<>+92(SB)/4, $0x550c7dc3 -DATA K256<>+96(SB)/4, $0x72be5d74 -DATA K256<>+100(SB)/4, $0x80deb1fe -DATA K256<>+104(SB)/4, $0x9bdc06a7 -DATA K256<>+108(SB)/4, $0xc19bf174 -DATA K256<>+112(SB)/4, $0x72be5d74 -DATA K256<>+116(SB)/4, $0x80deb1fe -DATA K256<>+120(SB)/4, $0x9bdc06a7 -DATA K256<>+124(SB)/4, $0xc19bf174 -DATA K256<>+128(SB)/4, $0xe49b69c1 -DATA K256<>+132(SB)/4, $0xefbe4786 -DATA K256<>+136(SB)/4, $0x0fc19dc6 -DATA K256<>+140(SB)/4, $0x240ca1cc -DATA K256<>+144(SB)/4, $0xe49b69c1 -DATA K256<>+148(SB)/4, $0xefbe4786 -DATA K256<>+152(SB)/4, $0x0fc19dc6 -DATA K256<>+156(SB)/4, $0x240ca1cc -DATA K256<>+160(SB)/4, $0x2de92c6f -DATA K256<>+164(SB)/4, $0x4a7484aa -DATA K256<>+168(SB)/4, $0x5cb0a9dc -DATA K256<>+172(SB)/4, $0x76f988da -DATA K256<>+176(SB)/4, $0x2de92c6f -DATA K256<>+180(SB)/4, $0x4a7484aa -DATA K256<>+184(SB)/4, $0x5cb0a9dc -DATA K256<>+188(SB)/4, $0x76f988da -DATA K256<>+192(SB)/4, $0x983e5152 -DATA K256<>+196(SB)/4, $0xa831c66d -DATA K256<>+200(SB)/4, $0xb00327c8 -DATA K256<>+204(SB)/4, $0xbf597fc7 -DATA K256<>+208(SB)/4, $0x983e5152 -DATA K256<>+212(SB)/4, $0xa831c66d -DATA K256<>+216(SB)/4, $0xb00327c8 -DATA K256<>+220(SB)/4, $0xbf597fc7 -DATA K256<>+224(SB)/4, $0xc6e00bf3 -DATA K256<>+228(SB)/4, $0xd5a79147 -DATA K256<>+232(SB)/4, $0x06ca6351 -DATA K256<>+236(SB)/4, $0x14292967 -DATA K256<>+240(SB)/4, $0xc6e00bf3 -DATA K256<>+244(SB)/4, $0xd5a79147 -DATA K256<>+248(SB)/4, $0x06ca6351 -DATA K256<>+252(SB)/4, $0x14292967 -DATA K256<>+256(SB)/4, $0x27b70a85 -DATA K256<>+260(SB)/4, $0x2e1b2138 -DATA K256<>+264(SB)/4, $0x4d2c6dfc -DATA K256<>+268(SB)/4, $0x53380d13 -DATA K256<>+272(SB)/4, $0x27b70a85 -DATA K256<>+276(SB)/4, $0x2e1b2138 -DATA K256<>+280(SB)/4, $0x4d2c6dfc -DATA K256<>+284(SB)/4, $0x53380d13 -DATA K256<>+288(SB)/4, $0x650a7354 -DATA K256<>+292(SB)/4, $0x766a0abb -DATA K256<>+296(SB)/4, $0x81c2c92e -DATA K256<>+300(SB)/4, $0x92722c85 -DATA K256<>+304(SB)/4, $0x650a7354 -DATA K256<>+308(SB)/4, $0x766a0abb -DATA 
K256<>+312(SB)/4, $0x81c2c92e -DATA K256<>+316(SB)/4, $0x92722c85 -DATA K256<>+320(SB)/4, $0xa2bfe8a1 -DATA K256<>+324(SB)/4, $0xa81a664b -DATA K256<>+328(SB)/4, $0xc24b8b70 -DATA K256<>+332(SB)/4, $0xc76c51a3 -DATA K256<>+336(SB)/4, $0xa2bfe8a1 -DATA K256<>+340(SB)/4, $0xa81a664b -DATA K256<>+344(SB)/4, $0xc24b8b70 -DATA K256<>+348(SB)/4, $0xc76c51a3 -DATA K256<>+352(SB)/4, $0xd192e819 -DATA K256<>+356(SB)/4, $0xd6990624 -DATA K256<>+360(SB)/4, $0xf40e3585 -DATA K256<>+364(SB)/4, $0x106aa070 -DATA K256<>+368(SB)/4, $0xd192e819 -DATA K256<>+372(SB)/4, $0xd6990624 -DATA K256<>+376(SB)/4, $0xf40e3585 -DATA K256<>+380(SB)/4, $0x106aa070 -DATA K256<>+384(SB)/4, $0x19a4c116 -DATA K256<>+388(SB)/4, $0x1e376c08 -DATA K256<>+392(SB)/4, $0x2748774c -DATA K256<>+396(SB)/4, $0x34b0bcb5 -DATA K256<>+400(SB)/4, $0x19a4c116 -DATA K256<>+404(SB)/4, $0x1e376c08 -DATA K256<>+408(SB)/4, $0x2748774c -DATA K256<>+412(SB)/4, $0x34b0bcb5 -DATA K256<>+416(SB)/4, $0x391c0cb3 -DATA K256<>+420(SB)/4, $0x4ed8aa4a -DATA K256<>+424(SB)/4, $0x5b9cca4f -DATA K256<>+428(SB)/4, $0x682e6ff3 -DATA K256<>+432(SB)/4, $0x391c0cb3 -DATA K256<>+436(SB)/4, $0x4ed8aa4a -DATA K256<>+440(SB)/4, $0x5b9cca4f -DATA K256<>+444(SB)/4, $0x682e6ff3 -DATA K256<>+448(SB)/4, $0x748f82ee -DATA K256<>+452(SB)/4, $0x78a5636f -DATA K256<>+456(SB)/4, $0x84c87814 -DATA K256<>+460(SB)/4, $0x8cc70208 -DATA K256<>+464(SB)/4, $0x748f82ee -DATA K256<>+468(SB)/4, $0x78a5636f -DATA K256<>+472(SB)/4, $0x84c87814 -DATA K256<>+476(SB)/4, $0x8cc70208 -DATA K256<>+480(SB)/4, $0x90befffa -DATA K256<>+484(SB)/4, $0xa4506ceb -DATA K256<>+488(SB)/4, $0xbef9a3f7 -DATA K256<>+492(SB)/4, $0xc67178f2 -DATA K256<>+496(SB)/4, $0x90befffa -DATA K256<>+500(SB)/4, $0xa4506ceb -DATA K256<>+504(SB)/4, $0xbef9a3f7 -DATA K256<>+508(SB)/4, $0xc67178f2 -GLOBL K256<>(SB), RODATA|NOPTR, $512 - -DATA shuff_00BA<>+0(SB)/8, $0x0b0a090803020100 -DATA shuff_00BA<>+8(SB)/8, $0xffffffffffffffff -DATA shuff_00BA<>+16(SB)/8, $0x0b0a090803020100 -DATA shuff_00BA<>+24(SB)/8, $0xffffffffffffffff -GLOBL shuff_00BA<>(SB), RODATA, $32 - -DATA shuff_DC00<>+0(SB)/8, $0xffffffffffffffff -DATA shuff_DC00<>+8(SB)/8, $0x0b0a090803020100 -DATA shuff_DC00<>+16(SB)/8, $0xffffffffffffffff -DATA shuff_DC00<>+24(SB)/8, $0x0b0a090803020100 -GLOBL shuff_DC00<>(SB), RODATA, $32 diff --git a/src/crypto/sha256/sha256block_arm64.go b/src/crypto/sha256/sha256block_arm64.go index 434b6f253d..4bb873ac75 100644 --- a/src/crypto/sha256/sha256block_arm64.go +++ b/src/crypto/sha256/sha256block_arm64.go @@ -8,16 +8,13 @@ package sha256 import "internal/cpu" -var k = _K - //go:noescape -func sha256block(h []uint32, p []byte, k []uint32) +func blockSHA2(dig *digest, p []byte) func block(dig *digest, p []byte) { - if !cpu.ARM64.HasSHA2 { - blockGeneric(dig, p) + if cpu.ARM64.HasSHA2 { + blockSHA2(dig, p) } else { - h := dig.h[:] - sha256block(h, p, k) + blockGeneric(dig, p) } } diff --git a/src/crypto/sha256/sha256block_arm64.s b/src/crypto/sha256/sha256block_arm64.s index 6757310c34..f6d19e35c6 100644 --- a/src/crypto/sha256/sha256block_arm64.s +++ b/src/crypto/sha256/sha256block_arm64.s @@ -11,12 +11,12 @@ SHA256H2 V9.S4, V8, V3 \ VMOV V2.B16, V8.B16 -// func sha256block(h []uint32, p []byte, k []uint32) -TEXT ·sha256block(SB),NOSPLIT,$0 - MOVD h_base+0(FP), R0 // Hash value first address - MOVD p_base+24(FP), R1 // message first address - MOVD k_base+48(FP), R2 // k constants first address - MOVD p_len+32(FP), R3 // message length +// func blockSHA2(dig *digest, p []byte) +TEXT ·blockSHA2(SB),NOSPLIT,$0 + MOVD dig+0(FP), 
R0 // Hash value first address + MOVD p_base+8(FP), R1 // message first address + MOVD p_len+16(FP), R3 // message length + MOVD ·_K+0(SB), R2 // k constants first address VLD1 (R0), [V0.S4, V1.S4] // load h(a,b,c,d,e,f,g,h) VLD1.P 64(R2), [V16.S4, V17.S4, V18.S4, V19.S4] VLD1.P 64(R2), [V20.S4, V21.S4, V22.S4, V23.S4] diff --git a/src/crypto/sha256/sha256block_decl.go b/src/crypto/sha256/sha256block_asm.go similarity index 71% rename from src/crypto/sha256/sha256block_decl.go rename to src/crypto/sha256/sha256block_asm.go index e793039387..50e9615c5e 100644 --- a/src/crypto/sha256/sha256block_decl.go +++ b/src/crypto/sha256/sha256block_asm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (386 || amd64 || loong64 || ppc64 || ppc64le || riscv64 || s390x) && !purego +//go:build (386 || loong64 || riscv64) && !purego package sha256 diff --git a/src/crypto/sha256/sha256block_generic.go b/src/crypto/sha256/sha256block_noasm.go similarity index 100% rename from src/crypto/sha256/sha256block_generic.go rename to src/crypto/sha256/sha256block_noasm.go diff --git a/src/crypto/sha256/sha256block_ppc64x.go b/src/crypto/sha256/sha256block_ppc64x.go new file mode 100644 index 0000000000..ae5437598e --- /dev/null +++ b/src/crypto/sha256/sha256block_ppc64x.go @@ -0,0 +1,26 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (ppc64 || ppc64le) && !purego + +package sha256 + +import "internal/godebug" + +// The POWER architecture doesn't have a way to turn off SHA-2 support at +// runtime with GODEBUG=cpu.something=off, so introduce a new GODEBUG knob for +// that. It's intentionally only checked at init() time, to avoid the +// performance overhead of checking it on every block. 
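A usage note, not part of this change: assuming internal/godebug's usual convention that a leading "#" only marks a setting as undocumented and is not part of the $GODEBUG key, the generic code path is selected by starting the process with GODEBUG=ppc64sha2=off. Because the value is captured once at init, it cannot be toggled later in the same process. The same reading applies to the #ppc64sha512 knob added for crypto/sha512 further down; a sketch of cross-checking the new per-backend entry points against blockGeneric, independent of these knobs, follows the final hunk of this patch.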
+var ppc64sha2 = godebug.New("#ppc64sha2").Value() != "off" + +//go:noescape +func blockPOWER(dig *digest, p []byte) + +func block(dig *digest, p []byte) { + if ppc64sha2 { + blockPOWER(dig, p) + } else { + blockGeneric(dig, p) + } +} diff --git a/src/crypto/sha256/sha256block_ppc64x.s b/src/crypto/sha256/sha256block_ppc64x.s index ba8fa623c1..a5f40ff04a 100644 --- a/src/crypto/sha256/sha256block_ppc64x.s +++ b/src/crypto/sha256/sha256block_ppc64x.s @@ -284,8 +284,8 @@ GLOBL ·kcon(SB), RODATA, $1088 #define VPERMLE(va,vb,vc,vt) #endif -// func block(dig *digest, p []byte) -TEXT ·block(SB),0,$0-32 +// func blockPOWER(dig *digest, p []byte) +TEXT ·blockPOWER(SB),0,$0-32 MOVD dig+0(FP), CTX MOVD p_base+8(FP), INP MOVD p_len+16(FP), LEN diff --git a/src/crypto/sha256/sha256block_s390x.go b/src/crypto/sha256/sha256block_s390x.go index 0a1dc5785d..2abebc98e9 100644 --- a/src/crypto/sha256/sha256block_s390x.go +++ b/src/crypto/sha256/sha256block_s390x.go @@ -8,4 +8,13 @@ package sha256 import "internal/cpu" -var useAsm = cpu.S390X.HasSHA256 +//go:noescape +func blockS390X(dig *digest, p []byte) + +func block(dig *digest, p []byte) { + if cpu.S390X.HasSHA256 { + blockS390X(dig, p) + } else { + blockGeneric(dig, p) + } +} diff --git a/src/crypto/sha256/sha256block_s390x.s b/src/crypto/sha256/sha256block_s390x.s index 757d62f512..6372d67738 100644 --- a/src/crypto/sha256/sha256block_s390x.s +++ b/src/crypto/sha256/sha256block_s390x.s @@ -6,17 +6,12 @@ #include "textflag.h" -// func block(dig *digest, p []byte) -TEXT ·block(SB), NOSPLIT|NOFRAME, $0-32 - MOVBZ ·useAsm(SB), R4 +// func blockS390X(dig *digest, p []byte) +TEXT ·blockS390X(SB), NOSPLIT|NOFRAME, $0-32 LMG dig+0(FP), R1, R3 // R2 = &p[0], R3 = len(p) MOVBZ $2, R0 // SHA-256 function code - CMPBEQ R4, $0, generic loop: KIMD R0, R2 // compute intermediate message digest (KIMD) BVS loop // continue if interrupted RET - -generic: - BR ·blockGeneric(SB) diff --git a/src/crypto/sha512/fallback_test.go b/src/crypto/sha512/fallback_test.go deleted file mode 100644 index b55a4a56fa..0000000000 --- a/src/crypto/sha512/fallback_test.go +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build s390x && !purego - -package sha512 - -import ( - "fmt" - "io" - "testing" -) - -// Tests the fallback code path in case the optimized asm -// implementation cannot be used. -// See also TestBlockGeneric. 
-func TestGenericPath(t *testing.T) { - if !useAsm { - t.Skipf("assembly implementation unavailable") - } - useAsm = false - defer func() { useAsm = true }() - c := New() - in := "ΑΒΓΔΕϜΖΗΘΙΚΛΜΝΞΟΠϺϘΡΣΤΥΦΧΨΩ" - gold := "6922e319366d677f34c504af31bfcb29" + - "e531c125ecd08679362bffbd6b6ebfb9" + - "0dcc27dfc1f3d3b16a16c0763cf43b91" + - "40bbf9bbb7233724e9a0c6655b185d76" - if _, err := io.WriteString(c, in); err != nil { - t.Fatalf("could not write to c: %v", err) - } - out := fmt.Sprintf("%x", c.Sum(nil)) - if out != gold { - t.Fatalf("mismatch: got %s, wanted %s", out, gold) - } -} diff --git a/src/crypto/sha512/sha512block_arm64.go b/src/crypto/sha512/sha512block_arm64.go index 4e2793100a..d62eb92917 100644 --- a/src/crypto/sha512/sha512block_arm64.go +++ b/src/crypto/sha512/sha512block_arm64.go @@ -8,13 +8,13 @@ package sha512 import "internal/cpu" +//go:noescape +func blockSHA512(dig *digest, p []byte) + func block(dig *digest, p []byte) { if cpu.ARM64.HasSHA512 { - blockAsm(dig, p) - return + blockSHA512(dig, p) + } else { + blockGeneric(dig, p) } - blockGeneric(dig, p) } - -//go:noescape -func blockAsm(dig *digest, p []byte) diff --git a/src/crypto/sha512/sha512block_arm64.s b/src/crypto/sha512/sha512block_arm64.s index 25f3dbfe43..15242e4bbc 100644 --- a/src/crypto/sha512/sha512block_arm64.s +++ b/src/crypto/sha512/sha512block_arm64.s @@ -40,8 +40,8 @@ VADD i3.D2, i1.D2, i4.D2 \ SHA512H2 i0.D2, i1, i3 -// func blockAsm(dig *digest, p []byte) -TEXT ·blockAsm(SB),NOSPLIT,$0 +// func blockSHA512(dig *digest, p []byte) +TEXT ·blockSHA512(SB),NOSPLIT,$0 MOVD dig+0(FP), R0 MOVD p_base+8(FP), R1 MOVD p_len+16(FP), R2 diff --git a/src/crypto/sha512/sha512block_decl.go b/src/crypto/sha512/sha512block_asm.go similarity index 75% rename from src/crypto/sha512/sha512block_decl.go rename to src/crypto/sha512/sha512block_asm.go index b8a7854e4d..888804678e 100644 --- a/src/crypto/sha512/sha512block_decl.go +++ b/src/crypto/sha512/sha512block_asm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (loong64 || ppc64 || ppc64le || riscv64 || s390x) && !purego +//go:build (loong64 || riscv64) && !purego package sha512 diff --git a/src/crypto/sha512/sha512block_generic.go b/src/crypto/sha512/sha512block_noasm.go similarity index 100% rename from src/crypto/sha512/sha512block_generic.go rename to src/crypto/sha512/sha512block_noasm.go diff --git a/src/crypto/sha512/sha512block_ppc64x.go b/src/crypto/sha512/sha512block_ppc64x.go new file mode 100644 index 0000000000..2f7793ba49 --- /dev/null +++ b/src/crypto/sha512/sha512block_ppc64x.go @@ -0,0 +1,26 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (ppc64 || ppc64le) && !purego + +package sha512 + +import "internal/godebug" + +// The POWER architecture doesn't have a way to turn off SHA-512 support at +// runtime with GODEBUG=cpu.something=off, so introduce a new GODEBUG knob for +// that. It's intentionally only checked at init() time, to avoid the +// performance overhead of checking it on every block. 
+var ppc64sha512 = godebug.New("#ppc64sha512").Value() != "off" + +//go:noescape +func blockPOWER(dig *digest, p []byte) + +func block(dig *digest, p []byte) { + if ppc64sha512 { + blockPOWER(dig, p) + } else { + blockGeneric(dig, p) + } +} diff --git a/src/crypto/sha512/sha512block_ppc64x.s b/src/crypto/sha512/sha512block_ppc64x.s index 87aab80903..cccce22797 100644 --- a/src/crypto/sha512/sha512block_ppc64x.s +++ b/src/crypto/sha512/sha512block_ppc64x.s @@ -304,8 +304,8 @@ GLOBL ·kcon(SB), RODATA, $1312 VADDUDM S0, h, h; \ VADDUDM s1, xj, xj -// func block(dig *digest, p []byte) -TEXT ·block(SB),0,$0-32 +// func blockPOWER(dig *digest, p []byte) +TEXT ·blockPOWER(SB),0,$0-32 MOVD dig+0(FP), CTX MOVD p_base+8(FP), INP MOVD p_len+16(FP), LEN diff --git a/src/crypto/sha512/sha512block_s390x.go b/src/crypto/sha512/sha512block_s390x.go index d0f09ea9ed..2d1b9ed3db 100644 --- a/src/crypto/sha512/sha512block_s390x.go +++ b/src/crypto/sha512/sha512block_s390x.go @@ -8,4 +8,13 @@ package sha512 import "internal/cpu" -var useAsm = cpu.S390X.HasSHA512 +//go:noescape +func blockS390X(dig *digest, p []byte) + +func block(dig *digest, p []byte) { + if cpu.S390X.HasSHA512 { + blockS390X(dig, p) + } else { + blockGeneric(dig, p) + } +} diff --git a/src/crypto/sha512/sha512block_s390x.s b/src/crypto/sha512/sha512block_s390x.s index 230bd414d3..bd3cd43967 100644 --- a/src/crypto/sha512/sha512block_s390x.s +++ b/src/crypto/sha512/sha512block_s390x.s @@ -6,17 +6,12 @@ #include "textflag.h" -// func block(dig *digest, p []byte) -TEXT ·block(SB), NOSPLIT|NOFRAME, $0-32 - MOVBZ ·useAsm(SB), R4 +// func blockS390X(dig *digest, p []byte) +TEXT ·blockS390X(SB), NOSPLIT|NOFRAME, $0-32 LMG dig+0(FP), R1, R3 // R2 = &p[0], R3 = len(p) MOVBZ $3, R0 // SHA-512 function code - CMPBEQ R4, $0, generic loop: KIMD R0, R2 // compute intermediate message digest (KIMD) BVS loop // continue if interrupted RET - -generic: - BR ·blockGeneric(SB)
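Not part of this change, but an illustration of what the restructuring enables: with every assembly backend now behind its own Go-callable function (blockAMD64, blockAVX2, blockSHANI, blockSHA2, blockPOWER, blockS390X, and their sha512 counterparts), implementations can be compared against blockGeneric directly, instead of flipping a package-level useAsm variable as the deleted fallback tests did. In the sketch below the test name, input, and table layout are invented; the identifiers blockAMD64, blockAVX2, blockSHANI, useAVX2, useSHANI, blockGeneric, and digest match the amd64 files above, and the same shape would carry over to the other architectures' entry points.

//go:build amd64 && !purego

package sha256

import "testing"

// TestAMD64VariantsMatchGeneric feeds the same input to each assembly backend
// that the current CPU supports and compares the resulting hash state against
// the portable blockGeneric implementation.
func TestAMD64VariantsMatchGeneric(t *testing.T) {
	variants := []struct {
		name      string
		available bool
		block     func(*digest, []byte)
	}{
		{"blockAMD64", true, blockAMD64},
		{"blockAVX2", useAVX2, blockAVX2},
		{"blockSHANI", useSHANI, blockSHANI},
	}

	// Four 64-byte blocks of arbitrary but deterministic input.
	p := make([]byte, 4*64)
	for i := range p {
		p[i] = byte(i)
	}

	want := new(digest)
	want.Reset()
	blockGeneric(want, p)

	for _, v := range variants {
		if !v.available {
			t.Logf("%s: not supported by this CPU, skipping", v.name)
			continue
		}
		got := new(digest)
		got.Reset()
		v.block(got, p)
		if got.h != want.h {
			t.Errorf("%s: hash state = %x, want %x", v.name, got.h, want.h)
		}
	}
}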