crypto/sha256,crypto/sha512: make assembly structure consistent
author     Filippo Valsorda <filippo@golang.org>
           Thu, 19 Sep 2024 18:04:30 +0000 (20:04 +0200)
committer  Gopher Robot <gobot@golang.org>
           Wed, 23 Oct 2024 15:21:09 +0000 (15:21 +0000)
Ensure separate implementations are in different functions called from Go,
and that they can be turned off from a GODEBUG setting.

This will be necessary to test implementations separately for #69536.
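
A rough sketch of the shape this gives the Go side; only the blockAMD64 /
blockAVX2 / blockSHANI names and the useAVX2 / useSHA flags come from this
change, while the digest stand-in, the flag values, and the GODEBUG plumbing
are illustrative assumptions:

    // Hedged sketch, not code from this CL.
    package sketch

    type digest struct{ h [8]uint32 } // stand-in for the real sha256 digest

    // Stand-ins for the separate assembly entry points generated by avo.
    func blockAMD64(dig *digest, p []byte) {}
    func blockAVX2(dig *digest, p []byte)  {}
    func blockSHANI(dig *digest, p []byte) {}

    // In the real package these come from CPU feature detection; per the
    // commit message, a GODEBUG setting can force them off so each
    // implementation can be tested on its own.
    var useAVX2, useSHA bool

    func block(dig *digest, p []byte) {
        switch {
        case useSHA:
            blockSHANI(dig, p)
        case useAVX2:
            blockAVX2(dig, p)
        default:
            blockAMD64(dig, p)
        }
    }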

Change-Id: I3e081deb7abb01b0665265e39c72fd4037dd48b3
Cq-Include-Trybots: luci.golang.try:gotip-linux-arm64-longtest,gotip-linux-amd64-longtest,gotip-linux-ppc64le_power8,gotip-linux-ppc64_power8
Reviewed-on: https://go-review.googlesource.com/c/go/+/614495
Reviewed-by: Daniel McCarney <daniel@binaryparadox.net>
Auto-Submit: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Roland Shoemaker <roland@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
23 files changed:
src/crypto/sha256/_asm/sha256block_amd64_asm.go
src/crypto/sha256/_asm/sha256block_amd64_avx2.go [new file with mode: 0644]
src/crypto/sha256/_asm/sha256block_amd64_shani.go [new file with mode: 0644]
src/crypto/sha256/fallback_test.go [deleted file]
src/crypto/sha256/sha256block_amd64.go
src/crypto/sha256/sha256block_amd64.s
src/crypto/sha256/sha256block_arm64.go
src/crypto/sha256/sha256block_arm64.s
src/crypto/sha256/sha256block_asm.go [moved from src/crypto/sha256/sha256block_decl.go with 71% similarity]
src/crypto/sha256/sha256block_noasm.go [moved from src/crypto/sha256/sha256block_generic.go with 100% similarity]
src/crypto/sha256/sha256block_ppc64x.go [new file with mode: 0644]
src/crypto/sha256/sha256block_ppc64x.s
src/crypto/sha256/sha256block_s390x.go
src/crypto/sha256/sha256block_s390x.s
src/crypto/sha512/fallback_test.go [deleted file]
src/crypto/sha512/sha512block_arm64.go
src/crypto/sha512/sha512block_arm64.s
src/crypto/sha512/sha512block_asm.go [moved from src/crypto/sha512/sha512block_decl.go with 75% similarity]
src/crypto/sha512/sha512block_noasm.go [moved from src/crypto/sha512/sha512block_generic.go with 100% similarity]
src/crypto/sha512/sha512block_ppc64x.go [new file with mode: 0644]
src/crypto/sha512/sha512block_ppc64x.s
src/crypto/sha512/sha512block_s390x.go
src/crypto/sha512/sha512block_s390x.s
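
The sha256block_asm.go / sha256block_noasm.go pair above follows the usual
build-constraint split between architectures with assembly and the pure-Go
fallback; as a hedged sketch (the constraint expression below is an
assumption, not copied from this change), the fallback file amounts to:

    //go:build (!amd64 && !arm64 && !ppc64 && !ppc64le && !s390x) || purego

    package sha256

    // Without assembly, block is just the portable Go implementation.
    func block(dig *digest, p []byte) {
        blockGeneric(dig, p)
    }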

index 3c70e018ce6037b805cfc91feb63cdd9a1d7bf8e..24256185bc2ab66c5fa79f69ef06b6a2d5526cab 100644 (file)
@@ -18,17 +18,6 @@ import (
 //
 //  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
 
-// The avx2-version is described in an Intel White-Paper:
-// "Fast SHA-256 Implementations on Intel Architecture Processors"
-// To find it, surf to http://www.intel.com/p/en_US/embedded
-// and search for that title.
-// AVX2 version by Intel, same algorithm as code in Linux kernel:
-// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
-// by
-//     James Guilford <james.guilford@intel.com>
-//     Kirk Yap <kirk.s.yap@intel.com>
-//     Tim Chen <tim.c.chen@linux.intel.com>
-
 // Wt = Mt; for 0 <= t <= 15
 //  Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
 //
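
The Wt recurrence in the comment above is the standard FIPS 180-4 message
schedule; written as plain Go for reference (a sketch of the generic
computation, not the vectorized code in this diff):

    package sketch

    import (
        "encoding/binary"
        "math/bits"
    )

    // messageSchedule expands one 64-byte block into the 64 Wt words.
    func messageSchedule(p []byte) [64]uint32 {
        var w [64]uint32
        for t := 0; t < 16; t++ { // Wt = Mt
            w[t] = binary.BigEndian.Uint32(p[t*4:])
        }
        for t := 16; t < 64; t++ { // Wt = sigma1(Wt-2) + Wt-7 + sigma0(Wt-15) + Wt-16
            s0 := bits.RotateLeft32(w[t-15], -7) ^ bits.RotateLeft32(w[t-15], -18) ^ (w[t-15] >> 3)
            s1 := bits.RotateLeft32(w[t-2], -17) ^ bits.RotateLeft32(w[t-2], -19) ^ (w[t-2] >> 10)
            w[t] = s1 + w[t-7] + s0 + w[t-16]
        }
        return w
    }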
@@ -66,7 +55,9 @@ import (
 func main() {
        Package("crypto/sha256")
        ConstraintExpr("!purego")
-       block()
+       blockAMD64()
+       blockAVX2()
+       blockSHANI()
        Generate()
 }
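
Each of the three calls above becomes its own avo-generated TEXT symbol:
Implement starts a function whose signature is taken from the package's Go
declarations, and Generate (at the end of main) writes them all into one .s
file. A minimal sketch of that generator pattern, using calls that appear
elsewhere in this diff; the name blockExample and its body are illustrative
only:

    func blockExample() {
        Implement("blockExample")    // new TEXT symbol bound to the Go stub of the same name
        Load(Param("p").Base(), RSI) // base pointer of the input slice
        Load(Param("p").Len(), RDX)  // length of the input slice
        // ... round computation elided ...
        RET()
    }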
 
@@ -176,519 +167,10 @@ func sha256Round1(index int, konst uint32, a, b, c, d, e, f, g, h GPPhysical) {
        sha256Round(index, konst, a, b, c, d, e, f, g, h)
 }
 
-// Definitions for AVX2 version
-
-// addm (mem), reg
-//   - Add reg to mem using reg-mem add and store
-func addm(P1 Mem, P2 GPPhysical) {
-       ADDL(P2, P1)
-       MOVL(P1, P2)
-}
-
-var (
-       XDWORD0 VecPhysical = Y4
-       XDWORD1             = Y5
-       XDWORD2             = Y6
-       XDWORD3             = Y7
-
-       XWORD0 = X4
-       XWORD1 = X5
-       XWORD2 = X6
-       XWORD3 = X7
-
-       XTMP0 = Y0
-       XTMP1 = Y1
-       XTMP2 = Y2
-       XTMP3 = Y3
-       XTMP4 = Y8
-       XTMP5 = Y11
-
-       XFER = Y9
-
-       BYTE_FLIP_MASK   = Y13 // mask to convert LE -> BE
-       X_BYTE_FLIP_MASK = X13
-
-       NUM_BYTES GPPhysical = RDX
-       INP                  = RDI
-
-       CTX = RSI // Beginning of digest in memory (a, b, c, ... , h)
-
-       a = EAX
-       b = EBX
-       c = ECX
-       d = R8L
-       e = EDX
-       f = R9L
-       g = R10L
-       h = R11L
-
-       old_h = R11L
-
-       TBL = RBP
-
-       SRND = RSI // SRND is same register as CTX
-
-       T1 = R12L
-
-       y0 = R13L
-       y1 = R14L
-       y2 = R15L
-       y3 = EDI
-
-       // Offsets
-       XFER_SIZE    = 2 * 64 * 4
-       INP_END_SIZE = 8
-       INP_SIZE     = 8
-
-       _XFER      = 0
-       _INP_END   = _XFER + XFER_SIZE
-       _INP       = _INP_END + INP_END_SIZE
-       STACK_SIZE = _INP + INP_SIZE
-)
-
-func roundAndSchedN0(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
-       //                                                                 #############################  RND N + 0 ############################//
-       MOVL(a, y3)           //                                           y3 = a
-       RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
-       RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
-
-       ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h
-       ORL(c, y3)                                                      // y3 = a|c
-       VPALIGNR(Imm(4), XDWORD2, XDWORD3, XTMP0)                       // XTMP0 = W[-7]
-       MOVL(f, y2)                                                     // y2 = f
-       RORXL(Imm(13), a, T1)                                           // T1 = a >> 13
-
-       XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11)
-       XORL(g, y2)                   //                                   y2 = f^g
-       VPADDD(XDWORD0, XTMP0, XTMP0) //                                   XTMP0 = W[-7] + W[-16]
-       RORXL(Imm(6), e, y1)          //                                   y1 = (e >> 6)
-
-       ANDL(e, y2)           //                                           y2 = (f^g)&e
-       XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
-       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
-       ADDL(h, d)            //                                           d = k + w + h + d
-
-       ANDL(b, y3)                               //                       y3 = (a|c)&b
-       VPALIGNR(Imm(4), XDWORD0, XDWORD1, XTMP1) //                       XTMP1 = W[-15]
-       XORL(T1, y1)                              //                       y1 = (a>>22) ^ (a>>13)
-       RORXL(Imm(2), a, T1)                      //                       T1 = (a >> 2)
-
-       XORL(g, y2)                  //                                    y2 = CH = ((f^g)&e)^g
-       VPSRLD(Imm(7), XTMP1, XTMP2) //
-       XORL(T1, y1)                 //                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
-       MOVL(a, T1)                  //                                    T1 = a
-       ANDL(c, T1)                  //                                    T1 = a&c
-
-       ADDL(y0, y2)                    //                                 y2 = S1 + CH
-       VPSLLD(Imm(32-7), XTMP1, XTMP3) //
-       ORL(T1, y3)                     //                                 y3 = MAJ = (a|c)&b)|(a&c)
-       ADDL(y1, h)                     //                                 h = k + w + h + S0
-
-       ADDL(y2, d)               //                                       d = k + w + h + d + S1 + CH = d + t1
-       VPOR(XTMP2, XTMP3, XTMP3) //                                       XTMP3 = W[-15] ror 7
-
-       VPSRLD(Imm(18), XTMP1, XTMP2)
-       ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0
-       ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
-}
-
-func roundAndSchedN1(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
-       //                                                                 ################################### RND N + 1 ############################
-       MOVL(a, y3)                                                     // y3 = a
-       RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
-       RORXL(Imm(11), e, y1)                                           // y1 = e >> 11
-       ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h
-       ORL(c, y3)                                                      // y3 = a|c
-
-       VPSRLD(Imm(3), XTMP1, XTMP4) //                                    XTMP4 = W[-15] >> 3
-       MOVL(f, y2)                  //                                    y2 = f
-       RORXL(Imm(13), a, T1)        //                                    T1 = a >> 13
-       XORL(y1, y0)                 //                                    y0 = (e>>25) ^ (e>>11)
-       XORL(g, y2)                  //                                    y2 = f^g
-
-       RORXL(Imm(6), e, y1)  //                                           y1 = (e >> 6)
-       XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
-       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
-       ANDL(e, y2)           //                                           y2 = (f^g)&e
-       ADDL(h, d)            //                                           d = k + w + h + d
-
-       VPSLLD(Imm(32-18), XTMP1, XTMP1)
-       ANDL(b, y3)  //                                                    y3 = (a|c)&b
-       XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13)
-
-       VPXOR(XTMP1, XTMP3, XTMP3)
-       RORXL(Imm(2), a, T1) //                                            T1 = (a >> 2)
-       XORL(g, y2)          //                                            y2 = CH = ((f^g)&e)^g
-
-       VPXOR(XTMP2, XTMP3, XTMP3) //                                      XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
-       XORL(T1, y1)               //                                      y1 = (a>>22) ^ (a>>13) ^ (a>>2)
-       MOVL(a, T1)                //                                      T1 = a
-       ANDL(c, T1)                //                                      T1 = a&c
-       ADDL(y0, y2)               //                                      y2 = S1 + CH
-
-       VPXOR(XTMP4, XTMP3, XTMP1)         //                              XTMP1 = s0
-       VPSHUFD(Imm(0xFA), XDWORD3, XTMP2) //                              XTMP2 = W[-2] {BBAA}
-       ORL(T1, y3)                        //                              y3 = MAJ = (a|c)&b)|(a&c)
-       ADDL(y1, h)                        //                              h = k + w + h + S0
-
-       VPADDD(XTMP1, XTMP0, XTMP0) //                                     XTMP0 = W[-16] + W[-7] + s0
-       ADDL(y2, d)                 //                                     d = k + w + h + d + S1 + CH = d + t1
-       ADDL(y2, h)                 //                                     h = k + w + h + S0 + S1 + CH = t1 + S0
-       ADDL(y3, h)                 //                                     h = t1 + S0 + MAJ
-
-       VPSRLD(Imm(10), XTMP2, XTMP4) //                                   XTMP4 = W[-2] >> 10 {BBAA}
-}
-
-func roundAndSchedN2(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
-       //                                                                 ################################### RND N + 2 ############################
-       var shuff_00BA Mem = shuff_00BA_DATA()
-
-       MOVL(a, y3)                                                     // y3 = a
-       RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
-       ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h
-
-       VPSRLQ(Imm(19), XTMP2, XTMP3) //                                   XTMP3 = W[-2] ror 19 {xBxA}
-       RORXL(Imm(11), e, y1)         //                                   y1 = e >> 11
-       ORL(c, y3)                    //                                   y3 = a|c
-       MOVL(f, y2)                   //                                   y2 = f
-       XORL(g, y2)                   //                                   y2 = f^g
-
-       RORXL(Imm(13), a, T1)         //                                   T1 = a >> 13
-       XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11)
-       VPSRLQ(Imm(17), XTMP2, XTMP2) //                                   XTMP2 = W[-2] ror 17 {xBxA}
-       ANDL(e, y2)                   //                                   y2 = (f^g)&e
-
-       RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
-       VPXOR(XTMP3, XTMP2, XTMP2)
-       ADDL(h, d)  //                                                     d = k + w + h + d
-       ANDL(b, y3) //                                                     y3 = (a|c)&b
-
-       XORL(y1, y0)               //                                      y0 = (e>>25) ^ (e>>11) ^ (e>>6)
-       RORXL(Imm(22), a, y1)      //                                      y1 = a >> 22
-       VPXOR(XTMP2, XTMP4, XTMP4) //                                      XTMP4 = s1 {xBxA}
-       XORL(g, y2)                //                                      y2 = CH = ((f^g)&e)^g
-
-       VPSHUFB(shuff_00BA, XTMP4, XTMP4) //                               XTMP4 = s1 {00BA}
-
-       XORL(T1, y1)                //                                     y1 = (a>>22) ^ (a>>13)
-       RORXL(Imm(2), a, T1)        //                                     T1 = (a >> 2)
-       VPADDD(XTMP4, XTMP0, XTMP0) //                                     XTMP0 = {..., ..., W[1], W[0]}
-
-       XORL(T1, y1)                   //                                  y1 = (a>>22) ^ (a>>13) ^ (a>>2)
-       MOVL(a, T1)                    //                                  T1 = a
-       ANDL(c, T1)                    //                                  T1 = a&c
-       ADDL(y0, y2)                   //                                  y2 = S1 + CH
-       VPSHUFD(Imm(80), XTMP0, XTMP2) //                                  XTMP2 = W[-2] {DDCC}
-
-       ORL(T1, y3) //                                                     y3 = MAJ = (a|c)&b)|(a&c)
-       ADDL(y1, h) //                                                     h = k + w + h + S0
-       ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
-       ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0
-
-       ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
-}
-
-func roundAndSchedN3(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
-       //                                                                 ################################### RND N + 3 ############################
-       var shuff_DC00 Mem = shuff_DC00_DATA()
-
-       MOVL(a, y3)                                                     // y3 = a
-       RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
-       RORXL(Imm(11), e, y1)                                           // y1 = e >> 11
-       ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h
-       ORL(c, y3)                                                      // y3 = a|c
-
-       VPSRLD(Imm(10), XTMP2, XTMP5) //                                   XTMP5 = W[-2] >> 10 {DDCC}
-       MOVL(f, y2)                   //                                   y2 = f
-       RORXL(Imm(13), a, T1)         //                                   T1 = a >> 13
-       XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11)
-       XORL(g, y2)                   //                                   y2 = f^g
+func blockAMD64() {
+       Implement("blockAMD64")
+       AllocLocal(256 + 8)
 
-       VPSRLQ(Imm(19), XTMP2, XTMP3) //                                   XTMP3 = W[-2] ror 19 {xDxC}
-       RORXL(Imm(6), e, y1)          //                                   y1 = (e >> 6)
-       ANDL(e, y2)                   //                                   y2 = (f^g)&e
-       ADDL(h, d)                    //                                   d = k + w + h + d
-       ANDL(b, y3)                   //                                   y3 = (a|c)&b
-
-       VPSRLQ(Imm(17), XTMP2, XTMP2) //                                   XTMP2 = W[-2] ror 17 {xDxC}
-       XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11) ^ (e>>6)
-       XORL(g, y2)                   //                                   y2 = CH = ((f^g)&e)^g
-
-       VPXOR(XTMP3, XTMP2, XTMP2)
-       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
-       ADDL(y0, y2)          //                                           y2 = S1 + CH
-
-       VPXOR(XTMP2, XTMP5, XTMP5) //                                      XTMP5 = s1 {xDxC}
-       XORL(T1, y1)               //                                      y1 = (a>>22) ^ (a>>13)
-       ADDL(y2, d)                //                                      d = k + w + h + d + S1 + CH = d + t1
-
-       RORXL(Imm(2), a, T1) //                                            T1 = (a >> 2)
-
-       VPSHUFB(shuff_DC00, XTMP5, XTMP5) //                               XTMP5 = s1 {DC00}
-
-       VPADDD(XTMP0, XTMP5, XDWORD0) //                                   XDWORD0 = {W[3], W[2], W[1], W[0]}
-       XORL(T1, y1)                  //                                   y1 = (a>>22) ^ (a>>13) ^ (a>>2)
-       MOVL(a, T1)                   //                                   T1 = a
-       ANDL(c, T1)                   //                                   T1 = a&c
-       ORL(T1, y3)                   //                                   y3 = MAJ = (a|c)&b)|(a&c)
-
-       ADDL(y1, h) //                                                     h = k + w + h + S0
-       ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0
-       ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
-}
-
-func doRoundN0(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
-       //                                                                 ################################### RND N + 0 ###########################
-       MOVL(f, y2)           //                                           y2 = f
-       RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
-       RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
-       XORL(g, y2)           //                                           y2 = f^g
-
-       XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
-       RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
-       ANDL(e, y2)          //                                            y2 = (f^g)&e
-
-       XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
-       RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
-       XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
-       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
-       MOVL(a, y3)           //                                           y3 = a
-
-       XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
-       RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
-       ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h
-       ORL(c, y3)                                                      // y3 = a|c
-
-       XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
-       MOVL(a, T1)  //                                                    T1 = a
-       ANDL(b, y3)  //                                                    y3 = (a|c)&b
-       ANDL(c, T1)  //                                                    T1 = a&c
-       ADDL(y0, y2) //                                                    y2 = S1 + CH
-
-       ADDL(h, d)  //                                                     d = k + w + h + d
-       ORL(T1, y3) //                                                     y3 = MAJ = (a|c)&b)|(a&c)
-       ADDL(y1, h) //                                                     h = k + w + h + S0
-       ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
-}
-
-func doRoundN1(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
-       //                                                                 ################################### RND N + 1 ###########################
-       ADDL(y2, old_h)       //                                           h = k + w + h + S0 + S1 + CH = t1 + S0
-       MOVL(f, y2)           //                                           y2 = f
-       RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
-       RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
-       XORL(g, y2)           //                                           y2 = f^g
-
-       XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
-       RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
-       ANDL(e, y2)          //                                            y2 = (f^g)&e
-       ADDL(y3, old_h)      //                                            h = t1 + S0 + MAJ
-
-       XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
-       RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
-       XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
-       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
-       MOVL(a, y3)           //                                           y3 = a
-
-       XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
-       RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
-       ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h
-       ORL(c, y3)                                                      // y3 = a|c
-
-       XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
-       MOVL(a, T1)  //                                                    T1 = a
-       ANDL(b, y3)  //                                                    y3 = (a|c)&b
-       ANDL(c, T1)  //                                                    T1 = a&c
-       ADDL(y0, y2) //                                                    y2 = S1 + CH
-
-       ADDL(h, d)  //                                                     d = k + w + h + d
-       ORL(T1, y3) //                                                     y3 = MAJ = (a|c)&b)|(a&c)
-       ADDL(y1, h) //                                                     h = k + w + h + S0
-
-       ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
-}
-
-func doRoundN2(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
-       //                                                                 ################################### RND N + 2 ##############################
-       ADDL(y2, old_h)       //                                           h = k + w + h + S0 + S1 + CH = t1 + S0
-       MOVL(f, y2)           //                                           y2 = f
-       RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
-       RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
-       XORL(g, y2)           //                                           y2 = f^g
-
-       XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
-       RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
-       ANDL(e, y2)          //                                            y2 = (f^g)&e
-       ADDL(y3, old_h)      //                                            h = t1 + S0 + MAJ
-
-       XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
-       RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
-       XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
-       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
-       MOVL(a, y3)           //                                           y3 = a
-
-       XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
-       RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
-       ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h
-       ORL(c, y3)                                                      // y3 = a|c
-
-       XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
-       MOVL(a, T1)  //                                                    T1 = a
-       ANDL(b, y3)  //                                                    y3 = (a|c)&b
-       ANDL(c, T1)  //                                                    T1 = a&c
-       ADDL(y0, y2) //                                                    y2 = S1 + CH
-
-       ADDL(h, d)  //                                                     d = k + w + h + d
-       ORL(T1, y3) //                                                     y3 = MAJ = (a|c)&b)|(a&c)
-       ADDL(y1, h) //                                                     h = k + w + h + S0
-
-       ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
-}
-
-func doRoundN3(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
-       //                                                                 ################################### RND N + 3 ###########################
-       ADDL(y2, old_h)       //                                           h = k + w + h + S0 + S1 + CH = t1 + S0
-       MOVL(f, y2)           //                                           y2 = f
-       RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
-       RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
-       XORL(g, y2)           //                                           y2 = f^g
-
-       XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
-       RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
-       ANDL(e, y2)          //                                            y2 = (f^g)&e
-       ADDL(y3, old_h)      //                                            h = t1 + S0 + MAJ
-
-       XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
-       RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
-       XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
-       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
-       MOVL(a, y3)           //                                           y3 = a
-
-       XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
-       RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
-       ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h
-       ORL(c, y3)                                                      // y3 = a|c
-
-       XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
-       MOVL(a, T1)  //                                                    T1 = a
-       ANDL(b, y3)  //                                                    y3 = (a|c)&b
-       ANDL(c, T1)  //                                                    T1 = a&c
-       ADDL(y0, y2) //                                                    y2 = S1 + CH
-
-       ADDL(h, d)  //                                                     d = k + w + h + d
-       ORL(T1, y3) //                                                     y3 = MAJ = (a|c)&b)|(a&c)
-       ADDL(y1, h) //                                                     h = k + w + h + S0
-
-       ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
-
-       ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0
-
-       ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
-}
-
-// Definitions for sha-ni version
-//
-// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2
-// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version
-//
-// Reference
-// S. Gulley, et al, "New Instructions Supporting the Secure Hash
-// Algorithm on Intel® Architecture Processors", July 2013
-// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
-//
-
-var (
-       digestPtr       GPPhysical  = RDI // input/output, base pointer to digest hash vector H0, H1, ..., H7
-       dataPtr                     = RSI // input, base pointer to first input data block
-       numBytes                    = RDX // input, number of input bytes to be processed
-       sha256Constants             = RAX // round contents from K256 table, indexed by round number x 32
-       msg             VecPhysical = X0  // input data
-       state0                      = X1  // round intermediates and outputs
-       state1                      = X2
-       m0                          = X3 //  m0, m1,... m4 -- round message temps
-       m1                          = X4
-       m2                          = X5
-       m3                          = X6
-       m4                          = X7
-       shufMask                    = X8  // input data endian conversion control mask
-       abefSave                    = X9  // digest hash vector inter-block buffer abef
-       cdghSave                    = X10 // digest hash vector inter-block buffer cdgh
-)
-
-// nop instead of final SHA256MSG1 for first and last few rounds
-func nop(m, a VecPhysical) {
-}
-
-// final SHA256MSG1 for middle rounds that require it
-func sha256msg1(m, a VecPhysical) {
-       SHA256MSG1(m, a)
-}
-
-// msg copy for all but rounds 12-15
-func vmov(a, b VecPhysical) {
-       VMOVDQA(a, b)
-}
-
-// reverse copy for rounds 12-15
-func vmovrev(a, b VecPhysical) {
-       VMOVDQA(b, a)
-}
-
-type VecFunc func(a, b VecPhysical)
-
-// sha rounds 0 to 11
-//
-// identical with the exception of the final msg op
-// which is replaced with a nop for rounds where it is not needed
-// refer to Gulley, et al for more information
-func rounds0to11(m, a VecPhysical, c int, sha256msg1 VecFunc) {
-       VMOVDQU(Mem{Base: dataPtr}.Offset(c*16), msg)
-       PSHUFB(shufMask, msg)
-       VMOVDQA(msg, m)
-       PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg)
-       SHA256RNDS2(msg, state0, state1)
-       PSHUFD(U8(0x0e), msg, msg)
-       SHA256RNDS2(msg, state1, state0)
-       sha256msg1(m, a)
-}
-
-// sha rounds 12 to 59
-//
-// identical with the exception of the final msg op
-// and the reverse copy(m,msg) in round 12 which is required
-// after the last data load
-// refer to Gulley, et al for more information
-func rounds12to59(m VecPhysical, c int, a, t VecPhysical, sha256msg1, movop VecFunc) {
-       movop(m, msg)
-       PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg)
-       SHA256RNDS2(msg, state0, state1)
-       VMOVDQA(m, m4)
-       PALIGNR(Imm(4), a, m4)
-       PADDD(m4, t)
-       SHA256MSG2(m, t)
-       PSHUFD(Imm(0x0e), msg, msg)
-       SHA256RNDS2(msg, state1, state0)
-       sha256msg1(m, a)
-}
-
-func block() {
-       Implement("block")
-       AllocLocal(536)
-
-       checkArchFlags()
-       sha256()
-       avx2()
-       sha_ni()
-}
-
-func checkArchFlags() {
-       CMPB(Mem{Symbol: Symbol{Name: "·useSHA"}, Base: StaticBase}, Imm(1))
-       JE(LabelRef("sha_ni"))
-       CMPB(Mem{Symbol: Symbol{Name: "·useAVX2"}, Base: StaticBase}, Imm(1))
-       JE(LabelRef("avx2"))
-}
-
-func sha256() {
        Load(Param("p").Base(), RSI)
        Load(Param("p").Len(), RDX)
        SHRQ(Imm(6), RDX)
@@ -770,356 +252,6 @@ func end() {
        RET()
 }
 
-func avx2() {
-       Label("avx2")
-       Load(Param("dig"), CTX) // d.h[8]
-       Load(Param("p").Base(), INP)
-       Load(Param("p").Len(), NUM_BYTES)
-
-       LEAQ(Mem{Base: INP, Index: NUM_BYTES, Scale: 1, Disp: -64}, NUM_BYTES) // Pointer to the last block
-       MOVQ(NUM_BYTES, Mem{Base: SP}.Offset(_INP_END))
-
-       CMPQ(NUM_BYTES, INP)
-       JE(LabelRef("avx2_only_one_block"))
-
-       Comment("Load initial digest")
-       CTX := Mem{Base: CTX}
-       MOVL(CTX.Offset(0), a)  //  a = H0
-       MOVL(CTX.Offset(4), b)  //  b = H1
-       MOVL(CTX.Offset(8), c)  //  c = H2
-       MOVL(CTX.Offset(12), d) //  d = H3
-       MOVL(CTX.Offset(16), e) //  e = H4
-       MOVL(CTX.Offset(20), f) //  f = H5
-       MOVL(CTX.Offset(24), g) //  g = H6
-       MOVL(CTX.Offset(28), h) //  h = H7
-
-       avx2_loop0()
-       avx2_last_block_enter()
-       avx2_loop1()
-       avx2_loop2()
-       avx2_loop3()
-       avx2_do_last_block()
-       avx2_only_one_block()
-       done_hash()
-}
-
-func avx2_loop0() {
-       Label("avx2_loop0")
-       Comment("at each iteration works with one block (512 bit)")
-       VMOVDQU(Mem{Base: INP}.Offset(0*32), XTMP0)
-       VMOVDQU(Mem{Base: INP}.Offset(1*32), XTMP1)
-       VMOVDQU(Mem{Base: INP}.Offset(2*32), XTMP2)
-       VMOVDQU(Mem{Base: INP}.Offset(3*32), XTMP3)
-
-       flip_mask := flip_mask_DATA()
-
-       VMOVDQU(flip_mask, BYTE_FLIP_MASK)
-
-       Comment("Apply Byte Flip Mask: LE -> BE")
-       VPSHUFB(BYTE_FLIP_MASK, XTMP0, XTMP0)
-       VPSHUFB(BYTE_FLIP_MASK, XTMP1, XTMP1)
-       VPSHUFB(BYTE_FLIP_MASK, XTMP2, XTMP2)
-       VPSHUFB(BYTE_FLIP_MASK, XTMP3, XTMP3)
-
-       Comment("Transpose data into high/low parts")
-       VPERM2I128(Imm(0x20), XTMP2, XTMP0, XDWORD0) //  w3,  w2,  w1,  w0
-       VPERM2I128(Imm(0x31), XTMP2, XTMP0, XDWORD1) //  w7,  w6,  w5,  w4
-       VPERM2I128(Imm(0x20), XTMP3, XTMP1, XDWORD2) // w11, w10,  w9,  w8
-       VPERM2I128(Imm(0x31), XTMP3, XTMP1, XDWORD3) // w15, w14, w13, w12
-
-       K256 := K256_DATA()
-       LEAQ(K256, TBL) // Loading address of table with round-specific constants
-}
-
-func avx2_last_block_enter() {
-       Label("avx2_last_block_enter")
-       ADDQ(Imm(64), INP)
-       MOVQ(INP, Mem{Base: SP}.Offset(_INP))
-       XORQ(SRND, SRND)
-}
-
-// for w0 - w47
-func avx2_loop1() {
-       Label("avx2_loop1")
-
-       Comment("Do 4 rounds and scheduling")
-       VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((0 * 32)), XDWORD0, XFER)
-       VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
-       roundAndSchedN0(_XFER+0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
-       roundAndSchedN1(_XFER+0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
-       roundAndSchedN2(_XFER+0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
-       roundAndSchedN3(_XFER+0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
-
-       Comment("Do 4 rounds and scheduling")
-       VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
-       VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
-       roundAndSchedN0(_XFER+1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
-       roundAndSchedN1(_XFER+1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
-       roundAndSchedN2(_XFER+1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
-       roundAndSchedN3(_XFER+1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
-
-       Comment("Do 4 rounds and scheduling")
-       VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((2 * 32)), XDWORD2, XFER)
-       VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+2*32))
-       roundAndSchedN0(_XFER+2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
-       roundAndSchedN1(_XFER+2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
-       roundAndSchedN2(_XFER+2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
-       roundAndSchedN3(_XFER+2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
-
-       Comment("Do 4 rounds and scheduling")
-       VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((3 * 32)), XDWORD3, XFER)
-       VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+3*32))
-       roundAndSchedN0(_XFER+3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
-       roundAndSchedN1(_XFER+3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
-       roundAndSchedN2(_XFER+3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
-       roundAndSchedN3(_XFER+3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
-
-       ADDQ(Imm(4*32), SRND)
-       CMPQ(SRND, U32(3*4*32))
-       JB(LabelRef("avx2_loop1"))
-}
-
-// w48 - w63 processed with no scheduling (last 16 rounds)
-func avx2_loop2() {
-       Label("avx2_loop2")
-       VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(0*32), XDWORD0, XFER)
-       VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
-       doRoundN0(_XFER+0*32, a, b, c, d, e, f, g, h, h)
-       doRoundN1(_XFER+0*32, h, a, b, c, d, e, f, g, h)
-       doRoundN2(_XFER+0*32, g, h, a, b, c, d, e, f, g)
-       doRoundN3(_XFER+0*32, f, g, h, a, b, c, d, e, f)
-
-       VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
-       VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
-       doRoundN0(_XFER+1*32, e, f, g, h, a, b, c, d, e)
-       doRoundN1(_XFER+1*32, d, e, f, g, h, a, b, c, d)
-       doRoundN2(_XFER+1*32, c, d, e, f, g, h, a, b, c)
-       doRoundN3(_XFER+1*32, b, c, d, e, f, g, h, a, b)
-
-       ADDQ(Imm(2*32), SRND)
-
-       VMOVDQU(XDWORD2, XDWORD0)
-       VMOVDQU(XDWORD3, XDWORD1)
-
-       CMPQ(SRND, U32(4*4*32))
-       JB(LabelRef("avx2_loop2"))
-
-       Load(Param("dig"), CTX) // d.h[8]
-       MOVQ(Mem{Base: SP}.Offset(_INP), INP)
-
-       registers := []GPPhysical{a, b, c, d, e, f, g, h}
-       for i, reg := range registers {
-               addm(Mem{Base: CTX}.Offset(i*4), reg)
-       }
-
-       CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
-       JB(LabelRef("done_hash"))
-
-       XORQ(SRND, SRND)
-}
-
-// Do second block using previously scheduled results
-func avx2_loop3() {
-       Label("avx2_loop3")
-       doRoundN0(_XFER+0*32+16, a, b, c, d, e, f, g, h, a)
-       doRoundN1(_XFER+0*32+16, h, a, b, c, d, e, f, g, h)
-       doRoundN2(_XFER+0*32+16, g, h, a, b, c, d, e, f, g)
-       doRoundN3(_XFER+0*32+16, f, g, h, a, b, c, d, e, f)
-
-       doRoundN0(_XFER+1*32+16, e, f, g, h, a, b, c, d, e)
-       doRoundN1(_XFER+1*32+16, d, e, f, g, h, a, b, c, d)
-       doRoundN2(_XFER+1*32+16, c, d, e, f, g, h, a, b, c)
-       doRoundN3(_XFER+1*32+16, b, c, d, e, f, g, h, a, b)
-
-       ADDQ(Imm(2*32), SRND)
-       CMPQ(SRND, U32(4*4*32))
-       JB(LabelRef("avx2_loop3"))
-
-       Load(Param("dig"), CTX) // d.h[8]
-       MOVQ(Mem{Base: SP}.Offset(_INP), INP)
-       ADDQ(Imm(64), INP)
-
-       registers := []GPPhysical{a, b, c, d, e, f, g, h}
-       for i, reg := range registers {
-               addm(Mem{Base: CTX}.Offset(i*4), reg)
-       }
-
-       CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
-       JA(LabelRef("avx2_loop0"))
-       JB(LabelRef("done_hash"))
-}
-
-func avx2_do_last_block() {
-       Label("avx2_do_last_block")
-       VMOVDQU(Mem{Base: INP}.Offset(0), XWORD0)
-       VMOVDQU(Mem{Base: INP}.Offset(16), XWORD1)
-       VMOVDQU(Mem{Base: INP}.Offset(32), XWORD2)
-       VMOVDQU(Mem{Base: INP}.Offset(48), XWORD3)
-
-       flip_mask := flip_mask_DATA()
-       VMOVDQU(flip_mask, BYTE_FLIP_MASK)
-
-       VPSHUFB(X_BYTE_FLIP_MASK, XWORD0, XWORD0)
-       VPSHUFB(X_BYTE_FLIP_MASK, XWORD1, XWORD1)
-       VPSHUFB(X_BYTE_FLIP_MASK, XWORD2, XWORD2)
-       VPSHUFB(X_BYTE_FLIP_MASK, XWORD3, XWORD3)
-
-       K256 := K256_DATA()
-       LEAQ(K256, TBL)
-
-       JMP(LabelRef("avx2_last_block_enter"))
-}
-
-// Load initial digest
-func avx2_only_one_block() {
-       Label("avx2_only_one_block")
-       registers := []GPPhysical{a, b, c, d, e, f, g, h}
-       for i, reg := range registers {
-               MOVL(Mem{Base: CTX}.Offset(i*4), reg)
-       }
-       JMP(LabelRef("avx2_do_last_block"))
-}
-
-func done_hash() {
-       Label("done_hash")
-       VZEROUPPER()
-       RET()
-}
-
-func sha_ni() {
-       Label("sha_ni")
-       Load(Param("dig"), digestPtr)    //                   init digest hash vector H0, H1,..., H7 pointer
-       Load(Param("p").Base(), dataPtr) //                   init input data base pointer
-       Load(Param("p").Len(), numBytes) //                   get number of input bytes to hash
-       SHRQ(Imm(6), numBytes)           //                   force modulo 64 input buffer length
-       SHLQ(Imm(6), numBytes)
-       CMPQ(numBytes, Imm(0)) //                             exit early for zero-length input buffer
-       JEQ(LabelRef("done"))
-       ADDQ(dataPtr, numBytes)                            // point numBytes to end of input buffer
-       VMOVDQU(Mem{Base: digestPtr}.Offset(0*16), state0) // load initial hash values and reorder
-       VMOVDQU(Mem{Base: digestPtr}.Offset(1*16), state1) // DCBA, HGFE -> ABEF, CDGH
-       PSHUFD(Imm(0xb1), state0, state0)                  // CDAB
-       PSHUFD(Imm(0x1b), state1, state1)                  // EFGH
-       VMOVDQA(state0, m4)
-       PALIGNR(Imm(8), state1, state0) //                    ABEF
-       PBLENDW(Imm(0xf0), m4, state1)  //                    CDGH
-       flip_mask := flip_mask_DATA()
-       VMOVDQA(flip_mask, shufMask)
-       LEAQ(K256_DATA(), sha256Constants)
-
-       roundLoop()
-       done()
-}
-
-func roundLoop() {
-       Label("roundLoop")
-       Comment("save hash values for addition after rounds")
-       VMOVDQA(state0, abefSave)
-       VMOVDQA(state1, cdghSave)
-
-       Comment("do rounds 0-59")
-       rounds0to11(m0, nil, 0, nop)       //                 0-3
-       rounds0to11(m1, m0, 1, sha256msg1) //                 4-7
-       rounds0to11(m2, m1, 2, sha256msg1) //                8-11
-       VMOVDQU(Mem{Base: dataPtr}.Offset(3*16), msg)
-       PSHUFB(shufMask, msg)
-       rounds12to59(m3, 3, m2, m0, sha256msg1, vmovrev) // 12-15
-       rounds12to59(m0, 4, m3, m1, sha256msg1, vmov)    // 16-19
-       rounds12to59(m1, 5, m0, m2, sha256msg1, vmov)    // 20-23
-       rounds12to59(m2, 6, m1, m3, sha256msg1, vmov)    // 24-27
-       rounds12to59(m3, 7, m2, m0, sha256msg1, vmov)    // 28-31
-       rounds12to59(m0, 8, m3, m1, sha256msg1, vmov)    // 32-35
-       rounds12to59(m1, 9, m0, m2, sha256msg1, vmov)    // 36-39
-       rounds12to59(m2, 10, m1, m3, sha256msg1, vmov)   // 40-43
-       rounds12to59(m3, 11, m2, m0, sha256msg1, vmov)   // 44-47
-       rounds12to59(m0, 12, m3, m1, sha256msg1, vmov)   // 48-51
-       rounds12to59(m1, 13, m0, m2, nop, vmov)          // 52-55
-       rounds12to59(m2, 14, m1, m3, nop, vmov)          // 56-59
-
-       Comment("do rounds 60-63")
-       VMOVDQA(m3, msg)
-       PADDD(Mem{Base: sha256Constants}.Offset(15*32), msg)
-       SHA256RNDS2(msg, state0, state1)
-       PSHUFD(Imm(0x0e), msg, msg)
-       SHA256RNDS2(msg, state1, state0)
-
-       Comment("add current hash values with previously saved")
-       PADDD(abefSave, state0)
-       PADDD(cdghSave, state1)
-
-       Comment("advance data pointer; loop until buffer empty")
-       ADDQ(Imm(64), dataPtr)
-       CMPQ(numBytes, dataPtr)
-       JNE(LabelRef("roundLoop"))
-
-       Comment("write hash values back in the correct order")
-       PSHUFD(Imm(0x1b), state0, state0)
-       PSHUFD(Imm(0xb1), state1, state1)
-       VMOVDQA(state0, m4)
-       PBLENDW(Imm(0xf0), state1, state0)
-       PALIGNR(Imm(8), m4, state1)
-       VMOVDQU(state0, Mem{Base: digestPtr}.Offset(0*16))
-       VMOVDQU(state1, Mem{Base: digestPtr}.Offset(1*16))
-}
-
-func done() {
-       Label("done")
-       RET()
-}
-
-/**~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~**/
-
-// Pointers for memoizing Data section symbols
-var flip_maskPtr, shuff_00BAPtr, shuff_DC00Ptr, K256Ptr *Mem
-
-// shuffle byte order from LE to BE
-func flip_mask_DATA() Mem {
-       if flip_maskPtr != nil {
-               return *flip_maskPtr
-       }
-
-       flip_mask := GLOBL("flip_mask", RODATA)
-       flip_maskPtr = &flip_mask
-
-       DATA(0x00, U64(0x0405060700010203))
-       DATA(0x08, U64(0x0c0d0e0f08090a0b))
-       DATA(0x10, U64(0x0405060700010203))
-       DATA(0x18, U64(0x0c0d0e0f08090a0b))
-       return flip_mask
-}
-
-// shuffle xBxA -> 00BA
-func shuff_00BA_DATA() Mem {
-       if shuff_00BAPtr != nil {
-               return *shuff_00BAPtr
-       }
-
-       shuff_00BA := GLOBL("shuff_00BA", RODATA)
-       shuff_00BAPtr = &shuff_00BA
-
-       DATA(0x00, U64(0x0b0a090803020100))
-       DATA(0x08, U64(0xFFFFFFFFFFFFFFFF))
-       DATA(0x10, U64(0x0b0a090803020100))
-       DATA(0x18, U64(0xFFFFFFFFFFFFFFFF))
-       return shuff_00BA
-}
-
-// shuffle xDxC -> DC00
-func shuff_DC00_DATA() Mem {
-       if shuff_DC00Ptr != nil {
-               return *shuff_DC00Ptr
-       }
-
-       shuff_DC00 := GLOBL("shuff_DC00", RODATA)
-       shuff_DC00Ptr = &shuff_DC00
-
-       DATA(0x00, U64(0xFFFFFFFFFFFFFFFF))
-       DATA(0x08, U64(0x0b0a090803020100))
-       DATA(0x10, U64(0xFFFFFFFFFFFFFFFF))
-       DATA(0x18, U64(0x0b0a090803020100))
-       return shuff_DC00
-}
-
 var _K = []uint32{
        0x428a2f98,
        0x71374491,
@@ -1186,29 +318,3 @@ var _K = []uint32{
        0xbef9a3f7,
        0xc67178f2,
 }
-
-// Round specific constants
-func K256_DATA() Mem {
-       if K256Ptr != nil {
-               return *K256Ptr
-       }
-
-       K256 := GLOBL("K256", NOPTR+RODATA)
-       K256Ptr = &K256
-
-       offset_idx := 0
-
-       for i := 0; i < len(_K); i += 4 {
-               DATA((offset_idx+0)*4, U32(_K[i+0])) // k1
-               DATA((offset_idx+1)*4, U32(_K[i+1])) // k2
-               DATA((offset_idx+2)*4, U32(_K[i+2])) // k3
-               DATA((offset_idx+3)*4, U32(_K[i+3])) // k4
-
-               DATA((offset_idx+4)*4, U32(_K[i+0])) // k1
-               DATA((offset_idx+5)*4, U32(_K[i+1])) // k2
-               DATA((offset_idx+6)*4, U32(_K[i+2])) // k3
-               DATA((offset_idx+7)*4, U32(_K[i+3])) // k4
-               offset_idx += 8
-       }
-       return K256
-}
diff --git a/src/crypto/sha256/_asm/sha256block_amd64_avx2.go b/src/crypto/sha256/_asm/sha256block_amd64_avx2.go
new file mode 100644 (file)
index 0000000..0e6f1c7
--- /dev/null
@@ -0,0 +1,725 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+       . "github.com/mmcloughlin/avo/build"
+       . "github.com/mmcloughlin/avo/operand"
+       . "github.com/mmcloughlin/avo/reg"
+)
+
+// The avx2-version is described in an Intel White-Paper:
+// "Fast SHA-256 Implementations on Intel Architecture Processors"
+// To find it, surf to http://www.intel.com/p/en_US/embedded
+// and search for that title.
+// AVX2 version by Intel, same algorithm as code in Linux kernel:
+// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
+// by
+//     James Guilford <james.guilford@intel.com>
+//     Kirk Yap <kirk.s.yap@intel.com>
+//     Tim Chen <tim.c.chen@linux.intel.com>
+
+func blockAVX2() {
+       Implement("blockAVX2")
+       AllocLocal(536)
+
+       Load(Param("dig"), CTX) // d.h[8]
+       Load(Param("p").Base(), INP)
+       Load(Param("p").Len(), NUM_BYTES)
+
+       LEAQ(Mem{Base: INP, Index: NUM_BYTES, Scale: 1, Disp: -64}, NUM_BYTES) // Pointer to the last block
+       MOVQ(NUM_BYTES, Mem{Base: SP}.Offset(_INP_END))
+
+       CMPQ(NUM_BYTES, INP)
+       JE(LabelRef("avx2_only_one_block"))
+
+       Comment("Load initial digest")
+       CTX := Mem{Base: CTX}
+       MOVL(CTX.Offset(0), a)  //  a = H0
+       MOVL(CTX.Offset(4), b)  //  b = H1
+       MOVL(CTX.Offset(8), c)  //  c = H2
+       MOVL(CTX.Offset(12), d) //  d = H3
+       MOVL(CTX.Offset(16), e) //  e = H4
+       MOVL(CTX.Offset(20), f) //  f = H5
+       MOVL(CTX.Offset(24), g) //  g = H6
+       MOVL(CTX.Offset(28), h) //  h = H7
+
+       avx2_loop0()
+       avx2_last_block_enter()
+       avx2_loop1()
+       avx2_loop2()
+       avx2_loop3()
+       avx2_do_last_block()
+       avx2_only_one_block()
+       done_hash()
+}
+
+func avx2_loop0() {
+       Label("avx2_loop0")
+       Comment("at each iteration works with one block (512 bit)")
+       VMOVDQU(Mem{Base: INP}.Offset(0*32), XTMP0)
+       VMOVDQU(Mem{Base: INP}.Offset(1*32), XTMP1)
+       VMOVDQU(Mem{Base: INP}.Offset(2*32), XTMP2)
+       VMOVDQU(Mem{Base: INP}.Offset(3*32), XTMP3)
+
+       flip_mask := flip_mask_DATA()
+
+       VMOVDQU(flip_mask, BYTE_FLIP_MASK)
+
+       Comment("Apply Byte Flip Mask: LE -> BE")
+       VPSHUFB(BYTE_FLIP_MASK, XTMP0, XTMP0)
+       VPSHUFB(BYTE_FLIP_MASK, XTMP1, XTMP1)
+       VPSHUFB(BYTE_FLIP_MASK, XTMP2, XTMP2)
+       VPSHUFB(BYTE_FLIP_MASK, XTMP3, XTMP3)
+
+       Comment("Transpose data into high/low parts")
+       VPERM2I128(Imm(0x20), XTMP2, XTMP0, XDWORD0) //  w3,  w2,  w1,  w0
+       VPERM2I128(Imm(0x31), XTMP2, XTMP0, XDWORD1) //  w7,  w6,  w5,  w4
+       VPERM2I128(Imm(0x20), XTMP3, XTMP1, XDWORD2) // w11, w10,  w9,  w8
+       VPERM2I128(Imm(0x31), XTMP3, XTMP1, XDWORD3) // w15, w14, w13, w12
+
+       K256 := K256_DATA()
+       LEAQ(K256, TBL) // Loading address of table with round-specific constants
+}
+
+func avx2_last_block_enter() {
+       Label("avx2_last_block_enter")
+       ADDQ(Imm(64), INP)
+       MOVQ(INP, Mem{Base: SP}.Offset(_INP))
+       XORQ(SRND, SRND)
+}
+
+// for w0 - w47
+func avx2_loop1() {
+       Label("avx2_loop1")
+
+       Comment("Do 4 rounds and scheduling")
+       VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((0 * 32)), XDWORD0, XFER)
+       VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
+       roundAndSchedN0(_XFER+0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+       roundAndSchedN1(_XFER+0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+       roundAndSchedN2(_XFER+0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+       roundAndSchedN3(_XFER+0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
+
+       Comment("Do 4 rounds and scheduling")
+       VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
+       VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
+       roundAndSchedN0(_XFER+1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+       roundAndSchedN1(_XFER+1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+       roundAndSchedN2(_XFER+1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+       roundAndSchedN3(_XFER+1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
+
+       Comment("Do 4 rounds and scheduling")
+       VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((2 * 32)), XDWORD2, XFER)
+       VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+2*32))
+       roundAndSchedN0(_XFER+2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+       roundAndSchedN1(_XFER+2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+       roundAndSchedN2(_XFER+2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+       roundAndSchedN3(_XFER+2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
+
+       Comment("Do 4 rounds and scheduling")
+       VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((3 * 32)), XDWORD3, XFER)
+       VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+3*32))
+       roundAndSchedN0(_XFER+3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
+       roundAndSchedN1(_XFER+3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
+       roundAndSchedN2(_XFER+3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
+       roundAndSchedN3(_XFER+3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
+
+       ADDQ(Imm(4*32), SRND)
+       CMPQ(SRND, U32(3*4*32))
+       JB(LabelRef("avx2_loop1"))
+}
+
+// w48 - w63 processed with no scheduling (last 16 rounds)
+func avx2_loop2() {
+       Label("avx2_loop2")
+       VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(0*32), XDWORD0, XFER)
+       VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
+       doRoundN0(_XFER+0*32, a, b, c, d, e, f, g, h, h)
+       doRoundN1(_XFER+0*32, h, a, b, c, d, e, f, g, h)
+       doRoundN2(_XFER+0*32, g, h, a, b, c, d, e, f, g)
+       doRoundN3(_XFER+0*32, f, g, h, a, b, c, d, e, f)
+
+       VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
+       VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
+       doRoundN0(_XFER+1*32, e, f, g, h, a, b, c, d, e)
+       doRoundN1(_XFER+1*32, d, e, f, g, h, a, b, c, d)
+       doRoundN2(_XFER+1*32, c, d, e, f, g, h, a, b, c)
+       doRoundN3(_XFER+1*32, b, c, d, e, f, g, h, a, b)
+
+       ADDQ(Imm(2*32), SRND)
+
+       VMOVDQU(XDWORD2, XDWORD0)
+       VMOVDQU(XDWORD3, XDWORD1)
+
+       CMPQ(SRND, U32(4*4*32))
+       JB(LabelRef("avx2_loop2"))
+
+       Load(Param("dig"), CTX) // d.h[8]
+       MOVQ(Mem{Base: SP}.Offset(_INP), INP)
+
+       registers := []GPPhysical{a, b, c, d, e, f, g, h}
+       for i, reg := range registers {
+               addm(Mem{Base: CTX}.Offset(i*4), reg)
+       }
+
+       CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
+       JB(LabelRef("done_hash"))
+
+       XORQ(SRND, SRND)
+}
+
+// Do second block using previously scheduled results
+func avx2_loop3() {
+       Label("avx2_loop3")
+       doRoundN0(_XFER+0*32+16, a, b, c, d, e, f, g, h, a)
+       doRoundN1(_XFER+0*32+16, h, a, b, c, d, e, f, g, h)
+       doRoundN2(_XFER+0*32+16, g, h, a, b, c, d, e, f, g)
+       doRoundN3(_XFER+0*32+16, f, g, h, a, b, c, d, e, f)
+
+       doRoundN0(_XFER+1*32+16, e, f, g, h, a, b, c, d, e)
+       doRoundN1(_XFER+1*32+16, d, e, f, g, h, a, b, c, d)
+       doRoundN2(_XFER+1*32+16, c, d, e, f, g, h, a, b, c)
+       doRoundN3(_XFER+1*32+16, b, c, d, e, f, g, h, a, b)
+
+       ADDQ(Imm(2*32), SRND)
+       CMPQ(SRND, U32(4*4*32))
+       JB(LabelRef("avx2_loop3"))
+
+       Load(Param("dig"), CTX) // d.h[8]
+       MOVQ(Mem{Base: SP}.Offset(_INP), INP)
+       ADDQ(Imm(64), INP)
+
+       registers := []GPPhysical{a, b, c, d, e, f, g, h}
+       for i, reg := range registers {
+               addm(Mem{Base: CTX}.Offset(i*4), reg)
+       }
+
+       CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
+       JA(LabelRef("avx2_loop0"))
+       JB(LabelRef("done_hash"))
+}
+
+func avx2_do_last_block() {
+       Label("avx2_do_last_block")
+       VMOVDQU(Mem{Base: INP}.Offset(0), XWORD0)
+       VMOVDQU(Mem{Base: INP}.Offset(16), XWORD1)
+       VMOVDQU(Mem{Base: INP}.Offset(32), XWORD2)
+       VMOVDQU(Mem{Base: INP}.Offset(48), XWORD3)
+
+       flip_mask := flip_mask_DATA()
+       VMOVDQU(flip_mask, BYTE_FLIP_MASK)
+
+       VPSHUFB(X_BYTE_FLIP_MASK, XWORD0, XWORD0)
+       VPSHUFB(X_BYTE_FLIP_MASK, XWORD1, XWORD1)
+       VPSHUFB(X_BYTE_FLIP_MASK, XWORD2, XWORD2)
+       VPSHUFB(X_BYTE_FLIP_MASK, XWORD3, XWORD3)
+
+       K256 := K256_DATA()
+       LEAQ(K256, TBL)
+
+       JMP(LabelRef("avx2_last_block_enter"))
+}
+
+// Load initial digest
+func avx2_only_one_block() {
+       Label("avx2_only_one_block")
+       registers := []GPPhysical{a, b, c, d, e, f, g, h}
+       for i, reg := range registers {
+               MOVL(Mem{Base: CTX}.Offset(i*4), reg)
+       }
+       JMP(LabelRef("avx2_do_last_block"))
+}
+
+func done_hash() {
+       Label("done_hash")
+       VZEROUPPER()
+       RET()
+}
+
+// addm (mem), reg
+//   - Add reg to mem using reg-mem add and store
+func addm(P1 Mem, P2 GPPhysical) {
+       ADDL(P2, P1)
+       MOVL(P1, P2)
+}
+
+var (
+       XDWORD0 VecPhysical = Y4
+       XDWORD1             = Y5
+       XDWORD2             = Y6
+       XDWORD3             = Y7
+
+       XWORD0 = X4
+       XWORD1 = X5
+       XWORD2 = X6
+       XWORD3 = X7
+
+       XTMP0 = Y0
+       XTMP1 = Y1
+       XTMP2 = Y2
+       XTMP3 = Y3
+       XTMP4 = Y8
+       XTMP5 = Y11
+
+       XFER = Y9
+
+       BYTE_FLIP_MASK   = Y13 // mask to convert LE -> BE
+       X_BYTE_FLIP_MASK = X13
+
+       NUM_BYTES GPPhysical = RDX
+       INP                  = RDI
+
+       CTX = RSI // Beginning of digest in memory (a, b, c, ... , h)
+
+       a = EAX
+       b = EBX
+       c = ECX
+       d = R8L
+       e = EDX
+       f = R9L
+       g = R10L
+       h = R11L
+
+       old_h = R11L
+
+       TBL = RBP
+
+       SRND = RSI // SRND is same register as CTX
+
+       T1 = R12L
+
+       y0 = R13L
+       y1 = R14L
+       y2 = R15L
+       y3 = EDI
+
+       // Offsets
+       XFER_SIZE    = 2 * 64 * 4
+       INP_END_SIZE = 8
+       INP_SIZE     = 8
+
+       _XFER      = 0
+       _INP_END   = _XFER + XFER_SIZE
+       _INP       = _INP_END + INP_END_SIZE
+       STACK_SIZE = _INP + INP_SIZE
+)
+
+func roundAndSchedN0(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
+       //                                                                 ################################### RND N + 0 ############################
+       MOVL(a, y3)           //                                           y3 = a
+       RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
+       RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
+
+       ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+       ORL(c, y3)                                                      // y3 = a|c
+       VPALIGNR(Imm(4), XDWORD2, XDWORD3, XTMP0)                       // XTMP0 = W[-7]
+       MOVL(f, y2)                                                     // y2 = f
+       RORXL(Imm(13), a, T1)                                           // T1 = a >> 13
+
+       XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11)
+       XORL(g, y2)                   //                                   y2 = f^g
+       VPADDD(XDWORD0, XTMP0, XTMP0) //                                   XTMP0 = W[-7] + W[-16]
+       RORXL(Imm(6), e, y1)          //                                   y1 = (e >> 6)
+
+       ANDL(e, y2)           //                                           y2 = (f^g)&e
+       XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
+       ADDL(h, d)            //                                           d = k + w + h + d
+
+       ANDL(b, y3)                               //                       y3 = (a|c)&b
+       VPALIGNR(Imm(4), XDWORD0, XDWORD1, XTMP1) //                       XTMP1 = W[-15]
+       XORL(T1, y1)                              //                       y1 = (a>>22) ^ (a>>13)
+       RORXL(Imm(2), a, T1)                      //                       T1 = (a >> 2)
+
+       XORL(g, y2)                  //                                    y2 = CH = ((f^g)&e)^g
+       VPSRLD(Imm(7), XTMP1, XTMP2) //
+       XORL(T1, y1)                 //                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+       MOVL(a, T1)                  //                                    T1 = a
+       ANDL(c, T1)                  //                                    T1 = a&c
+
+       ADDL(y0, y2)                    //                                 y2 = S1 + CH
+       VPSLLD(Imm(32-7), XTMP1, XTMP3) //
+       ORL(T1, y3)                     //                                 y3 = MAJ = ((a|c)&b)|(a&c)
+       ADDL(y1, h)                     //                                 h = k + w + h + S0
+
+       ADDL(y2, d)               //                                       d = k + w + h + d + S1 + CH = d + t1
+       VPOR(XTMP2, XTMP3, XTMP3) //                                       XTMP3 = W[-15] ror 7
+
+       VPSRLD(Imm(18), XTMP1, XTMP2)
+       ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0
+       ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
+}
+
+func roundAndSchedN1(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
+       //                                                                 ################################### RND N + 1 ############################
+       MOVL(a, y3)                                                     // y3 = a
+       RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
+       RORXL(Imm(11), e, y1)                                           // y1 = e >> 11
+       ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+       ORL(c, y3)                                                      // y3 = a|c
+
+       VPSRLD(Imm(3), XTMP1, XTMP4) //                                    XTMP4 = W[-15] >> 3
+       MOVL(f, y2)                  //                                    y2 = f
+       RORXL(Imm(13), a, T1)        //                                    T1 = a >> 13
+       XORL(y1, y0)                 //                                    y0 = (e>>25) ^ (e>>11)
+       XORL(g, y2)                  //                                    y2 = f^g
+
+       RORXL(Imm(6), e, y1)  //                                           y1 = (e >> 6)
+       XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
+       ANDL(e, y2)           //                                           y2 = (f^g)&e
+       ADDL(h, d)            //                                           d = k + w + h + d
+
+       VPSLLD(Imm(32-18), XTMP1, XTMP1)
+       ANDL(b, y3)  //                                                    y3 = (a|c)&b
+       XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13)
+
+       VPXOR(XTMP1, XTMP3, XTMP3)
+       RORXL(Imm(2), a, T1) //                                            T1 = (a >> 2)
+       XORL(g, y2)          //                                            y2 = CH = ((f^g)&e)^g
+
+       VPXOR(XTMP2, XTMP3, XTMP3) //                                      XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
+       XORL(T1, y1)               //                                      y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+       MOVL(a, T1)                //                                      T1 = a
+       ANDL(c, T1)                //                                      T1 = a&c
+       ADDL(y0, y2)               //                                      y2 = S1 + CH
+
+       VPXOR(XTMP4, XTMP3, XTMP1)         //                              XTMP1 = s0
+       VPSHUFD(Imm(0xFA), XDWORD3, XTMP2) //                              XTMP2 = W[-2] {BBAA}
+       ORL(T1, y3)                        //                              y3 = MAJ = ((a|c)&b)|(a&c)
+       ADDL(y1, h)                        //                              h = k + w + h + S0
+
+       VPADDD(XTMP1, XTMP0, XTMP0) //                                     XTMP0 = W[-16] + W[-7] + s0
+       ADDL(y2, d)                 //                                     d = k + w + h + d + S1 + CH = d + t1
+       ADDL(y2, h)                 //                                     h = k + w + h + S0 + S1 + CH = t1 + S0
+       ADDL(y3, h)                 //                                     h = t1 + S0 + MAJ
+
+       VPSRLD(Imm(10), XTMP2, XTMP4) //                                   XTMP4 = W[-2] >> 10 {BBAA}
+}
+
+func roundAndSchedN2(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
+       //                                                                 ################################### RND N + 2 ############################
+       var shuff_00BA Mem = shuff_00BA_DATA()
+
+       MOVL(a, y3)                                                     // y3 = a
+       RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
+       ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+
+       VPSRLQ(Imm(19), XTMP2, XTMP3) //                                   XTMP3 = W[-2] ror 19 {xBxA}
+       RORXL(Imm(11), e, y1)         //                                   y1 = e >> 11
+       ORL(c, y3)                    //                                   y3 = a|c
+       MOVL(f, y2)                   //                                   y2 = f
+       XORL(g, y2)                   //                                   y2 = f^g
+
+       RORXL(Imm(13), a, T1)         //                                   T1 = a >> 13
+       XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11)
+       VPSRLQ(Imm(17), XTMP2, XTMP2) //                                   XTMP2 = W[-2] ror 17 {xBxA}
+       ANDL(e, y2)                   //                                   y2 = (f^g)&e
+
+       RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
+       VPXOR(XTMP3, XTMP2, XTMP2)
+       ADDL(h, d)  //                                                     d = k + w + h + d
+       ANDL(b, y3) //                                                     y3 = (a|c)&b
+
+       XORL(y1, y0)               //                                      y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+       RORXL(Imm(22), a, y1)      //                                      y1 = a >> 22
+       VPXOR(XTMP2, XTMP4, XTMP4) //                                      XTMP4 = s1 {xBxA}
+       XORL(g, y2)                //                                      y2 = CH = ((f^g)&e)^g
+
+       VPSHUFB(shuff_00BA, XTMP4, XTMP4) //                               XTMP4 = s1 {00BA}
+
+       XORL(T1, y1)                //                                     y1 = (a>>22) ^ (a>>13)
+       RORXL(Imm(2), a, T1)        //                                     T1 = (a >> 2)
+       VPADDD(XTMP4, XTMP0, XTMP0) //                                     XTMP0 = {..., ..., W[1], W[0]}
+
+       XORL(T1, y1)                   //                                  y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+       MOVL(a, T1)                    //                                  T1 = a
+       ANDL(c, T1)                    //                                  T1 = a&c
+       ADDL(y0, y2)                   //                                  y2 = S1 + CH
+       VPSHUFD(Imm(80), XTMP0, XTMP2) //                                  XTMP2 = W[-2] {DDCC}
+
+       ORL(T1, y3) //                                                     y3 = MAJ = ((a|c)&b)|(a&c)
+       ADDL(y1, h) //                                                     h = k + w + h + S0
+       ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
+       ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0
+
+       ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
+}
+
+func roundAndSchedN3(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
+       //                                                                 ################################### RND N + 3 ############################
+       var shuff_DC00 Mem = shuff_DC00_DATA()
+
+       MOVL(a, y3)                                                     // y3 = a
+       RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
+       RORXL(Imm(11), e, y1)                                           // y1 = e >> 11
+       ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+       ORL(c, y3)                                                      // y3 = a|c
+
+       VPSRLD(Imm(10), XTMP2, XTMP5) //                                   XTMP5 = W[-2] >> 10 {DDCC}
+       MOVL(f, y2)                   //                                   y2 = f
+       RORXL(Imm(13), a, T1)         //                                   T1 = a >> 13
+       XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11)
+       XORL(g, y2)                   //                                   y2 = f^g
+
+       VPSRLQ(Imm(19), XTMP2, XTMP3) //                                   XTMP3 = W[-2] ror 19 {xDxC}
+       RORXL(Imm(6), e, y1)          //                                   y1 = (e >> 6)
+       ANDL(e, y2)                   //                                   y2 = (f^g)&e
+       ADDL(h, d)                    //                                   d = k + w + h + d
+       ANDL(b, y3)                   //                                   y3 = (a|c)&b
+
+       VPSRLQ(Imm(17), XTMP2, XTMP2) //                                   XTMP2 = W[-2] ror 17 {xDxC}
+       XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+       XORL(g, y2)                   //                                   y2 = CH = ((f^g)&e)^g
+
+       VPXOR(XTMP3, XTMP2, XTMP2)
+       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
+       ADDL(y0, y2)          //                                           y2 = S1 + CH
+
+       VPXOR(XTMP2, XTMP5, XTMP5) //                                      XTMP5 = s1 {xDxC}
+       XORL(T1, y1)               //                                      y1 = (a>>22) ^ (a>>13)
+       ADDL(y2, d)                //                                      d = k + w + h + d + S1 + CH = d + t1
+
+       RORXL(Imm(2), a, T1) //                                            T1 = (a >> 2)
+
+       VPSHUFB(shuff_DC00, XTMP5, XTMP5) //                               XTMP5 = s1 {DC00}
+
+       VPADDD(XTMP0, XTMP5, XDWORD0) //                                   XDWORD0 = {W[3], W[2], W[1], W[0]}
+       XORL(T1, y1)                  //                                   y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+       MOVL(a, T1)                   //                                   T1 = a
+       ANDL(c, T1)                   //                                   T1 = a&c
+       ORL(T1, y3)                   //                                   y3 = MAJ = ((a|c)&b)|(a&c)
+
+       ADDL(y1, h) //                                                     h = k + w + h + S0
+       ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0
+       ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
+}
+
+func doRoundN0(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
+       //                                                                 ################################### RND N + 0 ###########################
+       MOVL(f, y2)           //                                           y2 = f
+       RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
+       RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
+       XORL(g, y2)           //                                           y2 = f^g
+
+       XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
+       RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
+       ANDL(e, y2)          //                                            y2 = (f^g)&e
+
+       XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+       RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
+       XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
+       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
+       MOVL(a, y3)           //                                           y3 = a
+
+       XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
+       RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
+       ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+       ORL(c, y3)                                                      // y3 = a|c
+
+       XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+       MOVL(a, T1)  //                                                    T1 = a
+       ANDL(b, y3)  //                                                    y3 = (a|c)&b
+       ANDL(c, T1)  //                                                    T1 = a&c
+       ADDL(y0, y2) //                                                    y2 = S1 + CH
+
+       ADDL(h, d)  //                                                     d = k + w + h + d
+       ORL(T1, y3) //                                                     y3 = MAJ = ((a|c)&b)|(a&c)
+       ADDL(y1, h) //                                                     h = k + w + h + S0
+       ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
+}
+
+func doRoundN1(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
+       //                                                                 ################################### RND N + 1 ###########################
+       ADDL(y2, old_h)       //                                           h = k + w + h + S0 + S1 + CH = t1 + S0
+       MOVL(f, y2)           //                                           y2 = f
+       RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
+       RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
+       XORL(g, y2)           //                                           y2 = f^g
+
+       XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
+       RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
+       ANDL(e, y2)          //                                            y2 = (f^g)&e
+       ADDL(y3, old_h)      //                                            h = t1 + S0 + MAJ
+
+       XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+       RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
+       XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
+       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
+       MOVL(a, y3)           //                                           y3 = a
+
+       XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
+       RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
+       ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+       ORL(c, y3)                                                      // y3 = a|c
+
+       XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+       MOVL(a, T1)  //                                                    T1 = a
+       ANDL(b, y3)  //                                                    y3 = (a|c)&b
+       ANDL(c, T1)  //                                                    T1 = a&c
+       ADDL(y0, y2) //                                                    y2 = S1 + CH
+
+       ADDL(h, d)  //                                                     d = k + w + h + d
+       ORL(T1, y3) //                                                     y3 = MAJ = ((a|c)&b)|(a&c)
+       ADDL(y1, h) //                                                     h = k + w + h + S0
+
+       ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
+}
+
+func doRoundN2(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
+       //                                                                 ################################### RND N + 2 ##############################
+       ADDL(y2, old_h)       //                                           h = k + w + h + S0 + S1 + CH = t1 + S0
+       MOVL(f, y2)           //                                           y2 = f
+       RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
+       RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
+       XORL(g, y2)           //                                           y2 = f^g
+
+       XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
+       RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
+       ANDL(e, y2)          //                                            y2 = (f^g)&e
+       ADDL(y3, old_h)      //                                            h = t1 + S0 + MAJ
+
+       XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+       RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
+       XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
+       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
+       MOVL(a, y3)           //                                           y3 = a
+
+       XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
+       RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
+       ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+       ORL(c, y3)                                                      // y3 = a|c
+
+       XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+       MOVL(a, T1)  //                                                    T1 = a
+       ANDL(b, y3)  //                                                    y3 = (a|c)&b
+       ANDL(c, T1)  //                                                    T1 = a&c
+       ADDL(y0, y2) //                                                    y2 = S1 + CH
+
+       ADDL(h, d)  //                                                     d = k + w + h + d
+       ORL(T1, y3) //                                                     y3 = MAJ = ((a|c)&b)|(a&c)
+       ADDL(y1, h) //                                                     h = k + w + h + S0
+
+       ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
+}
+
+func doRoundN3(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
+       //                                                                 ################################### RND N + 3 ###########################
+       ADDL(y2, old_h)       //                                           h = k + w + h + S0 + S1 + CH = t1 + S0
+       MOVL(f, y2)           //                                           y2 = f
+       RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
+       RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
+       XORL(g, y2)           //                                           y2 = f^g
+
+       XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
+       RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
+       ANDL(e, y2)          //                                            y2 = (f^g)&e
+       ADDL(y3, old_h)      //                                            h = t1 + S0 + MAJ
+
+       XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
+       RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
+       XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
+       RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
+       MOVL(a, y3)           //                                           y3 = a
+
+       XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
+       RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
+       ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h
+       ORL(c, y3)                                                      // y3 = a|c
+
+       XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
+       MOVL(a, T1)  //                                                    T1 = a
+       ANDL(b, y3)  //                                                    y3 = (a|c)&b
+       ANDL(c, T1)  //                                                    T1 = a&c
+       ADDL(y0, y2) //                                                    y2 = S1 + CH
+
+       ADDL(h, d)  //                                                     d = k + w + h + d
+       ORL(T1, y3) //                                                     y3 = MAJ = ((a|c)&b)|(a&c)
+       ADDL(y1, h) //                                                     h = k + w + h + S0
+
+       ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
+
+       ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0
+
+       ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
+}
+
+// Pointers for memoizing Data section symbols
+var flip_maskPtr, shuff_00BAPtr, shuff_DC00Ptr, K256Ptr *Mem
+
+// shuffle byte order from LE to BE
+func flip_mask_DATA() Mem {
+       if flip_maskPtr != nil {
+               return *flip_maskPtr
+       }
+
+       flip_mask := GLOBL("flip_mask", RODATA)
+       flip_maskPtr = &flip_mask
+
+       DATA(0x00, U64(0x0405060700010203))
+       DATA(0x08, U64(0x0c0d0e0f08090a0b))
+       DATA(0x10, U64(0x0405060700010203))
+       DATA(0x18, U64(0x0c0d0e0f08090a0b))
+       return flip_mask
+}
+
+// shuffle xBxA -> 00BA
+func shuff_00BA_DATA() Mem {
+       if shuff_00BAPtr != nil {
+               return *shuff_00BAPtr
+       }
+
+       shuff_00BA := GLOBL("shuff_00BA", RODATA)
+       shuff_00BAPtr = &shuff_00BA
+
+       DATA(0x00, U64(0x0b0a090803020100))
+       DATA(0x08, U64(0xFFFFFFFFFFFFFFFF))
+       DATA(0x10, U64(0x0b0a090803020100))
+       DATA(0x18, U64(0xFFFFFFFFFFFFFFFF))
+       return shuff_00BA
+}
+
+// shuffle xDxC -> DC00
+func shuff_DC00_DATA() Mem {
+       if shuff_DC00Ptr != nil {
+               return *shuff_DC00Ptr
+       }
+
+       shuff_DC00 := GLOBL("shuff_DC00", RODATA)
+       shuff_DC00Ptr = &shuff_DC00
+
+       DATA(0x00, U64(0xFFFFFFFFFFFFFFFF))
+       DATA(0x08, U64(0x0b0a090803020100))
+       DATA(0x10, U64(0xFFFFFFFFFFFFFFFF))
+       DATA(0x18, U64(0x0b0a090803020100))
+       return shuff_DC00
+}
+
+// Round specific constants
+func K256_DATA() Mem {
+       if K256Ptr != nil {
+               return *K256Ptr
+       }
+
+       K256 := GLOBL("K256", NOPTR+RODATA)
+       K256Ptr = &K256
+
+       offset_idx := 0
+
+       for i := 0; i < len(_K); i += 4 {
+               DATA((offset_idx+0)*4, U32(_K[i+0])) // k1
+               DATA((offset_idx+1)*4, U32(_K[i+1])) // k2
+               DATA((offset_idx+2)*4, U32(_K[i+2])) // k3
+               DATA((offset_idx+3)*4, U32(_K[i+3])) // k4
+
+               DATA((offset_idx+4)*4, U32(_K[i+0])) // k1
+               DATA((offset_idx+5)*4, U32(_K[i+1])) // k2
+               DATA((offset_idx+6)*4, U32(_K[i+2])) // k3
+               DATA((offset_idx+7)*4, U32(_K[i+3])) // k4
+               offset_idx += 8
+       }
+       return K256
+}
diff --git a/src/crypto/sha256/_asm/sha256block_amd64_shani.go b/src/crypto/sha256/_asm/sha256block_amd64_shani.go
new file mode 100644 (file)
index 0000000..423e862
--- /dev/null
@@ -0,0 +1,174 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+       . "github.com/mmcloughlin/avo/build"
+       . "github.com/mmcloughlin/avo/operand"
+       . "github.com/mmcloughlin/avo/reg"
+)
+
+// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2
+// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version
+//
+// Reference
+// S. Gulley, et al, "New Instructions Supporting the Secure Hash
+// Algorithm on Intel® Architecture Processors", July 2013
+// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
+
+func blockSHANI() {
+       Implement("blockSHANI")
+       Load(Param("dig"), digestPtr)    //                   init digest hash vector H0, H1,..., H7 pointer
+       Load(Param("p").Base(), dataPtr) //                   init input data base pointer
+       Load(Param("p").Len(), numBytes) //                   get number of input bytes to hash
+       SHRQ(Imm(6), numBytes)           //                   force modulo 64 input buffer length
+       SHLQ(Imm(6), numBytes)
+       CMPQ(numBytes, Imm(0)) //                             exit early for zero-length input buffer
+       JEQ(LabelRef("done"))
+       ADDQ(dataPtr, numBytes)                            // point numBytes to end of input buffer
+       VMOVDQU(Mem{Base: digestPtr}.Offset(0*16), state0) // load initial hash values and reorder
+       VMOVDQU(Mem{Base: digestPtr}.Offset(1*16), state1) // DCBA, HGFE -> ABEF, CDGH
+       PSHUFD(Imm(0xb1), state0, state0)                  // CDAB
+       PSHUFD(Imm(0x1b), state1, state1)                  // EFGH
+       VMOVDQA(state0, m4)
+       PALIGNR(Imm(8), state1, state0) //                    ABEF
+       PBLENDW(Imm(0xf0), m4, state1)  //                    CDGH
+       flip_mask := flip_mask_DATA()
+       VMOVDQA(flip_mask, shufMask)
+       LEAQ(K256_DATA(), sha256Constants)
+
+       roundLoop()
+       done()
+}
+
+func roundLoop() {
+       Label("roundLoop")
+       Comment("save hash values for addition after rounds")
+       VMOVDQA(state0, abefSave)
+       VMOVDQA(state1, cdghSave)
+
+       Comment("do rounds 0-59")
+       rounds0to11(m0, nil, 0, nop)       //                 0-3
+       rounds0to11(m1, m0, 1, sha256msg1) //                 4-7
+       rounds0to11(m2, m1, 2, sha256msg1) //                8-11
+       VMOVDQU(Mem{Base: dataPtr}.Offset(3*16), msg)
+       PSHUFB(shufMask, msg)
+       rounds12to59(m3, 3, m2, m0, sha256msg1, vmovrev) // 12-15
+       rounds12to59(m0, 4, m3, m1, sha256msg1, vmov)    // 16-19
+       rounds12to59(m1, 5, m0, m2, sha256msg1, vmov)    // 20-23
+       rounds12to59(m2, 6, m1, m3, sha256msg1, vmov)    // 24-27
+       rounds12to59(m3, 7, m2, m0, sha256msg1, vmov)    // 28-31
+       rounds12to59(m0, 8, m3, m1, sha256msg1, vmov)    // 32-35
+       rounds12to59(m1, 9, m0, m2, sha256msg1, vmov)    // 36-39
+       rounds12to59(m2, 10, m1, m3, sha256msg1, vmov)   // 40-43
+       rounds12to59(m3, 11, m2, m0, sha256msg1, vmov)   // 44-47
+       rounds12to59(m0, 12, m3, m1, sha256msg1, vmov)   // 48-51
+       rounds12to59(m1, 13, m0, m2, nop, vmov)          // 52-55
+       rounds12to59(m2, 14, m1, m3, nop, vmov)          // 56-59
+
+       Comment("do rounds 60-63")
+       VMOVDQA(m3, msg)
+       PADDD(Mem{Base: sha256Constants}.Offset(15*32), msg)
+       SHA256RNDS2(msg, state0, state1)
+       PSHUFD(Imm(0x0e), msg, msg)
+       SHA256RNDS2(msg, state1, state0)
+
+       Comment("add current hash values with previously saved")
+       PADDD(abefSave, state0)
+       PADDD(cdghSave, state1)
+
+       Comment("advance data pointer; loop until buffer empty")
+       ADDQ(Imm(64), dataPtr)
+       CMPQ(numBytes, dataPtr)
+       JNE(LabelRef("roundLoop"))
+
+       Comment("write hash values back in the correct order")
+       PSHUFD(Imm(0x1b), state0, state0)
+       PSHUFD(Imm(0xb1), state1, state1)
+       VMOVDQA(state0, m4)
+       PBLENDW(Imm(0xf0), state1, state0)
+       PALIGNR(Imm(8), m4, state1)
+       VMOVDQU(state0, Mem{Base: digestPtr}.Offset(0*16))
+       VMOVDQU(state1, Mem{Base: digestPtr}.Offset(1*16))
+}
+
+func done() {
+       Label("done")
+       RET()
+}
+
+var (
+       digestPtr       GPPhysical  = RDI // input/output, base pointer to digest hash vector H0, H1, ..., H7
+       dataPtr                     = RSI // input, base pointer to first input data block
+       numBytes                    = RDX // input, number of input bytes to be processed
+       sha256Constants             = RAX // round contents from K256 table, indexed by round number x 32
+       msg             VecPhysical = X0  // input data
+       state0                      = X1  // round intermediates and outputs
+       state1                      = X2
+       m0                          = X3 //  m0, m1,... m4 -- round message temps
+       m1                          = X4
+       m2                          = X5
+       m3                          = X6
+       m4                          = X7
+       shufMask                    = X8  // input data endian conversion control mask
+       abefSave                    = X9  // digest hash vector inter-block buffer abef
+       cdghSave                    = X10 // digest hash vector inter-block buffer cdgh
+)
+
+// nop instead of final SHA256MSG1 for first and last few rounds
+func nop(m, a VecPhysical) {
+}
+
+// final SHA256MSG1 for middle rounds that require it
+func sha256msg1(m, a VecPhysical) {
+       SHA256MSG1(m, a)
+}
+
+// msg copy for all but rounds 12-15
+func vmov(a, b VecPhysical) {
+       VMOVDQA(a, b)
+}
+
+// reverse copy for rounds 12-15
+func vmovrev(a, b VecPhysical) {
+       VMOVDQA(b, a)
+}
+
+type VecFunc func(a, b VecPhysical)
+
+// sha rounds 0 to 11
+//
+// identical with the exception of the final msg op
+// which is replaced with a nop for rounds where it is not needed
+// refer to Gulley, et al for more information
+func rounds0to11(m, a VecPhysical, c int, sha256msg1 VecFunc) {
+       VMOVDQU(Mem{Base: dataPtr}.Offset(c*16), msg)
+       PSHUFB(shufMask, msg)
+       VMOVDQA(msg, m)
+       PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg)
+       SHA256RNDS2(msg, state0, state1)
+       PSHUFD(U8(0x0e), msg, msg)
+       SHA256RNDS2(msg, state1, state0)
+       sha256msg1(m, a)
+}
+
+// sha rounds 12 to 59
+//
+// identical with the exception of the final msg op
+// and the reverse copy(m,msg) in round 12 which is required
+// after the last data load
+// refer to Gulley, et al for more information
+func rounds12to59(m VecPhysical, c int, a, t VecPhysical, sha256msg1, movop VecFunc) {
+       movop(m, msg)
+       PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg)
+       SHA256RNDS2(msg, state0, state1)
+       VMOVDQA(m, m4)
+       PALIGNR(Imm(4), a, m4)
+       PADDD(m4, t)
+       SHA256MSG2(m, t)
+       PSHUFD(Imm(0x0e), msg, msg)
+       SHA256RNDS2(msg, state1, state0)
+       sha256msg1(m, a)
+}
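Both the AVX2 and SHA-NI generators are driven by the same avo main in sha256block_amd64_asm.go; the checked-in sha256block_amd64.s is presumably regenerated from the _asm directory with a command along the lines of go run . -out ../sha256block_amd64.s -pkg sha256 (the exact invocation is recorded in the header of the generated file).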
diff --git a/src/crypto/sha256/fallback_test.go b/src/crypto/sha256/fallback_test.go
deleted file mode 100644 (file)
index ceef3cc..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build s390x && !purego
-
-package sha256
-
-import (
-       "fmt"
-       "io"
-       "testing"
-)
-
-// Tests the fallback code path in case the optimized asm
-// implementation cannot be used.
-// See also TestBlockGeneric.
-func TestGenericPath(t *testing.T) {
-       if useAsm == false {
-               t.Skipf("assembly implementation unavailable")
-       }
-       useAsm = false
-       defer func() { useAsm = true }()
-       c := New()
-       in := "ΑΒΓΔΕϜΖΗΘΙΚΛΜΝΞΟΠϺϘΡΣΤΥΦΧΨΩ"
-       gold := "e93d84ec2b22383123be9f713697fb25" +
-               "338c86e2f7d8d1ddc2d89d332dd9d76c"
-       if _, err := io.WriteString(c, in); err != nil {
-               t.Fatalf("could not write to c: %v", err)
-       }
-       out := fmt.Sprintf("%x", c.Sum(nil))
-       if out != gold {
-               t.Fatalf("mismatch: got %s, wanted %s", out, gold)
-       }
-}
index fdd75a3f3e7e048bb4dc69a1bb7b1e5633c5e2ce..411f5ebf0206d19a61a9224f7ab5b8207d48dac7 100644 (file)
@@ -8,5 +8,25 @@ package sha256
 
 import "internal/cpu"
 
+//go:noescape
+func blockAMD64(dig *digest, p []byte)
+
 var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
-var useSHA = useAVX2 && cpu.X86.HasSHA
+
+//go:noescape
+func blockAVX2(dig *digest, p []byte)
+
+var useSHANI = useAVX2 && cpu.X86.HasSHA
+
+//go:noescape
+func blockSHANI(dig *digest, p []byte)
+
+func block(dig *digest, p []byte) {
+       if useSHANI {
+               blockSHANI(dig, p)
+       } else if useAVX2 {
+               blockAVX2(dig, p)
+       } else {
+               blockAMD64(dig, p)
+       }
+}
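With each implementation behind its own symbol, the three amd64 entry points can be cross-checked against blockGeneric from an in-package test. A hedged sketch (test name and structure are illustrative only, not part of this change):

func TestAMD64Variants(t *testing.T) {
	p := make([]byte, 2*BlockSize) // two blocks of deterministic input
	for i := range p {
		p[i] = byte(i)
	}
	variants := []struct {
		name string
		ok   bool
		f    func(*digest, []byte)
	}{
		{"AMD64", true, blockAMD64},
		{"AVX2", useAVX2, blockAVX2},
		{"SHANI", useSHANI, blockSHANI},
	}
	for _, v := range variants {
		if !v.ok {
			continue // CPU feature unavailable on this machine
		}
		want, got := new(digest), new(digest)
		want.Reset()
		got.Reset()
		blockGeneric(want, p)
		v.f(got, p)
		if want.h != got.h {
			t.Errorf("%s: block state differs from blockGeneric", v.name)
		}
	}
}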
index 700a4eff97ed16ca14803f617073455716381f9c..d073c5fe3054b0243a80e13abfd84ea3d2fd3928 100644 (file)
@@ -4,13 +4,8 @@
 
 #include "textflag.h"
 
-// func block(dig *digest, p []byte)
-// Requires: AVX, AVX2, BMI2, SHA, SSE2, SSE4.1, SSSE3
-TEXT ·block(SB), $536-32
-       CMPB ·useSHA+0(SB), $0x01
-       JE   sha_ni
-       CMPB ·useAVX2+0(SB), $0x01
-       JE   avx2
+// func blockAMD64(dig *digest, p []byte)
+TEXT ·blockAMD64(SB), $264-32
        MOVQ p_base+8(FP), SI
        MOVQ p_len+16(FP), DX
        SHRQ $0x06, DX
@@ -3495,7 +3490,9 @@ loop:
 end:
        RET
 
-avx2:
+// func blockAVX2(dig *digest, p []byte)
+// Requires: AVX, AVX2, BMI2
+TEXT ·blockAVX2(SB), $536-32
        MOVQ dig+0(FP), SI
        MOVQ p_base+8(FP), DI
        MOVQ p_len+16(FP), DX
@@ -4627,7 +4624,157 @@ done_hash:
        VZEROUPPER
        RET
 
-sha_ni:
+DATA flip_mask<>+0(SB)/8, $0x0405060700010203
+DATA flip_mask<>+8(SB)/8, $0x0c0d0e0f08090a0b
+DATA flip_mask<>+16(SB)/8, $0x0405060700010203
+DATA flip_mask<>+24(SB)/8, $0x0c0d0e0f08090a0b
+GLOBL flip_mask<>(SB), RODATA, $32
+
+DATA K256<>+0(SB)/4, $0x428a2f98
+DATA K256<>+4(SB)/4, $0x71374491
+DATA K256<>+8(SB)/4, $0xb5c0fbcf
+DATA K256<>+12(SB)/4, $0xe9b5dba5
+DATA K256<>+16(SB)/4, $0x428a2f98
+DATA K256<>+20(SB)/4, $0x71374491
+DATA K256<>+24(SB)/4, $0xb5c0fbcf
+DATA K256<>+28(SB)/4, $0xe9b5dba5
+DATA K256<>+32(SB)/4, $0x3956c25b
+DATA K256<>+36(SB)/4, $0x59f111f1
+DATA K256<>+40(SB)/4, $0x923f82a4
+DATA K256<>+44(SB)/4, $0xab1c5ed5
+DATA K256<>+48(SB)/4, $0x3956c25b
+DATA K256<>+52(SB)/4, $0x59f111f1
+DATA K256<>+56(SB)/4, $0x923f82a4
+DATA K256<>+60(SB)/4, $0xab1c5ed5
+DATA K256<>+64(SB)/4, $0xd807aa98
+DATA K256<>+68(SB)/4, $0x12835b01
+DATA K256<>+72(SB)/4, $0x243185be
+DATA K256<>+76(SB)/4, $0x550c7dc3
+DATA K256<>+80(SB)/4, $0xd807aa98
+DATA K256<>+84(SB)/4, $0x12835b01
+DATA K256<>+88(SB)/4, $0x243185be
+DATA K256<>+92(SB)/4, $0x550c7dc3
+DATA K256<>+96(SB)/4, $0x72be5d74
+DATA K256<>+100(SB)/4, $0x80deb1fe
+DATA K256<>+104(SB)/4, $0x9bdc06a7
+DATA K256<>+108(SB)/4, $0xc19bf174
+DATA K256<>+112(SB)/4, $0x72be5d74
+DATA K256<>+116(SB)/4, $0x80deb1fe
+DATA K256<>+120(SB)/4, $0x9bdc06a7
+DATA K256<>+124(SB)/4, $0xc19bf174
+DATA K256<>+128(SB)/4, $0xe49b69c1
+DATA K256<>+132(SB)/4, $0xefbe4786
+DATA K256<>+136(SB)/4, $0x0fc19dc6
+DATA K256<>+140(SB)/4, $0x240ca1cc
+DATA K256<>+144(SB)/4, $0xe49b69c1
+DATA K256<>+148(SB)/4, $0xefbe4786
+DATA K256<>+152(SB)/4, $0x0fc19dc6
+DATA K256<>+156(SB)/4, $0x240ca1cc
+DATA K256<>+160(SB)/4, $0x2de92c6f
+DATA K256<>+164(SB)/4, $0x4a7484aa
+DATA K256<>+168(SB)/4, $0x5cb0a9dc
+DATA K256<>+172(SB)/4, $0x76f988da
+DATA K256<>+176(SB)/4, $0x2de92c6f
+DATA K256<>+180(SB)/4, $0x4a7484aa
+DATA K256<>+184(SB)/4, $0x5cb0a9dc
+DATA K256<>+188(SB)/4, $0x76f988da
+DATA K256<>+192(SB)/4, $0x983e5152
+DATA K256<>+196(SB)/4, $0xa831c66d
+DATA K256<>+200(SB)/4, $0xb00327c8
+DATA K256<>+204(SB)/4, $0xbf597fc7
+DATA K256<>+208(SB)/4, $0x983e5152
+DATA K256<>+212(SB)/4, $0xa831c66d
+DATA K256<>+216(SB)/4, $0xb00327c8
+DATA K256<>+220(SB)/4, $0xbf597fc7
+DATA K256<>+224(SB)/4, $0xc6e00bf3
+DATA K256<>+228(SB)/4, $0xd5a79147
+DATA K256<>+232(SB)/4, $0x06ca6351
+DATA K256<>+236(SB)/4, $0x14292967
+DATA K256<>+240(SB)/4, $0xc6e00bf3
+DATA K256<>+244(SB)/4, $0xd5a79147
+DATA K256<>+248(SB)/4, $0x06ca6351
+DATA K256<>+252(SB)/4, $0x14292967
+DATA K256<>+256(SB)/4, $0x27b70a85
+DATA K256<>+260(SB)/4, $0x2e1b2138
+DATA K256<>+264(SB)/4, $0x4d2c6dfc
+DATA K256<>+268(SB)/4, $0x53380d13
+DATA K256<>+272(SB)/4, $0x27b70a85
+DATA K256<>+276(SB)/4, $0x2e1b2138
+DATA K256<>+280(SB)/4, $0x4d2c6dfc
+DATA K256<>+284(SB)/4, $0x53380d13
+DATA K256<>+288(SB)/4, $0x650a7354
+DATA K256<>+292(SB)/4, $0x766a0abb
+DATA K256<>+296(SB)/4, $0x81c2c92e
+DATA K256<>+300(SB)/4, $0x92722c85
+DATA K256<>+304(SB)/4, $0x650a7354
+DATA K256<>+308(SB)/4, $0x766a0abb
+DATA K256<>+312(SB)/4, $0x81c2c92e
+DATA K256<>+316(SB)/4, $0x92722c85
+DATA K256<>+320(SB)/4, $0xa2bfe8a1
+DATA K256<>+324(SB)/4, $0xa81a664b
+DATA K256<>+328(SB)/4, $0xc24b8b70
+DATA K256<>+332(SB)/4, $0xc76c51a3
+DATA K256<>+336(SB)/4, $0xa2bfe8a1
+DATA K256<>+340(SB)/4, $0xa81a664b
+DATA K256<>+344(SB)/4, $0xc24b8b70
+DATA K256<>+348(SB)/4, $0xc76c51a3
+DATA K256<>+352(SB)/4, $0xd192e819
+DATA K256<>+356(SB)/4, $0xd6990624
+DATA K256<>+360(SB)/4, $0xf40e3585
+DATA K256<>+364(SB)/4, $0x106aa070
+DATA K256<>+368(SB)/4, $0xd192e819
+DATA K256<>+372(SB)/4, $0xd6990624
+DATA K256<>+376(SB)/4, $0xf40e3585
+DATA K256<>+380(SB)/4, $0x106aa070
+DATA K256<>+384(SB)/4, $0x19a4c116
+DATA K256<>+388(SB)/4, $0x1e376c08
+DATA K256<>+392(SB)/4, $0x2748774c
+DATA K256<>+396(SB)/4, $0x34b0bcb5
+DATA K256<>+400(SB)/4, $0x19a4c116
+DATA K256<>+404(SB)/4, $0x1e376c08
+DATA K256<>+408(SB)/4, $0x2748774c
+DATA K256<>+412(SB)/4, $0x34b0bcb5
+DATA K256<>+416(SB)/4, $0x391c0cb3
+DATA K256<>+420(SB)/4, $0x4ed8aa4a
+DATA K256<>+424(SB)/4, $0x5b9cca4f
+DATA K256<>+428(SB)/4, $0x682e6ff3
+DATA K256<>+432(SB)/4, $0x391c0cb3
+DATA K256<>+436(SB)/4, $0x4ed8aa4a
+DATA K256<>+440(SB)/4, $0x5b9cca4f
+DATA K256<>+444(SB)/4, $0x682e6ff3
+DATA K256<>+448(SB)/4, $0x748f82ee
+DATA K256<>+452(SB)/4, $0x78a5636f
+DATA K256<>+456(SB)/4, $0x84c87814
+DATA K256<>+460(SB)/4, $0x8cc70208
+DATA K256<>+464(SB)/4, $0x748f82ee
+DATA K256<>+468(SB)/4, $0x78a5636f
+DATA K256<>+472(SB)/4, $0x84c87814
+DATA K256<>+476(SB)/4, $0x8cc70208
+DATA K256<>+480(SB)/4, $0x90befffa
+DATA K256<>+484(SB)/4, $0xa4506ceb
+DATA K256<>+488(SB)/4, $0xbef9a3f7
+DATA K256<>+492(SB)/4, $0xc67178f2
+DATA K256<>+496(SB)/4, $0x90befffa
+DATA K256<>+500(SB)/4, $0xa4506ceb
+DATA K256<>+504(SB)/4, $0xbef9a3f7
+DATA K256<>+508(SB)/4, $0xc67178f2
+GLOBL K256<>(SB), RODATA|NOPTR, $512
+
+DATA shuff_00BA<>+0(SB)/8, $0x0b0a090803020100
+DATA shuff_00BA<>+8(SB)/8, $0xffffffffffffffff
+DATA shuff_00BA<>+16(SB)/8, $0x0b0a090803020100
+DATA shuff_00BA<>+24(SB)/8, $0xffffffffffffffff
+GLOBL shuff_00BA<>(SB), RODATA, $32
+
+DATA shuff_DC00<>+0(SB)/8, $0xffffffffffffffff
+DATA shuff_DC00<>+8(SB)/8, $0x0b0a090803020100
+DATA shuff_DC00<>+16(SB)/8, $0xffffffffffffffff
+DATA shuff_DC00<>+24(SB)/8, $0x0b0a090803020100
+GLOBL shuff_DC00<>(SB), RODATA, $32
+
+// func blockSHANI(dig *digest, p []byte)
+// Requires: AVX, SHA, SSE2, SSE4.1, SSSE3
+TEXT ·blockSHANI(SB), $0-32
        MOVQ    dig+0(FP), DI
        MOVQ    p_base+8(FP), SI
        MOVQ    p_len+16(FP), DX
@@ -4823,151 +4970,3 @@ roundLoop:
 
 done:
        RET
-
-DATA flip_mask<>+0(SB)/8, $0x0405060700010203
-DATA flip_mask<>+8(SB)/8, $0x0c0d0e0f08090a0b
-DATA flip_mask<>+16(SB)/8, $0x0405060700010203
-DATA flip_mask<>+24(SB)/8, $0x0c0d0e0f08090a0b
-GLOBL flip_mask<>(SB), RODATA, $32
-
-DATA K256<>+0(SB)/4, $0x428a2f98
-DATA K256<>+4(SB)/4, $0x71374491
-DATA K256<>+8(SB)/4, $0xb5c0fbcf
-DATA K256<>+12(SB)/4, $0xe9b5dba5
-DATA K256<>+16(SB)/4, $0x428a2f98
-DATA K256<>+20(SB)/4, $0x71374491
-DATA K256<>+24(SB)/4, $0xb5c0fbcf
-DATA K256<>+28(SB)/4, $0xe9b5dba5
-DATA K256<>+32(SB)/4, $0x3956c25b
-DATA K256<>+36(SB)/4, $0x59f111f1
-DATA K256<>+40(SB)/4, $0x923f82a4
-DATA K256<>+44(SB)/4, $0xab1c5ed5
-DATA K256<>+48(SB)/4, $0x3956c25b
-DATA K256<>+52(SB)/4, $0x59f111f1
-DATA K256<>+56(SB)/4, $0x923f82a4
-DATA K256<>+60(SB)/4, $0xab1c5ed5
-DATA K256<>+64(SB)/4, $0xd807aa98
-DATA K256<>+68(SB)/4, $0x12835b01
-DATA K256<>+72(SB)/4, $0x243185be
-DATA K256<>+76(SB)/4, $0x550c7dc3
-DATA K256<>+80(SB)/4, $0xd807aa98
-DATA K256<>+84(SB)/4, $0x12835b01
-DATA K256<>+88(SB)/4, $0x243185be
-DATA K256<>+92(SB)/4, $0x550c7dc3
-DATA K256<>+96(SB)/4, $0x72be5d74
-DATA K256<>+100(SB)/4, $0x80deb1fe
-DATA K256<>+104(SB)/4, $0x9bdc06a7
-DATA K256<>+108(SB)/4, $0xc19bf174
-DATA K256<>+112(SB)/4, $0x72be5d74
-DATA K256<>+116(SB)/4, $0x80deb1fe
-DATA K256<>+120(SB)/4, $0x9bdc06a7
-DATA K256<>+124(SB)/4, $0xc19bf174
-DATA K256<>+128(SB)/4, $0xe49b69c1
-DATA K256<>+132(SB)/4, $0xefbe4786
-DATA K256<>+136(SB)/4, $0x0fc19dc6
-DATA K256<>+140(SB)/4, $0x240ca1cc
-DATA K256<>+144(SB)/4, $0xe49b69c1
-DATA K256<>+148(SB)/4, $0xefbe4786
-DATA K256<>+152(SB)/4, $0x0fc19dc6
-DATA K256<>+156(SB)/4, $0x240ca1cc
-DATA K256<>+160(SB)/4, $0x2de92c6f
-DATA K256<>+164(SB)/4, $0x4a7484aa
-DATA K256<>+168(SB)/4, $0x5cb0a9dc
-DATA K256<>+172(SB)/4, $0x76f988da
-DATA K256<>+176(SB)/4, $0x2de92c6f
-DATA K256<>+180(SB)/4, $0x4a7484aa
-DATA K256<>+184(SB)/4, $0x5cb0a9dc
-DATA K256<>+188(SB)/4, $0x76f988da
-DATA K256<>+192(SB)/4, $0x983e5152
-DATA K256<>+196(SB)/4, $0xa831c66d
-DATA K256<>+200(SB)/4, $0xb00327c8
-DATA K256<>+204(SB)/4, $0xbf597fc7
-DATA K256<>+208(SB)/4, $0x983e5152
-DATA K256<>+212(SB)/4, $0xa831c66d
-DATA K256<>+216(SB)/4, $0xb00327c8
-DATA K256<>+220(SB)/4, $0xbf597fc7
-DATA K256<>+224(SB)/4, $0xc6e00bf3
-DATA K256<>+228(SB)/4, $0xd5a79147
-DATA K256<>+232(SB)/4, $0x06ca6351
-DATA K256<>+236(SB)/4, $0x14292967
-DATA K256<>+240(SB)/4, $0xc6e00bf3
-DATA K256<>+244(SB)/4, $0xd5a79147
-DATA K256<>+248(SB)/4, $0x06ca6351
-DATA K256<>+252(SB)/4, $0x14292967
-DATA K256<>+256(SB)/4, $0x27b70a85
-DATA K256<>+260(SB)/4, $0x2e1b2138
-DATA K256<>+264(SB)/4, $0x4d2c6dfc
-DATA K256<>+268(SB)/4, $0x53380d13
-DATA K256<>+272(SB)/4, $0x27b70a85
-DATA K256<>+276(SB)/4, $0x2e1b2138
-DATA K256<>+280(SB)/4, $0x4d2c6dfc
-DATA K256<>+284(SB)/4, $0x53380d13
-DATA K256<>+288(SB)/4, $0x650a7354
-DATA K256<>+292(SB)/4, $0x766a0abb
-DATA K256<>+296(SB)/4, $0x81c2c92e
-DATA K256<>+300(SB)/4, $0x92722c85
-DATA K256<>+304(SB)/4, $0x650a7354
-DATA K256<>+308(SB)/4, $0x766a0abb
-DATA K256<>+312(SB)/4, $0x81c2c92e
-DATA K256<>+316(SB)/4, $0x92722c85
-DATA K256<>+320(SB)/4, $0xa2bfe8a1
-DATA K256<>+324(SB)/4, $0xa81a664b
-DATA K256<>+328(SB)/4, $0xc24b8b70
-DATA K256<>+332(SB)/4, $0xc76c51a3
-DATA K256<>+336(SB)/4, $0xa2bfe8a1
-DATA K256<>+340(SB)/4, $0xa81a664b
-DATA K256<>+344(SB)/4, $0xc24b8b70
-DATA K256<>+348(SB)/4, $0xc76c51a3
-DATA K256<>+352(SB)/4, $0xd192e819
-DATA K256<>+356(SB)/4, $0xd6990624
-DATA K256<>+360(SB)/4, $0xf40e3585
-DATA K256<>+364(SB)/4, $0x106aa070
-DATA K256<>+368(SB)/4, $0xd192e819
-DATA K256<>+372(SB)/4, $0xd6990624
-DATA K256<>+376(SB)/4, $0xf40e3585
-DATA K256<>+380(SB)/4, $0x106aa070
-DATA K256<>+384(SB)/4, $0x19a4c116
-DATA K256<>+388(SB)/4, $0x1e376c08
-DATA K256<>+392(SB)/4, $0x2748774c
-DATA K256<>+396(SB)/4, $0x34b0bcb5
-DATA K256<>+400(SB)/4, $0x19a4c116
-DATA K256<>+404(SB)/4, $0x1e376c08
-DATA K256<>+408(SB)/4, $0x2748774c
-DATA K256<>+412(SB)/4, $0x34b0bcb5
-DATA K256<>+416(SB)/4, $0x391c0cb3
-DATA K256<>+420(SB)/4, $0x4ed8aa4a
-DATA K256<>+424(SB)/4, $0x5b9cca4f
-DATA K256<>+428(SB)/4, $0x682e6ff3
-DATA K256<>+432(SB)/4, $0x391c0cb3
-DATA K256<>+436(SB)/4, $0x4ed8aa4a
-DATA K256<>+440(SB)/4, $0x5b9cca4f
-DATA K256<>+444(SB)/4, $0x682e6ff3
-DATA K256<>+448(SB)/4, $0x748f82ee
-DATA K256<>+452(SB)/4, $0x78a5636f
-DATA K256<>+456(SB)/4, $0x84c87814
-DATA K256<>+460(SB)/4, $0x8cc70208
-DATA K256<>+464(SB)/4, $0x748f82ee
-DATA K256<>+468(SB)/4, $0x78a5636f
-DATA K256<>+472(SB)/4, $0x84c87814
-DATA K256<>+476(SB)/4, $0x8cc70208
-DATA K256<>+480(SB)/4, $0x90befffa
-DATA K256<>+484(SB)/4, $0xa4506ceb
-DATA K256<>+488(SB)/4, $0xbef9a3f7
-DATA K256<>+492(SB)/4, $0xc67178f2
-DATA K256<>+496(SB)/4, $0x90befffa
-DATA K256<>+500(SB)/4, $0xa4506ceb
-DATA K256<>+504(SB)/4, $0xbef9a3f7
-DATA K256<>+508(SB)/4, $0xc67178f2
-GLOBL K256<>(SB), RODATA|NOPTR, $512
-
-DATA shuff_00BA<>+0(SB)/8, $0x0b0a090803020100
-DATA shuff_00BA<>+8(SB)/8, $0xffffffffffffffff
-DATA shuff_00BA<>+16(SB)/8, $0x0b0a090803020100
-DATA shuff_00BA<>+24(SB)/8, $0xffffffffffffffff
-GLOBL shuff_00BA<>(SB), RODATA, $32
-
-DATA shuff_DC00<>+0(SB)/8, $0xffffffffffffffff
-DATA shuff_DC00<>+8(SB)/8, $0x0b0a090803020100
-DATA shuff_DC00<>+16(SB)/8, $0xffffffffffffffff
-DATA shuff_DC00<>+24(SB)/8, $0x0b0a090803020100
-GLOBL shuff_DC00<>(SB), RODATA, $32
index 434b6f253d41c96ca60307f9c4b9c862971f6ce3..4bb873ac7571fcefbf6ec07fae5ddadb9be24d36 100644 (file)
@@ -8,16 +8,13 @@ package sha256
 
 import "internal/cpu"
 
-var k = _K
-
 //go:noescape
-func sha256block(h []uint32, p []byte, k []uint32)
+func blockSHA2(dig *digest, p []byte)
 
 func block(dig *digest, p []byte) {
-       if !cpu.ARM64.HasSHA2 {
-               blockGeneric(dig, p)
+       if cpu.ARM64.HasSHA2 {
+               blockSHA2(dig, p)
        } else {
-               h := dig.h[:]
-               sha256block(h, p, k)
+               blockGeneric(dig, p)
        }
 }
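On arm64 no new knob is needed: the SHA-2 path is already gated on internal/cpu, so it can presumably be disabled with the existing GODEBUG=cpu.sha2=off to exercise blockGeneric.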
index 6757310c34e34c4bfc57cbe3913856bbb9755f45..f6d19e35c66b67524ae1800a24ce5451f1264ebb 100644 (file)
        SHA256H2        V9.S4, V8, V3 \
        VMOV    V2.B16, V8.B16
 
-// func sha256block(h []uint32, p []byte, k []uint32)
-TEXT ·sha256block(SB),NOSPLIT,$0
-       MOVD    h_base+0(FP), R0                           // Hash value first address
-       MOVD    p_base+24(FP), R1                          // message first address
-       MOVD    k_base+48(FP), R2                          // k constants first address
-       MOVD    p_len+32(FP), R3                           // message length
+// func blockSHA2(dig *digest, p []byte)
+TEXT ·blockSHA2(SB),NOSPLIT,$0
+       MOVD    dig+0(FP), R0                              // Hash value first address
+       MOVD    p_base+8(FP), R1                           // message first address
+       MOVD    p_len+16(FP), R3                           // message length
+       MOVD    ·_K+0(SB), R2                              // k constants first address
        VLD1    (R0), [V0.S4, V1.S4]                       // load h(a,b,c,d,e,f,g,h)
        VLD1.P  64(R2), [V16.S4, V17.S4, V18.S4, V19.S4]
        VLD1.P  64(R2), [V20.S4, V21.S4, V22.S4, V23.S4]
similarity index 71%
rename from src/crypto/sha256/sha256block_decl.go
rename to src/crypto/sha256/sha256block_asm.go
index e79303938748ecb6905edd60d30249c6076d11e2..50e9615c5ef49a73a012cc2fbbfee4d5827df681 100644 (file)
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (386 || amd64 || loong64 || ppc64 || ppc64le || riscv64 || s390x) && !purego
+//go:build (386 || loong64 || riscv64) && !purego
 
 package sha256
 
diff --git a/src/crypto/sha256/sha256block_ppc64x.go b/src/crypto/sha256/sha256block_ppc64x.go
new file mode 100644 (file)
index 0000000..ae54375
--- /dev/null
@@ -0,0 +1,26 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (ppc64 || ppc64le) && !purego
+
+package sha256
+
+import "internal/godebug"
+
+// The POWER architecture doesn't have a way to turn off SHA-2 support at
+// runtime with GODEBUG=cpu.something=off, so introduce a new GODEBUG knob for
+// that. It's intentionally only checked at init() time, to avoid the
+// performance overhead of checking it on every block.
+var ppc64sha2 = godebug.New("#ppc64sha2").Value() != "off"
+
+//go:noescape
+func blockPOWER(dig *digest, p []byte)
+
+func block(dig *digest, p []byte) {
+       if ppc64sha2 {
+               blockPOWER(dig, p)
+       } else {
+               blockGeneric(dig, p)
+       }
+}
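The # prefix passed to godebug.New only marks the setting as undocumented; the key used in the GODEBUG environment variable omits it. On ppc64/ppc64le the assembly path can therefore be disabled with, for example, GODEBUG=ppc64sha2=off when running the crypto/sha256 tests, falling back to blockGeneric.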
index ba8fa623c11f020d289396b50de6baa75db7ce30..a5f40ff04a52366c7482cf42f87d9b37824b40bd 100644 (file)
@@ -284,8 +284,8 @@ GLOBL ·kcon(SB), RODATA, $1088
 #define VPERMLE(va,vb,vc,vt)
 #endif
 
-// func block(dig *digest, p []byte)
-TEXT ·block(SB),0,$0-32
+// func blockPOWER(dig *digest, p []byte)
+TEXT ·blockPOWER(SB),0,$0-32
        MOVD    dig+0(FP), CTX
        MOVD    p_base+8(FP), INP
        MOVD    p_len+16(FP), LEN
index 0a1dc5785d2e447d02010d2639403d0680c31c8b..2abebc98e98cc1ce145e5cd72f3e3107aecb2333 100644 (file)
@@ -8,4 +8,13 @@ package sha256
 
 import "internal/cpu"
 
-var useAsm = cpu.S390X.HasSHA256
+//go:noescape
+func blockS390X(dig *digest, p []byte)
+
+func block(dig *digest, p []byte) {
+       if cpu.S390X.HasSHA256 {
+               blockS390X(dig, p)
+       } else {
+               blockGeneric(dig, p)
+       }
+}
index 757d62f5125aa1c3717f0e38dfa9dbe230e9f4c5..6372d67738bac7498063b40e75aebda39f2d1fc1 100644 (file)
@@ -6,17 +6,12 @@
 
 #include "textflag.h"
 
-// func block(dig *digest, p []byte)
-TEXT ·block(SB), NOSPLIT|NOFRAME, $0-32
-       MOVBZ  ·useAsm(SB), R4
+// func blockS390X(dig *digest, p []byte)
+TEXT ·blockS390X(SB), NOSPLIT|NOFRAME, $0-32
        LMG    dig+0(FP), R1, R3            // R2 = &p[0], R3 = len(p)
        MOVBZ  $2, R0                       // SHA-256 function code
-       CMPBEQ R4, $0, generic
 
 loop:
        KIMD R0, R2      // compute intermediate message digest (KIMD)
        BVS  loop        // continue if interrupted
        RET
-
-generic:
-       BR ·blockGeneric(SB)
diff --git a/src/crypto/sha512/fallback_test.go b/src/crypto/sha512/fallback_test.go
deleted file mode 100644 (file)
index b55a4a5..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build s390x && !purego
-
-package sha512
-
-import (
-       "fmt"
-       "io"
-       "testing"
-)
-
-// Tests the fallback code path in case the optimized asm
-// implementation cannot be used.
-// See also TestBlockGeneric.
-func TestGenericPath(t *testing.T) {
-       if !useAsm {
-               t.Skipf("assembly implementation unavailable")
-       }
-       useAsm = false
-       defer func() { useAsm = true }()
-       c := New()
-       in := "ΑΒΓΔΕϜΖΗΘΙΚΛΜΝΞΟΠϺϘΡΣΤΥΦΧΨΩ"
-       gold := "6922e319366d677f34c504af31bfcb29" +
-               "e531c125ecd08679362bffbd6b6ebfb9" +
-               "0dcc27dfc1f3d3b16a16c0763cf43b91" +
-               "40bbf9bbb7233724e9a0c6655b185d76"
-       if _, err := io.WriteString(c, in); err != nil {
-               t.Fatalf("could not write to c: %v", err)
-       }
-       out := fmt.Sprintf("%x", c.Sum(nil))
-       if out != gold {
-               t.Fatalf("mismatch: got %s, wanted %s", out, gold)
-       }
-}
index 4e2793100a3199062f8421d68735208b48e1d1f8..d62eb9291741b4d39b8bd6545b74b02f06f0793a 100644 (file)
@@ -8,13 +8,13 @@ package sha512
 
 import "internal/cpu"
 
+//go:noescape
+func blockSHA512(dig *digest, p []byte)
+
 func block(dig *digest, p []byte) {
        if cpu.ARM64.HasSHA512 {
-               blockAsm(dig, p)
-               return
+               blockSHA512(dig, p)
+       } else {
+               blockGeneric(dig, p)
        }
-       blockGeneric(dig, p)
 }
-
-//go:noescape
-func blockAsm(dig *digest, p []byte)
index 25f3dbfe43d5e617d5e7cbba6158f259831d3d95..15242e4bbc7f5a576fc29266820554698823420d 100644 (file)
@@ -40,8 +40,8 @@
        VADD    i3.D2, i1.D2, i4.D2 \
        SHA512H2        i0.D2, i1, i3
 
-// func blockAsm(dig *digest, p []byte)
-TEXT ·blockAsm(SB),NOSPLIT,$0
+// func blockSHA512(dig *digest, p []byte)
+TEXT ·blockSHA512(SB),NOSPLIT,$0
        MOVD    dig+0(FP), R0
        MOVD    p_base+8(FP), R1
        MOVD    p_len+16(FP), R2
similarity index 75%
rename from src/crypto/sha512/sha512block_decl.go
rename to src/crypto/sha512/sha512block_asm.go
index b8a7854e4ddac075411e0801071b387789a2eddc..888804678e43854dff53dc68464f121d319b8801 100644 (file)
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (loong64 || ppc64 || ppc64le || riscv64 || s390x) && !purego
+//go:build (loong64 || riscv64) && !purego
 
 package sha512
 
diff --git a/src/crypto/sha512/sha512block_ppc64x.go b/src/crypto/sha512/sha512block_ppc64x.go
new file mode 100644 (file)
index 0000000..2f7793b
--- /dev/null
@@ -0,0 +1,26 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (ppc64 || ppc64le) && !purego
+
+package sha512
+
+import "internal/godebug"
+
+// The POWER architecture doesn't have a way to turn off SHA-512 support at
+// runtime with GODEBUG=cpu.something=off, so introduce a new GODEBUG knob for
+// that. It's intentionally only checked at init() time, to avoid the
+// performance overhead of checking it on every block.
+var ppc64sha512 = godebug.New("#ppc64sha512").Value() != "off"
+
+//go:noescape
+func blockPOWER(dig *digest, p []byte)
+
+func block(dig *digest, p []byte) {
+       if ppc64sha512 {
+               blockPOWER(dig, p)
+       } else {
+               blockGeneric(dig, p)
+       }
+}
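The sha512 knob mirrors the sha256 one: GODEBUG=ppc64sha512=off selects blockGeneric on POWER.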
index 87aab80903c9b8beb7a9f4a4debbf69cd011d06b..cccce227976366040a95667d1ff8a53c9d56c0df 100644 (file)
@@ -304,8 +304,8 @@ GLOBL ·kcon(SB), RODATA, $1312
        VADDUDM         S0, h, h; \
        VADDUDM         s1, xj, xj
 
-// func block(dig *digest, p []byte)
-TEXT ·block(SB),0,$0-32
+// func blockPOWER(dig *digest, p []byte)
+TEXT ·blockPOWER(SB),0,$0-32
        MOVD    dig+0(FP), CTX
        MOVD    p_base+8(FP), INP
        MOVD    p_len+16(FP), LEN
index d0f09ea9edb07289b653c53ad89dcadf8b722e10..2d1b9ed3db5bea73c717968a63f739b8b5221394 100644 (file)
@@ -8,4 +8,13 @@ package sha512
 
 import "internal/cpu"
 
-var useAsm = cpu.S390X.HasSHA512
+//go:noescape
+func blockS390X(dig *digest, p []byte)
+
+func block(dig *digest, p []byte) {
+       if cpu.S390X.HasSHA512 {
+               blockS390X(dig, p)
+       } else {
+               blockGeneric(dig, p)
+       }
+}
index 230bd414d38bd07703cf8a0d357df50930066f8b..bd3cd43967fae071b1e55899a9e61cb7a2920e36 100644 (file)
@@ -6,17 +6,12 @@
 
 #include "textflag.h"
 
-// func block(dig *digest, p []byte)
-TEXT ·block(SB), NOSPLIT|NOFRAME, $0-32
-       MOVBZ  ·useAsm(SB), R4
+// func blockS390X(dig *digest, p []byte)
+TEXT ·blockS390X(SB), NOSPLIT|NOFRAME, $0-32
        LMG    dig+0(FP), R1, R3            // R2 = &p[0], R3 = len(p)
        MOVBZ  $3, R0                       // SHA-512 function code
-       CMPBEQ R4, $0, generic
 
 loop:
        KIMD R0, R2      // compute intermediate message digest (KIMD)
        BVS  loop        // continue if interrupted
        RET
-
-generic:
-       BR ·blockGeneric(SB)