crypto/sha256: add sha-ni implementation

author Ted Painter <ted.painter@intel.com>

Thu, 26 May 2022 00:43:31 +0000 (20:43 -0400)

committer Gopher Robot <gobot@golang.org>

Fri, 31 Mar 2023 15:55:23 +0000 (15:55 +0000)
author Ted Painter <ted.painter@intel.com>
Thu, 26 May 2022 00:43:31 +0000 (20:43 -0400)
committer Gopher Robot <gobot@golang.org>
Fri, 31 Mar 2023 15:55:23 +0000 (15:55 +0000)
diff --git a/src/crypto/sha256/sha256block_amd64.go b/src/crypto/sha256/sha256block_amd64.go

index 27464e2c12f895ad880edec51f055c8ff3fa23f2..b5d2c9b574a2a4451aa96425ee6a8f8269b6cdad 100644 (file)
--- a/src/crypto/sha256/sha256block_amd64.go
+++ b/src/crypto/sha256/sha256block_amd64.go
@@ -7,3 +7,4 @@ package sha256
  import "internal/cpu"
  
  var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
+var useSHA = useAVX2 && cpu.X86.HasSHA
diff --git a/src/crypto/sha256/sha256block_amd64.s b/src/crypto/sha256/sha256block_amd64.s

index f6af47c50e9f061fa06711e068450d52dddfc996..03535fb51ca2cf4f1fc1261ad35e05c5334102c7 100644 (file)
--- a/src/crypto/sha256/sha256block_amd64.s
+++ b/src/crypto/sha256/sha256block_amd64.s
@@ -179,7 +179,7 @@
  
  #define XFER  Y9
  
-#define BYTE_FLIP_MASK         Y13 // mask to convert LE -> BE
+#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
  #define X_BYTE_FLIP_MASK X13
  
  #define NUM_BYTES DX
@@ -232,14 +232,14 @@
         RORXL    $13, a, T1;                  \ // T1 = a >> 13                 // S0B
         ;                                     \
         XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)                                       // S1
-       XORL     g, y2;                       \ // y2 = f^g                                     // CH
+       XORL     g, y2;                       \ // y2 = f^g     // CH
         VPADDD   XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-7] + W[-16]       // y1 = (e >> 6)        // S1
         RORXL    $6, e, y1;                   \ // y1 = (e >> 6)                                                // S1
         ;                                     \
         ANDL     e, y2;                       \ // y2 = (f^g)&e                         // CH
         XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)              // S1
         RORXL    $22, a, y1;                  \ // y1 = a >> 22                                                 // S0A
-       ADDL     h, d;                        \ // d = k + w + h + d                            // --
+       ADDL     h, d;                        \ // d = k + w + h + d    // --
         ;                                     \
         ANDL     b, y3;                       \ // y3 = (a|c)&b                                                 // MAJA
         VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
@@ -270,7 +270,7 @@
         MOVL    a, y3;                       \ // y3 = a                       // MAJA
         RORXL   $25, e, y0;                  \ // y0 = e >> 25                                  // S1A
         RORXL   $11, e, y1;                  \ // y1 = e >> 11                                  // S1B
-       ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h                         // --
+       ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h         // --
         ORL     c, y3;                       \ // y3 = a|c                                              // MAJA
         ;                                    \
         VPSRLD  $3, XTMP1, XTMP4;            \ // XTMP4 = W[-15] >> 3
@@ -316,7 +316,7 @@
         ;                                    \
         MOVL    a, y3;                       \ // y3 = a                                                        // MAJA
         RORXL   $25, e, y0;                  \ // y0 = e >> 25                                          // S1A
-       ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h                         // --
+       ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h                 // --
         ;                                    \
         VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xBxA}
         RORXL   $11, e, y1;                  \ // y1 = e >> 11                                          // S1B
@@ -495,7 +495,7 @@
         ;                                  \
         XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)                          // S0
         RORXL $2, a, T1;                   \ // T1 = (a >> 2)                                           // S0
-       ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h   // --
+       ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h   // --
         ORL   c, y3;                       \ // y3 = a|c                                                                // MAJA
         ;                                  \
         XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)         // S0
@@ -531,7 +531,7 @@
         ;                                  \
         XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)                          // S0
         RORXL $2, a, T1;                   \ // T1 = (a >> 2)                                           // S0
-       ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h   // --
+       ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h   // --
         ORL   c, y3;                       \ // y3 = a|c                                                                // MAJA
         ;                                  \
         XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)         // S0
@@ -550,9 +550,80 @@
         ;                                  \
         ADDL  y3, h                        // h = t1 + S0 + MAJ                                 // --
  
+// Definitions for sha-ni version
+//
+// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2
+// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version
+//
+// Reference
+// S. Gulley, et al, "New Instructions Supporting the Secure Hash
+// Algorithm on Intel® Architecture Processors", July 2013
+// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
+//
+
+#define digestPtr      DI      // input/output, base pointer to digest hash vector H0, H1, ..., H7
+#define dataPtr                SI      // input, base pointer to first input data block
+#define numBytes       DX      // input, number of input bytes to be processed
+#define sha256Constants        AX      // round contants from K256 table, indexed by round number x 32
+#define msg            X0      // input data
+#define state0         X1      // round intermediates and outputs
+#define state1         X2
+#define m0             X3      // m0, m1,... m4 -- round message temps
+#define m1             X4
+#define m2             X5
+#define m3             X6
+#define m4             X7
+#define shufMask       X8      // input data endian conversion control mask
+#define abefSave       X9      // digest hash vector inter-block buffer abef
+#define cdghSave       X10     // digest hash vector inter-block buffer cdgh
+
+#define nop(m,a)               // nop instead of final SHA256MSG1 for first and last few rounds
+
+#define sha256msg1(m,a) \      // final SHA256MSG1 for middle rounds that require it
+       SHA256MSG1              m, a
+
+#define vmov(a,b) \            // msg copy for all but rounds 12-15
+       VMOVDQA         a, b
+
+#define vmovrev(a,b) \         // reverse copy for rounds 12-15
+       VMOVDQA         b, a
+
+// sha rounds 0 to 11
+// identical with the exception of the final msg op
+// which is replaced with a nop for rounds where it is not needed
+// refer to Gulley, et al for more information
+#define rounds0to11(m,a,c,sha256Msg1)                          \
+       VMOVDQU                 c*16(dataPtr), msg              \
+       PSHUFB                  shufMask, msg                   \
+       VMOVDQA                 msg, m                          \
+       PADDD                   (c*32)(sha256Constants), msg    \
+       SHA256RNDS2             msg, state0, state1             \
+       PSHUFD                  $0x0e, msg, msg                 \
+       SHA256RNDS2             msg, state1, state0             \
+       sha256Msg1              (m,a)
+
+// sha rounds 12 to 59
+// identical with the exception of the final msg op
+// and the reverse copy(m,msg) in round 12 which is required
+// after the last data load
+// refer to Gulley, et al for more information
+#define rounds12to59(m,c,a,t,sha256Msg1,movop)                 \
+       movop                   (m,msg)                         \
+       PADDD                   (c*32)(sha256Constants), msg    \
+       SHA256RNDS2             msg, state0, state1             \
+       VMOVDQA                 m, m4                           \
+       PALIGNR                 $4, a, m4                       \
+       PADDD                   m4, t                           \
+       SHA256MSG2              m, t                            \
+       PSHUFD                  $0x0e, msg, msg                 \
+       SHA256RNDS2             msg, state1, state0             \
+       sha256Msg1              (m,a)
+
  TEXT ·block(SB), 0, $536-32
-       CMPB ·useAVX2(SB), $1
-       JE   avx2
+       CMPB    ·useSHA(SB), $1
+       JE      sha_ni
+       CMPB    ·useAVX2(SB), $1
+       JE      avx2
  
         MOVQ p_base+8(FP), SI
         MOVQ p_len+16(FP), DX
@@ -862,6 +933,77 @@ done_hash:
         VZEROUPPER
         RET
  
+sha_ni:
+       MOVQ            dig+0(FP), digestPtr            // init digest hash vector H0, H1,..., H7 pointer
+       MOVQ            p_base+8(FP), dataPtr           // init input data base pointer
+       MOVQ            p_len+16(FP), numBytes          // get number of input bytes to hash
+       SHRQ            $6, numBytes                    // force modulo 64 input buffer length
+       SHLQ            $6, numBytes
+       CMPQ            numBytes, $0                    // exit early for zero-length input buffer
+       JEQ             done
+       ADDQ            dataPtr, numBytes               // point numBytes to end of input buffer
+       VMOVDQU         (0*16)(digestPtr), state0       // load initial hash values and reorder
+       VMOVDQU         (1*16)(digestPtr), state1       // DCBA, HGFE -> ABEF, CDGH
+       PSHUFD          $0xb1, state0, state0           // CDAB
+       PSHUFD          $0x1b, state1, state1           // EFGH
+       VMOVDQA         state0, m4
+       PALIGNR         $8, state1, state0              // ABEF
+       PBLENDW         $0xf0, m4, state1               // CDGH
+       VMOVDQA         flip_mask<>(SB), shufMask
+       LEAQ            K256<>(SB), sha256Constants
+
+roundLoop:
+       // save hash values for addition after rounds
+       VMOVDQA         state0, abefSave
+       VMOVDQA         state1, cdghSave
+
+       // do rounds 0-59
+       rounds0to11     (m0,-,0,nop)                    // 0-3
+       rounds0to11     (m1,m0,1,sha256msg1)            // 4-7
+       rounds0to11     (m2,m1,2,sha256msg1)            // 8-11
+       VMOVDQU         (3*16)(dataPtr), msg
+       PSHUFB          shufMask, msg
+       rounds12to59    (m3,3,m2,m0,sha256msg1,vmovrev) // 12-15
+       rounds12to59    (m0,4,m3,m1,sha256msg1,vmov)    // 16-19
+       rounds12to59    (m1,5,m0,m2,sha256msg1,vmov)    // 20-23
+       rounds12to59    (m2,6,m1,m3,sha256msg1,vmov)    // 24-27
+       rounds12to59    (m3,7,m2,m0,sha256msg1,vmov)    // 28-31
+       rounds12to59    (m0,8,m3,m1,sha256msg1,vmov)    // 32-35
+       rounds12to59    (m1,9,m0,m2,sha256msg1,vmov)    // 36-39
+       rounds12to59    (m2,10,m1,m3,sha256msg1,vmov)   // 40-43
+       rounds12to59    (m3,11,m2,m0,sha256msg1,vmov)   // 44-47
+       rounds12to59    (m0,12,m3,m1,sha256msg1,vmov)   // 48-51
+       rounds12to59    (m1,13,m0,m2,nop,vmov)          // 52-55
+       rounds12to59    (m2,14,m1,m3,nop,vmov)          // 56-59
+
+       // do rounds 60-63
+       VMOVDQA         m3, msg
+       PADDD           (15*32)(sha256Constants), msg
+       SHA256RNDS2     msg, state0, state1
+       PSHUFD          $0x0e, msg, msg
+       SHA256RNDS2     msg, state1, state0
+
+       // add current hash values with previously saved
+       PADDD           abefSave, state0
+       PADDD           cdghSave, state1
+
+       // advance data pointer; loop until buffer empty
+       ADDQ            $64, dataPtr
+       CMPQ            numBytes, dataPtr
+       JNE             roundLoop
+
+       // write hash values back in the correct order
+       PSHUFD          $0x1b, state0, state0           // FEBA
+       PSHUFD          $0xb1, state1, state1           // DCHG
+       VMOVDQA         state0, m4
+       PBLENDW         $0xf0, state1, state0           // DCBA
+       PALIGNR         $8, m4, state1                  // HGFE
+       VMOVDQU         state0, (0*16)(digestPtr)
+       VMOVDQU         state1, (1*16)(digestPtr)
+
+done:
+       RET
+
  // shuffle byte order from LE to BE
  DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
  DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
author	Ted Painter <ted.painter@intel.com>
	Thu, 26 May 2022 00:43:31 +0000 (20:43 -0400)
committer	Gopher Robot <gobot@golang.org>
	Fri, 31 Mar 2023 15:55:23 +0000 (15:55 +0000)
src/crypto/sha256/sha256block_amd64.go		patch \| blob \| history
src/crypto/sha256/sha256block_amd64.s		patch \| blob \| history