]> Cypherpunks repositories - gostls13.git/commitdiff
crypto/sha256: add sha-ni implementation
authorTed Painter <ted.painter@intel.com>
Thu, 26 May 2022 00:43:31 +0000 (20:43 -0400)
committerGopher Robot <gobot@golang.org>
Fri, 31 Mar 2023 15:55:23 +0000 (15:55 +0000)
goos: linux
goarch: amd64
pkg: crypto/sha256
cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz
                    │  bench.old   │              bench.new              │
                    │    sec/op    │   sec/op     vs base                │
Hash8Bytes/New-4      169.20n ± 7%   65.40n ± 5%  -61.35% (p=0.000 n=10)
Hash8Bytes/Sum224-4   166.10n ± 3%   65.20n ± 8%  -60.74% (p=0.000 n=10)
Hash8Bytes/Sum256-4   168.50n ± 6%   63.58n ± 7%  -62.27% (p=0.000 n=10)
Hash1K/New-4          2275.5n ± 5%   618.5n ± 2%  -72.82% (p=0.000 n=10)
Hash1K/Sum224-4       2364.5n ± 1%   618.1n ± 1%  -73.86% (p=0.000 n=10)
Hash1K/Sum256-4       2338.5n ± 2%   613.0n ± 2%  -73.79% (p=0.000 n=10)
Hash8K/New-4          17.530µ ± 2%   4.501µ ± 1%  -74.33% (p=0.000 n=10)
Hash8K/Sum224-4       17.456µ ± 2%   4.505µ ± 1%  -74.19% (p=0.000 n=10)
Hash8K/Sum256-4       17.417µ ± 2%   4.504µ ± 1%  -74.14% (p=0.000 n=10)
geomean                1.897µ        564.3n       -70.25%

                    │  bench.old   │               bench.new                │
                    │     B/s      │      B/s       vs base                 │
Hash8Bytes/New-4      45.11Mi ± 6%   116.66Mi ± 5%  +158.62% (p=0.000 n=10)
Hash8Bytes/Sum224-4   45.92Mi ± 3%   117.04Mi ± 8%  +154.89% (p=0.000 n=10)
Hash8Bytes/Sum256-4   45.29Mi ± 6%   120.00Mi ± 7%  +164.99% (p=0.000 n=10)
Hash1K/New-4          429.2Mi ± 5%   1578.9Mi ± 2%  +267.92% (p=0.000 n=10)
Hash1K/Sum224-4       413.0Mi ± 1%   1579.8Mi ± 1%  +282.49% (p=0.000 n=10)
Hash1K/Sum256-4       417.6Mi ± 1%   1593.1Mi ± 2%  +281.53% (p=0.000 n=10)
Hash8K/New-4          445.7Mi ± 1%   1735.9Mi ± 1%  +289.50% (p=0.000 n=10)
Hash8K/Sum224-4       447.6Mi ± 2%   1734.5Mi ± 1%  +287.54% (p=0.000 n=10)
Hash8K/Sum256-4       448.6Mi ± 2%   1734.8Mi ± 1%  +286.75% (p=0.000 n=10)
geomean               204.3Mi         686.8Mi       +236.11%

                    │  bench.old   │              bench.new              │
                    │     B/op     │    B/op     vs base                 │
Hash8Bytes/New-4      0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum224-4   0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum256-4   0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash1K/New-4          0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash1K/Sum224-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash1K/Sum256-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8K/New-4          0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8K/Sum224-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8K/Sum256-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
geomean                          ²               +0.00%                ²
¹ all samples are equal
² summaries must be >0 to compute geomean

                    │  bench.old   │              bench.new              │
                    │  allocs/op   │ allocs/op   vs base                 │
Hash8Bytes/New-4      0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum224-4   0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8Bytes/Sum256-4   0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash1K/New-4          0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash1K/Sum224-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash1K/Sum256-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8K/New-4          0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8K/Sum224-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
Hash8K/Sum256-4       0.000 ± 0%     0.000 ± 0%       ~ (p=1.000 n=10) ¹
geomean                          ²               +0.00%                ²
¹ all samples are equal
² summaries must be >0 to compute geomean

Fixes #50543.

Change-Id: Ie9783647fe82f40fcbd91989a96a24f2d3d5b9a0
Reviewed-on: https://go-review.googlesource.com/c/go/+/408795
Reviewed-by: Paulo Gomes <paulo.gomes.uk@gmail.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Russ Cox <rsc@golang.org>
Reviewed-by: Alan Donovan <adonovan@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
src/crypto/sha256/sha256block_amd64.go
src/crypto/sha256/sha256block_amd64.s

index 27464e2c12f895ad880edec51f055c8ff3fa23f2..b5d2c9b574a2a4451aa96425ee6a8f8269b6cdad 100644 (file)
@@ -7,3 +7,4 @@ package sha256
 import "internal/cpu"
 
 var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
+var useSHA = useAVX2 && cpu.X86.HasSHA
index f6af47c50e9f061fa06711e068450d52dddfc996..03535fb51ca2cf4f1fc1261ad35e05c5334102c7 100644 (file)
 
 #define XFER  Y9
 
-#define BYTE_FLIP_MASK         Y13 // mask to convert LE -> BE
+#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
 #define X_BYTE_FLIP_MASK X13
 
 #define NUM_BYTES DX
        RORXL    $13, a, T1;                  \ // T1 = a >> 13                 // S0B
        ;                                     \
        XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)                                       // S1
-       XORL     g, y2;                       \ // y2 = f^g                                     // CH
+       XORL     g, y2;                       \ // y2 = f^g     // CH
        VPADDD   XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-7] + W[-16]       // y1 = (e >> 6)        // S1
        RORXL    $6, e, y1;                   \ // y1 = (e >> 6)                                                // S1
        ;                                     \
        ANDL     e, y2;                       \ // y2 = (f^g)&e                         // CH
        XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)              // S1
        RORXL    $22, a, y1;                  \ // y1 = a >> 22                                                 // S0A
-       ADDL     h, d;                        \ // d = k + w + h + d                            // --
+       ADDL     h, d;                        \ // d = k + w + h + d    // --
        ;                                     \
        ANDL     b, y3;                       \ // y3 = (a|c)&b                                                 // MAJA
        VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
        MOVL    a, y3;                       \ // y3 = a                       // MAJA
        RORXL   $25, e, y0;                  \ // y0 = e >> 25                                  // S1A
        RORXL   $11, e, y1;                  \ // y1 = e >> 11                                  // S1B
-       ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h                         // --
+       ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h         // --
        ORL     c, y3;                       \ // y3 = a|c                                              // MAJA
        ;                                    \
        VPSRLD  $3, XTMP1, XTMP4;            \ // XTMP4 = W[-15] >> 3
        ;                                    \
        MOVL    a, y3;                       \ // y3 = a                                                        // MAJA
        RORXL   $25, e, y0;                  \ // y0 = e >> 25                                          // S1A
-       ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h                         // --
+       ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h                 // --
        ;                                    \
        VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xBxA}
        RORXL   $11, e, y1;                  \ // y1 = e >> 11                                          // S1B
        ;                                  \
        XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)                          // S0
        RORXL $2, a, T1;                   \ // T1 = (a >> 2)                                           // S0
-       ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h   // --
+       ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h   // --
        ORL   c, y3;                       \ // y3 = a|c                                                                // MAJA
        ;                                  \
        XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)         // S0
        ;                                  \
        XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)                          // S0
        RORXL $2, a, T1;                   \ // T1 = (a >> 2)                                           // S0
-       ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h   // --
+       ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h   // --
        ORL   c, y3;                       \ // y3 = a|c                                                                // MAJA
        ;                                  \
        XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)         // S0
        ;                                  \
        ADDL  y3, h                        // h = t1 + S0 + MAJ                                 // --
 
+// Definitions for sha-ni version
+//
+// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2
+// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version
+//
+// Reference
+// S. Gulley, et al, "New Instructions Supporting the Secure Hash
+// Algorithm on Intel® Architecture Processors", July 2013
+// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
+//
+
+#define digestPtr      DI      // input/output, base pointer to digest hash vector H0, H1, ..., H7
+#define dataPtr                SI      // input, base pointer to first input data block
+#define numBytes       DX      // input, number of input bytes to be processed
+#define sha256Constants        AX      // round contants from K256 table, indexed by round number x 32
+#define msg            X0      // input data
+#define state0         X1      // round intermediates and outputs
+#define state1         X2
+#define m0             X3      // m0, m1,... m4 -- round message temps
+#define m1             X4
+#define m2             X5
+#define m3             X6
+#define m4             X7
+#define shufMask       X8      // input data endian conversion control mask
+#define abefSave       X9      // digest hash vector inter-block buffer abef
+#define cdghSave       X10     // digest hash vector inter-block buffer cdgh
+
+#define nop(m,a)               // nop instead of final SHA256MSG1 for first and last few rounds
+
+#define sha256msg1(m,a) \      // final SHA256MSG1 for middle rounds that require it
+       SHA256MSG1              m, a
+
+#define vmov(a,b) \            // msg copy for all but rounds 12-15
+       VMOVDQA         a, b
+
+#define vmovrev(a,b) \         // reverse copy for rounds 12-15
+       VMOVDQA         b, a
+
+// sha rounds 0 to 11
+// identical with the exception of the final msg op
+// which is replaced with a nop for rounds where it is not needed
+// refer to Gulley, et al for more information
+#define rounds0to11(m,a,c,sha256Msg1)                          \
+       VMOVDQU                 c*16(dataPtr), msg              \
+       PSHUFB                  shufMask, msg                   \
+       VMOVDQA                 msg, m                          \
+       PADDD                   (c*32)(sha256Constants), msg    \
+       SHA256RNDS2             msg, state0, state1             \
+       PSHUFD                  $0x0e, msg, msg                 \
+       SHA256RNDS2             msg, state1, state0             \
+       sha256Msg1              (m,a)
+
+// sha rounds 12 to 59
+// identical with the exception of the final msg op
+// and the reverse copy(m,msg) in round 12 which is required
+// after the last data load
+// refer to Gulley, et al for more information
+#define rounds12to59(m,c,a,t,sha256Msg1,movop)                 \
+       movop                   (m,msg)                         \
+       PADDD                   (c*32)(sha256Constants), msg    \
+       SHA256RNDS2             msg, state0, state1             \
+       VMOVDQA                 m, m4                           \
+       PALIGNR                 $4, a, m4                       \
+       PADDD                   m4, t                           \
+       SHA256MSG2              m, t                            \
+       PSHUFD                  $0x0e, msg, msg                 \
+       SHA256RNDS2             msg, state1, state0             \
+       sha256Msg1              (m,a)
+
 TEXT ·block(SB), 0, $536-32
-       CMPB ·useAVX2(SB), $1
-       JE   avx2
+       CMPB    ·useSHA(SB), $1
+       JE      sha_ni
+       CMPB    ·useAVX2(SB), $1
+       JE      avx2
 
        MOVQ p_base+8(FP), SI
        MOVQ p_len+16(FP), DX
@@ -862,6 +933,77 @@ done_hash:
        VZEROUPPER
        RET
 
+sha_ni:
+       MOVQ            dig+0(FP), digestPtr            // init digest hash vector H0, H1,..., H7 pointer
+       MOVQ            p_base+8(FP), dataPtr           // init input data base pointer
+       MOVQ            p_len+16(FP), numBytes          // get number of input bytes to hash
+       SHRQ            $6, numBytes                    // force modulo 64 input buffer length
+       SHLQ            $6, numBytes
+       CMPQ            numBytes, $0                    // exit early for zero-length input buffer
+       JEQ             done
+       ADDQ            dataPtr, numBytes               // point numBytes to end of input buffer
+       VMOVDQU         (0*16)(digestPtr), state0       // load initial hash values and reorder
+       VMOVDQU         (1*16)(digestPtr), state1       // DCBA, HGFE -> ABEF, CDGH
+       PSHUFD          $0xb1, state0, state0           // CDAB
+       PSHUFD          $0x1b, state1, state1           // EFGH
+       VMOVDQA         state0, m4
+       PALIGNR         $8, state1, state0              // ABEF
+       PBLENDW         $0xf0, m4, state1               // CDGH
+       VMOVDQA         flip_mask<>(SB), shufMask
+       LEAQ            K256<>(SB), sha256Constants
+
+roundLoop:
+       // save hash values for addition after rounds
+       VMOVDQA         state0, abefSave
+       VMOVDQA         state1, cdghSave
+
+       // do rounds 0-59
+       rounds0to11     (m0,-,0,nop)                    // 0-3
+       rounds0to11     (m1,m0,1,sha256msg1)            // 4-7
+       rounds0to11     (m2,m1,2,sha256msg1)            // 8-11
+       VMOVDQU         (3*16)(dataPtr), msg
+       PSHUFB          shufMask, msg
+       rounds12to59    (m3,3,m2,m0,sha256msg1,vmovrev) // 12-15
+       rounds12to59    (m0,4,m3,m1,sha256msg1,vmov)    // 16-19
+       rounds12to59    (m1,5,m0,m2,sha256msg1,vmov)    // 20-23
+       rounds12to59    (m2,6,m1,m3,sha256msg1,vmov)    // 24-27
+       rounds12to59    (m3,7,m2,m0,sha256msg1,vmov)    // 28-31
+       rounds12to59    (m0,8,m3,m1,sha256msg1,vmov)    // 32-35
+       rounds12to59    (m1,9,m0,m2,sha256msg1,vmov)    // 36-39
+       rounds12to59    (m2,10,m1,m3,sha256msg1,vmov)   // 40-43
+       rounds12to59    (m3,11,m2,m0,sha256msg1,vmov)   // 44-47
+       rounds12to59    (m0,12,m3,m1,sha256msg1,vmov)   // 48-51
+       rounds12to59    (m1,13,m0,m2,nop,vmov)          // 52-55
+       rounds12to59    (m2,14,m1,m3,nop,vmov)          // 56-59
+
+       // do rounds 60-63
+       VMOVDQA         m3, msg
+       PADDD           (15*32)(sha256Constants), msg
+       SHA256RNDS2     msg, state0, state1
+       PSHUFD          $0x0e, msg, msg
+       SHA256RNDS2     msg, state1, state0
+
+       // add current hash values with previously saved
+       PADDD           abefSave, state0
+       PADDD           cdghSave, state1
+
+       // advance data pointer; loop until buffer empty
+       ADDQ            $64, dataPtr
+       CMPQ            numBytes, dataPtr
+       JNE             roundLoop
+
+       // write hash values back in the correct order
+       PSHUFD          $0x1b, state0, state0           // FEBA
+       PSHUFD          $0xb1, state1, state1           // DCHG
+       VMOVDQA         state0, m4
+       PBLENDW         $0xf0, state1, state0           // DCBA
+       PALIGNR         $8, m4, state1                  // HGFE
+       VMOVDQU         state0, (0*16)(digestPtr)
+       VMOVDQU         state1, (1*16)(digestPtr)
+
+done:
+       RET
+
 // shuffle byte order from LE to BE
 DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
 DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b