From: Roland Shoemaker Date: Sun, 19 Jan 2025 17:24:50 +0000 (-0800) Subject: crypto/sha1: add sha-ni AMD64 implementation X-Git-Tag: go1.25rc1~154 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=63dcc7b9067722a9ded7a67501a898764778108a;p=gostls13.git crypto/sha1: add sha-ni AMD64 implementation Based on the Intel docs. Provides a ~44% speed-up compared to the AVX implementation and a ~57% speed-up compared to the generic AMD64 assembly implementation. │ /usr/local/google/home/bracewell/sha1-avx.bench │ /usr/local/google/home/bracewell/sha1-ni-stack.bench │ │ sec/op │ sec/op vs base │ Hash8Bytes/New-24 157.60n ± 0% 92.51n ± 0% -41.30% (p=0.000 n=20) Hash8Bytes/Sum-24 147.00n ± 0% 85.06n ± 0% -42.14% (p=0.000 n=20) Hash320Bytes/New-24 625.3n ± 0% 276.7n ± 0% -55.75% (p=0.000 n=20) Hash320Bytes/Sum-24 626.2n ± 0% 272.4n ± 0% -56.51% (p=0.000 n=20) Hash1K/New-24 1206.5n ± 0% 692.2n ± 0% -42.63% (p=0.000 n=20) Hash1K/Sum-24 1210.0n ± 0% 688.2n ± 0% -43.13% (p=0.000 n=20) Hash8K/New-24 7.744µ ± 0% 4.920µ ± 0% -36.46% (p=0.000 n=20) Hash8K/Sum-24 7.737µ ± 0% 4.913µ ± 0% -36.50% (p=0.000 n=20) geomean 971.5n 536.1n -44.81% │ /usr/local/google/home/bracewell/sha1-avx.bench │ /usr/local/google/home/bracewell/sha1-ni-stack.bench │ │ B/s │ B/s vs base │ Hash8Bytes/New-24 48.41Mi ± 0% 82.47Mi ± 0% +70.37% (p=0.000 n=20) Hash8Bytes/Sum-24 51.90Mi ± 0% 89.70Mi ± 0% +72.82% (p=0.000 n=20) Hash320Bytes/New-24 488.0Mi ± 0% 1103.0Mi ± 0% +126.01% (p=0.000 n=20) Hash320Bytes/Sum-24 487.4Mi ± 0% 1120.5Mi ± 0% +129.91% (p=0.000 n=20) Hash1K/New-24 809.6Mi ± 0% 1410.8Mi ± 0% +74.26% (p=0.000 n=20) Hash1K/Sum-24 806.9Mi ± 0% 1419.1Mi ± 0% +75.86% (p=0.000 n=20) Hash8K/New-24 1008.9Mi ± 0% 1588.0Mi ± 0% +57.40% (p=0.000 n=20) Hash8K/Sum-24 1009.8Mi ± 0% 1590.1Mi ± 0% +57.47% (p=0.000 n=20) geomean 375.8Mi 680.9Mi +81.20% │ /usr/local/google/home/bracewell/sha1-amd64.bench │ /usr/local/google/home/bracewell/sha1-ni-stack.bench │ │ sec/op │ sec/op vs base │ Hash8Bytes/New-24 153.90n ± 0% 92.51n ± 0% -39.89% (p=0.000 n=20) Hash8Bytes/Sum-24 145.90n ± 0% 85.06n ± 0% -41.70% (p=0.000 n=20) Hash320Bytes/New-24 666.8n ± 0% 276.7n ± 0% -58.50% (p=0.000 n=20) Hash320Bytes/Sum-24 660.3n ± 0% 272.4n ± 0% -58.75% (p=0.000 n=20) Hash1K/New-24 1810.5n ± 0% 692.2n ± 0% -61.77% (p=0.000 n=20) Hash1K/Sum-24 1806.0n ± 0% 688.2n ± 0% -61.90% (p=0.000 n=20) Hash8K/New-24 13.509µ ± 0% 4.920µ ± 0% -63.58% (p=0.000 n=20) Hash8K/Sum-24 13.515µ ± 0% 4.913µ ± 0% -63.65% (p=0.000 n=20) geomean 1.248µ 536.1n -57.05% │ /usr/local/google/home/bracewell/sha1-amd64.bench │ /usr/local/google/home/bracewell/sha1-ni-stack.bench │ │ B/s │ B/s vs base │ Hash8Bytes/New-24 49.57Mi ± 0% 82.47Mi ± 0% +66.37% (p=0.000 n=20) Hash8Bytes/Sum-24 52.29Mi ± 0% 89.70Mi ± 0% +71.52% (p=0.000 n=20) Hash320Bytes/New-24 457.7Mi ± 0% 1103.0Mi ± 0% +140.97% (p=0.000 n=20) Hash320Bytes/Sum-24 462.2Mi ± 0% 1120.5Mi ± 0% +142.45% (p=0.000 n=20) Hash1K/New-24 539.4Mi ± 0% 1410.8Mi ± 0% +161.57% (p=0.000 n=20) Hash1K/Sum-24 540.7Mi ± 0% 1419.1Mi ± 0% +162.44% (p=0.000 n=20) Hash8K/New-24 578.4Mi ± 0% 1588.0Mi ± 0% +174.57% (p=0.000 n=20) Hash8K/Sum-24 578.1Mi ± 0% 1590.1Mi ± 0% +175.07% (p=0.000 n=20) geomean 292.4Mi 680.9Mi +132.86% Change-Id: Ife90386ba410a80c2e6222c1fe4df2368c4e12b2 Reviewed-on: https://go-review.googlesource.com/c/go/+/642157 Reviewed-by: Filippo Valsorda Auto-Submit: Roland Shoemaker Reviewed-by: Neal Patel LUCI-TryBot-Result: Go LUCI --- diff --git a/src/crypto/sha1/_asm/sha1block_amd64_asm.go b/src/crypto/sha1/_asm/sha1block_amd64_asm.go index 750f5ce31c..575c5de747 100644 --- a/src/crypto/sha1/_asm/sha1block_amd64_asm.go +++ b/src/crypto/sha1/_asm/sha1block_amd64_asm.go @@ -25,6 +25,7 @@ func main() { ConstraintExpr("!purego") blockAMD64() blockAVX2() + blockSHANI() Generate() } diff --git a/src/crypto/sha1/_asm/sha1block_amd64_shani.go b/src/crypto/sha1/_asm/sha1block_amd64_shani.go new file mode 100644 index 0000000000..0a0160a823 --- /dev/null +++ b/src/crypto/sha1/_asm/sha1block_amd64_shani.go @@ -0,0 +1,164 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "fmt" + + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" +) + +// Implement the SHA-1 block function using the Intel(R) SHA extensions +// (SHA1RNDS4, SHA1NEXTE, SHA1MSG1, and SHA1MSG2). This implementation requires +// the AVX, SHA, SSE2, SSE4.1, and SSSE3 extensions. +// +// Reference: +// S. Gulley, et al, "New Instructions Supporting the Secure Hash +// Algorithm on Intel® Architecture Processors", July 2013 +// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html + +func blockSHANI() { + Implement("blockSHANI") + + digest := Load(Param("dig"), RDI) + data := Load(Param("p").Base(), RSI) + len := Load(Param("p").Len(), RDX) + + abcd := XMM() + msg0, msg1, msg2, msg3 := XMM(), XMM(), XMM(), XMM() + e0, e1 := XMM(), XMM() + shufMask := XMM() + + CMPQ(len, Imm(0)) + JEQ(LabelRef("done")) + ADDQ(data, len) + + stackPtr := GP64() + { + Comment("Allocate space on the stack for saving ABCD and E0, and align it to 16 bytes") + local := AllocLocal(32 + 16) + LEAQ(local.Offset(15), stackPtr) + tmp := GP64() + MOVQ(U64(15), tmp) + NOTQ(tmp) + ANDQ(tmp, stackPtr) + } + e0_save := Mem{Base: stackPtr} + abcd_save := Mem{Base: stackPtr}.Offset(16) + + Comment("Load initial hash state") + PINSRD(Imm(3), Mem{Base: digest}.Offset(16), e0) + VMOVDQU(Mem{Base: digest}, abcd) + PAND(upperMask(), e0) + PSHUFD(Imm(0x1b), abcd, abcd) + + VMOVDQA(flipMask(), shufMask) + + Label("loop") + + Comment("Save ABCD and E working values") + VMOVDQA(e0, e0_save) + VMOVDQA(abcd, abcd_save) + + Comment("Rounds 0-3") + VMOVDQU(Mem{Base: data}, msg0) + PSHUFB(shufMask, msg0) + PADDD(msg0, e0) + VMOVDQA(abcd, e1) + SHA1RNDS4(Imm(0), e0, abcd) + + Comment("Rounds 4-7") + VMOVDQU(Mem{Base: data}.Offset(16), msg1) + PSHUFB(shufMask, msg1) + SHA1NEXTE(msg1, e1) + VMOVDQA(abcd, e0) + SHA1RNDS4(Imm(0), e1, abcd) + SHA1MSG1(msg1, msg0) + + Comment("Rounds 8-11") + VMOVDQU(Mem{Base: data}.Offset(16*2), msg2) + PSHUFB(shufMask, msg2) + SHA1NEXTE(msg2, e0) + VMOVDQA(abcd, e1) + SHA1RNDS4(Imm(0), e0, abcd) + SHA1MSG1(msg2, msg1) + PXOR(msg2, msg0) + + // Rounds 12 through 67 use the same repeated pattern, with e0 and e1 ping-ponging + // back and forth, and each of the msg temporaries moving up one every four rounds. + msgs := []VecVirtual{msg3, msg0, msg1, msg2} + for i := range 14 { + Comment(fmt.Sprintf("Rounds %d-%d", 12+(i*4), 12+(i*4)+3)) + a, b := e1, e0 + if i == 0 { + VMOVDQU(Mem{Base: data}.Offset(16*3), msg3) + PSHUFB(shufMask, msg3) + } + if i%2 == 1 { + a, b = e0, e1 + } + imm := uint64((12 + i*4) / 20) + + SHA1NEXTE(msgs[i%4], a) + VMOVDQA(abcd, b) + SHA1MSG2(msgs[i%4], msgs[(1+i)%4]) + SHA1RNDS4(Imm(imm), a, abcd) + SHA1MSG1(msgs[i%4], msgs[(3+i)%4]) + PXOR(msgs[i%4], msgs[(2+i)%4]) + } + + Comment("Rounds 68-71") + SHA1NEXTE(msg1, e1) + VMOVDQA(abcd, e0) + SHA1MSG2(msg1, msg2) + SHA1RNDS4(Imm(3), e1, abcd) + PXOR(msg1, msg3) + + Comment("Rounds 72-75") + SHA1NEXTE(msg2, e0) + VMOVDQA(abcd, e1) + SHA1MSG2(msg2, msg3) + SHA1RNDS4(Imm(3), e0, abcd) + + Comment("Rounds 76-79") + SHA1NEXTE(msg3, e1) + VMOVDQA(abcd, e0) + SHA1RNDS4(Imm(3), e1, abcd) + + Comment("Add saved E and ABCD") + SHA1NEXTE(e0_save, e0) + PADDD(abcd_save, abcd) + + Comment("Check if we are done, if not return to the loop") + ADDQ(Imm(64), data) + CMPQ(data, len) + JNE(LabelRef("loop")) + + Comment("Write the hash state back to digest") + PSHUFD(Imm(0x1b), abcd, abcd) + VMOVDQU(abcd, Mem{Base: digest}) + PEXTRD(Imm(3), e0, Mem{Base: digest}.Offset(16)) + + Label("done") + RET() +} + +func flipMask() Mem { + mask := GLOBL("shuffle_mask", RODATA) + // 0x000102030405060708090a0b0c0d0e0f + DATA(0x00, U64(0x08090a0b0c0d0e0f)) + DATA(0x08, U64(0x0001020304050607)) + return mask +} + +func upperMask() Mem { + mask := GLOBL("upper_mask", RODATA) + // 0xFFFFFFFF000000000000000000000000 + DATA(0x00, U64(0x0000000000000000)) + DATA(0x08, U64(0xFFFFFFFF00000000)) + return mask +} diff --git a/src/crypto/sha1/sha1block_amd64.go b/src/crypto/sha1/sha1block_amd64.go index 10376d1dcc..a497d41144 100644 --- a/src/crypto/sha1/sha1block_amd64.go +++ b/src/crypto/sha1/sha1block_amd64.go @@ -14,10 +14,16 @@ func blockAVX2(dig *digest, p []byte) //go:noescape func blockAMD64(dig *digest, p []byte) +//go:noescape +func blockSHANI(dig *digest, p []byte) + var useAVX2 = cpu.X86.HasAVX && cpu.X86.HasAVX2 && cpu.X86.HasBMI1 && cpu.X86.HasBMI2 +var useSHANI = cpu.X86.HasAVX && cpu.X86.HasSHA && cpu.X86.HasSSE41 && cpu.X86.HasSSSE3 func block(dig *digest, p []byte) { - if useAVX2 && len(p) >= 256 { + if useSHANI { + blockSHANI(dig, p) + } else if useAVX2 && len(p) >= 256 { // blockAVX2 calculates sha1 for 2 block per iteration // it also interleaves precalculation for next block. // So it may read up-to 192 bytes past end of p diff --git a/src/crypto/sha1/sha1block_amd64.s b/src/crypto/sha1/sha1block_amd64.s index 9c7aa14677..2a5fd2e8e5 100644 --- a/src/crypto/sha1/sha1block_amd64.s +++ b/src/crypto/sha1/sha1block_amd64.s @@ -3049,3 +3049,212 @@ DATA BSWAP_SHUFB_CTL<>+20(SB)/4, $0x04050607 DATA BSWAP_SHUFB_CTL<>+24(SB)/4, $0x08090a0b DATA BSWAP_SHUFB_CTL<>+28(SB)/4, $0x0c0d0e0f GLOBL BSWAP_SHUFB_CTL<>(SB), RODATA, $32 + +// func blockSHANI(dig *digest, p []byte) +// Requires: AVX, SHA, SSE2, SSE4.1, SSSE3 +TEXT ·blockSHANI(SB), $48-32 + MOVQ dig+0(FP), DI + MOVQ p_base+8(FP), SI + MOVQ p_len+16(FP), DX + CMPQ DX, $0x00 + JEQ done + ADDQ SI, DX + + // Allocate space on the stack for saving ABCD and E0, and align it to 16 bytes + LEAQ 15(SP), AX + MOVQ $0x000000000000000f, CX + NOTQ CX + ANDQ CX, AX + + // Load initial hash state + PINSRD $0x03, 16(DI), X5 + VMOVDQU (DI), X0 + PAND upper_mask<>+0(SB), X5 + PSHUFD $0x1b, X0, X0 + VMOVDQA shuffle_mask<>+0(SB), X7 + +loop: + // Save ABCD and E working values + VMOVDQA X5, (AX) + VMOVDQA X0, 16(AX) + + // Rounds 0-3 + VMOVDQU (SI), X1 + PSHUFB X7, X1 + PADDD X1, X5 + VMOVDQA X0, X6 + SHA1RNDS4 $0x00, X5, X0 + + // Rounds 4-7 + VMOVDQU 16(SI), X2 + PSHUFB X7, X2 + SHA1NEXTE X2, X6 + VMOVDQA X0, X5 + SHA1RNDS4 $0x00, X6, X0 + SHA1MSG1 X2, X1 + + // Rounds 8-11 + VMOVDQU 32(SI), X3 + PSHUFB X7, X3 + SHA1NEXTE X3, X5 + VMOVDQA X0, X6 + SHA1RNDS4 $0x00, X5, X0 + SHA1MSG1 X3, X2 + PXOR X3, X1 + + // Rounds 12-15 + VMOVDQU 48(SI), X4 + PSHUFB X7, X4 + SHA1NEXTE X4, X6 + VMOVDQA X0, X5 + SHA1MSG2 X4, X1 + SHA1RNDS4 $0x00, X6, X0 + SHA1MSG1 X4, X3 + PXOR X4, X2 + + // Rounds 16-19 + SHA1NEXTE X1, X5 + VMOVDQA X0, X6 + SHA1MSG2 X1, X2 + SHA1RNDS4 $0x00, X5, X0 + SHA1MSG1 X1, X4 + PXOR X1, X3 + + // Rounds 20-23 + SHA1NEXTE X2, X6 + VMOVDQA X0, X5 + SHA1MSG2 X2, X3 + SHA1RNDS4 $0x01, X6, X0 + SHA1MSG1 X2, X1 + PXOR X2, X4 + + // Rounds 24-27 + SHA1NEXTE X3, X5 + VMOVDQA X0, X6 + SHA1MSG2 X3, X4 + SHA1RNDS4 $0x01, X5, X0 + SHA1MSG1 X3, X2 + PXOR X3, X1 + + // Rounds 28-31 + SHA1NEXTE X4, X6 + VMOVDQA X0, X5 + SHA1MSG2 X4, X1 + SHA1RNDS4 $0x01, X6, X0 + SHA1MSG1 X4, X3 + PXOR X4, X2 + + // Rounds 32-35 + SHA1NEXTE X1, X5 + VMOVDQA X0, X6 + SHA1MSG2 X1, X2 + SHA1RNDS4 $0x01, X5, X0 + SHA1MSG1 X1, X4 + PXOR X1, X3 + + // Rounds 36-39 + SHA1NEXTE X2, X6 + VMOVDQA X0, X5 + SHA1MSG2 X2, X3 + SHA1RNDS4 $0x01, X6, X0 + SHA1MSG1 X2, X1 + PXOR X2, X4 + + // Rounds 40-43 + SHA1NEXTE X3, X5 + VMOVDQA X0, X6 + SHA1MSG2 X3, X4 + SHA1RNDS4 $0x02, X5, X0 + SHA1MSG1 X3, X2 + PXOR X3, X1 + + // Rounds 44-47 + SHA1NEXTE X4, X6 + VMOVDQA X0, X5 + SHA1MSG2 X4, X1 + SHA1RNDS4 $0x02, X6, X0 + SHA1MSG1 X4, X3 + PXOR X4, X2 + + // Rounds 48-51 + SHA1NEXTE X1, X5 + VMOVDQA X0, X6 + SHA1MSG2 X1, X2 + SHA1RNDS4 $0x02, X5, X0 + SHA1MSG1 X1, X4 + PXOR X1, X3 + + // Rounds 52-55 + SHA1NEXTE X2, X6 + VMOVDQA X0, X5 + SHA1MSG2 X2, X3 + SHA1RNDS4 $0x02, X6, X0 + SHA1MSG1 X2, X1 + PXOR X2, X4 + + // Rounds 56-59 + SHA1NEXTE X3, X5 + VMOVDQA X0, X6 + SHA1MSG2 X3, X4 + SHA1RNDS4 $0x02, X5, X0 + SHA1MSG1 X3, X2 + PXOR X3, X1 + + // Rounds 60-63 + SHA1NEXTE X4, X6 + VMOVDQA X0, X5 + SHA1MSG2 X4, X1 + SHA1RNDS4 $0x03, X6, X0 + SHA1MSG1 X4, X3 + PXOR X4, X2 + + // Rounds 64-67 + SHA1NEXTE X1, X5 + VMOVDQA X0, X6 + SHA1MSG2 X1, X2 + SHA1RNDS4 $0x03, X5, X0 + SHA1MSG1 X1, X4 + PXOR X1, X3 + + // Rounds 68-71 + SHA1NEXTE X2, X6 + VMOVDQA X0, X5 + SHA1MSG2 X2, X3 + SHA1RNDS4 $0x03, X6, X0 + PXOR X2, X4 + + // Rounds 72-75 + SHA1NEXTE X3, X5 + VMOVDQA X0, X6 + SHA1MSG2 X3, X4 + SHA1RNDS4 $0x03, X5, X0 + + // Rounds 76-79 + SHA1NEXTE X4, X6 + VMOVDQA X0, X5 + SHA1RNDS4 $0x03, X6, X0 + + // Add saved E and ABCD + SHA1NEXTE (AX), X5 + PADDD 16(AX), X0 + + // Check if we are done, if not return to the loop + ADDQ $0x40, SI + CMPQ SI, DX + JNE loop + + // Write the hash state back to digest + PSHUFD $0x1b, X0, X0 + VMOVDQU X0, (DI) + PEXTRD $0x03, X5, 16(DI) + +done: + RET + +DATA upper_mask<>+0(SB)/8, $0x0000000000000000 +DATA upper_mask<>+8(SB)/8, $0xffffffff00000000 +GLOBL upper_mask<>(SB), RODATA, $16 + +DATA shuffle_mask<>+0(SB)/8, $0x08090a0b0c0d0e0f +DATA shuffle_mask<>+8(SB)/8, $0x0001020304050607 +GLOBL shuffle_mask<>(SB), RODATA, $16