--- /dev/null
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "fmt"
+
+ . "github.com/mmcloughlin/avo/build"
+ . "github.com/mmcloughlin/avo/operand"
+ . "github.com/mmcloughlin/avo/reg"
+)
+
+// Implement the SHA-1 block function using the Intel(R) SHA extensions
+// (SHA1RNDS4, SHA1NEXTE, SHA1MSG1, and SHA1MSG2). This implementation requires
+// the AVX, SHA, SSE2, SSE4.1, and SSSE3 extensions.
+//
+// Reference:
+// S. Gulley, et al, "New Instructions Supporting the Secure Hash
+// Algorithm on Intel® Architecture Processors", July 2013
+// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
+
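+// A minimal driver sketch for running this generator. The Package and
+// ConstraintExpr values, and the assumption that main lives in a sibling
+// file of this package, are not shown in this change and are illustrative:
+//
+//	func main() {
+//		Package("crypto/sha1")
+//		ConstraintExpr("!purego")
+//		blockSHANI()
+//		Generate()
+//	}
+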
+func blockSHANI() {
+ Implement("blockSHANI")
+
+ digest := Load(Param("dig"), RDI)
+ data := Load(Param("p").Base(), RSI)
+ len := Load(Param("p").Len(), RDX)
+
+ abcd := XMM()
+ msg0, msg1, msg2, msg3 := XMM(), XMM(), XMM(), XMM()
+ e0, e1 := XMM(), XMM()
+ shufMask := XMM()
+
+ CMPQ(len, Imm(0))
+ JEQ(LabelRef("done"))
+ ADDQ(data, len)
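+ // len now holds a one-past-the-end pointer; the loop below advances data
+ // by 64 bytes per block and compares it against this end pointer.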
+
+ stackPtr := GP64()
+ {
+ Comment("Allocate space on the stack for saving ABCD and E0, and align it to 16 bytes")
+ local := AllocLocal(32 + 16)
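+ // Offsetting by 15 and then clearing the low four bits rounds the address
+ // up to the next 16-byte boundary, as the aligned VMOVDQA spills require.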
+ LEAQ(local.Offset(15), stackPtr)
+ tmp := GP64()
+ MOVQ(U64(15), tmp)
+ NOTQ(tmp)
+ ANDQ(tmp, stackPtr)
+ }
+ e0_save := Mem{Base: stackPtr}
+ abcd_save := Mem{Base: stackPtr}.Offset(16)
+
+ Comment("Load initial hash state")
+ PINSRD(Imm(3), Mem{Base: digest}.Offset(16), e0)
+ VMOVDQU(Mem{Base: digest}, abcd)
+ PAND(upperMask(), e0)
+ PSHUFD(Imm(0x1b), abcd, abcd)
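+ // The 0x1b shuffle reverses the order of the four 32-bit words, putting
+ // ABCD into the layout SHA1RNDS4 expects.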
+
+ VMOVDQA(flipMask(), shufMask)
+
+ Label("loop")
+
+ Comment("Save ABCD and E working values")
+ VMOVDQA(e0, e0_save)
+ VMOVDQA(abcd, abcd_save)
+
+ Comment("Rounds 0-3")
+ VMOVDQU(Mem{Base: data}, msg0)
+ PSHUFB(shufMask, msg0)
+ PADDD(msg0, e0)
+ VMOVDQA(abcd, e1)
+ SHA1RNDS4(Imm(0), e0, abcd)
+
+ Comment("Rounds 4-7")
+ VMOVDQU(Mem{Base: data}.Offset(16), msg1)
+ PSHUFB(shufMask, msg1)
+ SHA1NEXTE(msg1, e1)
+ VMOVDQA(abcd, e0)
+ SHA1RNDS4(Imm(0), e1, abcd)
+ SHA1MSG1(msg1, msg0)
+
+ Comment("Rounds 8-11")
+ VMOVDQU(Mem{Base: data}.Offset(16*2), msg2)
+ PSHUFB(shufMask, msg2)
+ SHA1NEXTE(msg2, e0)
+ VMOVDQA(abcd, e1)
+ SHA1RNDS4(Imm(0), e0, abcd)
+ SHA1MSG1(msg2, msg1)
+ PXOR(msg2, msg0)
+
+ // Rounds 12 through 67 use the same repeated pattern, with e0 and e1 ping-ponging
+ // back and forth, and each of the msg temporaries moving up one every four rounds.
+ msgs := []VecVirtual{msg3, msg0, msg1, msg2}
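+ // In iteration i, msgs[i%4] is the schedule block entering these four
+ // rounds: it feeds SHA1NEXTE, completes msgs[(1+i)%4] via SHA1MSG2,
+ // starts msgs[(3+i)%4] via SHA1MSG1, and is XORed into msgs[(2+i)%4],
+ // computing W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
+ // four words at a time.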
+ for i := range 14 {
+ Comment(fmt.Sprintf("Rounds %d-%d", 12+(i*4), 12+(i*4)+3))
+ a, b := e1, e0
+ if i == 0 {
+ VMOVDQU(Mem{Base: data}.Offset(16*3), msg3)
+ PSHUFB(shufMask, msg3)
+ }
+ if i%2 == 1 {
+ a, b = e0, e1
+ }
+ imm := uint64((12 + i*4) / 20)
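+ // SHA-1 switches its round constant every 20 rounds; the SHA1RNDS4
+ // immediate selects which of the four constants applies, hence round/20.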
+
+ SHA1NEXTE(msgs[i%4], a)
+ VMOVDQA(abcd, b)
+ SHA1MSG2(msgs[i%4], msgs[(1+i)%4])
+ SHA1RNDS4(Imm(imm), a, abcd)
+ SHA1MSG1(msgs[i%4], msgs[(3+i)%4])
+ PXOR(msgs[i%4], msgs[(2+i)%4])
+ }
+
+ Comment("Rounds 68-71")
+ SHA1NEXTE(msg1, e1)
+ VMOVDQA(abcd, e0)
+ SHA1MSG2(msg1, msg2)
+ SHA1RNDS4(Imm(3), e1, abcd)
+ PXOR(msg1, msg3)
+
+ Comment("Rounds 72-75")
+ SHA1NEXTE(msg2, e0)
+ VMOVDQA(abcd, e1)
+ SHA1MSG2(msg2, msg3)
+ SHA1RNDS4(Imm(3), e0, abcd)
+
+ Comment("Rounds 76-79")
+ SHA1NEXTE(msg3, e1)
+ VMOVDQA(abcd, e0)
+ SHA1RNDS4(Imm(3), e1, abcd)
+
+ Comment("Add saved E and ABCD")
+ SHA1NEXTE(e0_save, e0)
+ PADDD(abcd_save, abcd)
+
+ Comment("Check if we are done, if not return to the loop")
+ ADDQ(Imm(64), data)
+ CMPQ(data, len)
+ JNE(LabelRef("loop"))
+
+ Comment("Write the hash state back to digest")
+ PSHUFD(Imm(0x1b), abcd, abcd)
+ VMOVDQU(abcd, Mem{Base: digest})
+ PEXTRD(Imm(3), e0, Mem{Base: digest}.Offset(16))
+
+ Label("done")
+ RET()
+}
+
+func flipMask() Mem {
+ mask := GLOBL("shuffle_mask", RODATA)
+ // 0x000102030405060708090a0b0c0d0e0f
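+ // i.e. a PSHUFB control that reverses all 16 bytes, converting the
+ // big-endian message words to the layout the SHA instructions expect.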
+ DATA(0x00, U64(0x08090a0b0c0d0e0f))
+ DATA(0x08, U64(0x0001020304050607))
+ return mask
+}
+
+func upperMask() Mem {
+ mask := GLOBL("upper_mask", RODATA)
+ // 0xFFFFFFFF000000000000000000000000
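+ // i.e. keep only the top dword, where PINSRD placed the E state word.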
+ DATA(0x00, U64(0x0000000000000000))
+ DATA(0x08, U64(0xFFFFFFFF00000000))
+ return mask
+}
+
+// func blockSHANI(dig *digest, p []byte)
+// Requires: AVX, SHA, SSE2, SSE4.1, SSSE3
+TEXT ·blockSHANI(SB), $48-32
+ MOVQ dig+0(FP), DI
+ MOVQ p_base+8(FP), SI
+ MOVQ p_len+16(FP), DX
+ CMPQ DX, $0x00
+ JEQ done
+ ADDQ SI, DX
+
+ // Allocate space on the stack for saving ABCD and E0, and align it to 16 bytes
+ LEAQ 15(SP), AX
+ MOVQ $0x000000000000000f, CX
+ NOTQ CX
+ ANDQ CX, AX
+
+ // Load initial hash state
+ PINSRD $0x03, 16(DI), X5
+ VMOVDQU (DI), X0
+ PAND upper_mask<>+0(SB), X5
+ PSHUFD $0x1b, X0, X0
+ VMOVDQA shuffle_mask<>+0(SB), X7
+
+loop:
+ // Save ABCD and E working values
+ VMOVDQA X5, (AX)
+ VMOVDQA X0, 16(AX)
+
+ // Rounds 0-3
+ VMOVDQU (SI), X1
+ PSHUFB X7, X1
+ PADDD X1, X5
+ VMOVDQA X0, X6
+ SHA1RNDS4 $0x00, X5, X0
+
+ // Rounds 4-7
+ VMOVDQU 16(SI), X2
+ PSHUFB X7, X2
+ SHA1NEXTE X2, X6
+ VMOVDQA X0, X5
+ SHA1RNDS4 $0x00, X6, X0
+ SHA1MSG1 X2, X1
+
+ // Rounds 8-11
+ VMOVDQU 32(SI), X3
+ PSHUFB X7, X3
+ SHA1NEXTE X3, X5
+ VMOVDQA X0, X6
+ SHA1RNDS4 $0x00, X5, X0
+ SHA1MSG1 X3, X2
+ PXOR X3, X1
+
+ // Rounds 12-15
+ VMOVDQU 48(SI), X4
+ PSHUFB X7, X4
+ SHA1NEXTE X4, X6
+ VMOVDQA X0, X5
+ SHA1MSG2 X4, X1
+ SHA1RNDS4 $0x00, X6, X0
+ SHA1MSG1 X4, X3
+ PXOR X4, X2
+
+ // Rounds 16-19
+ SHA1NEXTE X1, X5
+ VMOVDQA X0, X6
+ SHA1MSG2 X1, X2
+ SHA1RNDS4 $0x00, X5, X0
+ SHA1MSG1 X1, X4
+ PXOR X1, X3
+
+ // Rounds 20-23
+ SHA1NEXTE X2, X6
+ VMOVDQA X0, X5
+ SHA1MSG2 X2, X3
+ SHA1RNDS4 $0x01, X6, X0
+ SHA1MSG1 X2, X1
+ PXOR X2, X4
+
+ // Rounds 24-27
+ SHA1NEXTE X3, X5
+ VMOVDQA X0, X6
+ SHA1MSG2 X3, X4
+ SHA1RNDS4 $0x01, X5, X0
+ SHA1MSG1 X3, X2
+ PXOR X3, X1
+
+ // Rounds 28-31
+ SHA1NEXTE X4, X6
+ VMOVDQA X0, X5
+ SHA1MSG2 X4, X1
+ SHA1RNDS4 $0x01, X6, X0
+ SHA1MSG1 X4, X3
+ PXOR X4, X2
+
+ // Rounds 32-35
+ SHA1NEXTE X1, X5
+ VMOVDQA X0, X6
+ SHA1MSG2 X1, X2
+ SHA1RNDS4 $0x01, X5, X0
+ SHA1MSG1 X1, X4
+ PXOR X1, X3
+
+ // Rounds 36-39
+ SHA1NEXTE X2, X6
+ VMOVDQA X0, X5
+ SHA1MSG2 X2, X3
+ SHA1RNDS4 $0x01, X6, X0
+ SHA1MSG1 X2, X1
+ PXOR X2, X4
+
+ // Rounds 40-43
+ SHA1NEXTE X3, X5
+ VMOVDQA X0, X6
+ SHA1MSG2 X3, X4
+ SHA1RNDS4 $0x02, X5, X0
+ SHA1MSG1 X3, X2
+ PXOR X3, X1
+
+ // Rounds 44-47
+ SHA1NEXTE X4, X6
+ VMOVDQA X0, X5
+ SHA1MSG2 X4, X1
+ SHA1RNDS4 $0x02, X6, X0
+ SHA1MSG1 X4, X3
+ PXOR X4, X2
+
+ // Rounds 48-51
+ SHA1NEXTE X1, X5
+ VMOVDQA X0, X6
+ SHA1MSG2 X1, X2
+ SHA1RNDS4 $0x02, X5, X0
+ SHA1MSG1 X1, X4
+ PXOR X1, X3
+
+ // Rounds 52-55
+ SHA1NEXTE X2, X6
+ VMOVDQA X0, X5
+ SHA1MSG2 X2, X3
+ SHA1RNDS4 $0x02, X6, X0
+ SHA1MSG1 X2, X1
+ PXOR X2, X4
+
+ // Rounds 56-59
+ SHA1NEXTE X3, X5
+ VMOVDQA X0, X6
+ SHA1MSG2 X3, X4
+ SHA1RNDS4 $0x02, X5, X0
+ SHA1MSG1 X3, X2
+ PXOR X3, X1
+
+ // Rounds 60-63
+ SHA1NEXTE X4, X6
+ VMOVDQA X0, X5
+ SHA1MSG2 X4, X1
+ SHA1RNDS4 $0x03, X6, X0
+ SHA1MSG1 X4, X3
+ PXOR X4, X2
+
+ // Rounds 64-67
+ SHA1NEXTE X1, X5
+ VMOVDQA X0, X6
+ SHA1MSG2 X1, X2
+ SHA1RNDS4 $0x03, X5, X0
+ SHA1MSG1 X1, X4
+ PXOR X1, X3
+
+ // Rounds 68-71
+ SHA1NEXTE X2, X6
+ VMOVDQA X0, X5
+ SHA1MSG2 X2, X3
+ SHA1RNDS4 $0x03, X6, X0
+ PXOR X2, X4
+
+ // Rounds 72-75
+ SHA1NEXTE X3, X5
+ VMOVDQA X0, X6
+ SHA1MSG2 X3, X4
+ SHA1RNDS4 $0x03, X5, X0
+
+ // Rounds 76-79
+ SHA1NEXTE X4, X6
+ VMOVDQA X0, X5
+ SHA1RNDS4 $0x03, X6, X0
+
+ // Add saved E and ABCD
+ SHA1NEXTE (AX), X5
+ PADDD 16(AX), X0
+
+	// Check if we are done; if not, return to the loop
+ ADDQ $0x40, SI
+ CMPQ SI, DX
+ JNE loop
+
+ // Write the hash state back to digest
+ PSHUFD $0x1b, X0, X0
+ VMOVDQU X0, (DI)
+ PEXTRD $0x03, X5, 16(DI)
+
+done:
+ RET
+
+DATA upper_mask<>+0(SB)/8, $0x0000000000000000
+DATA upper_mask<>+8(SB)/8, $0xffffffff00000000
+GLOBL upper_mask<>(SB), RODATA, $16
+
+DATA shuffle_mask<>+0(SB)/8, $0x08090a0b0c0d0e0f
+DATA shuffle_mask<>+8(SB)/8, $0x0001020304050607
+GLOBL shuffle_mask<>(SB), RODATA, $16