internal/chacha8rand: implement func block in assembly

author Xiaolin Zhao <zhaoxiaolin@loongson.cn>

Wed, 9 Apr 2025 02:10:16 +0000 (10:10 +0800)

committer abner chenc <chenguoqi@loongson.cn>

Tue, 15 Apr 2025 11:54:59 +0000 (04:54 -0700)
author Xiaolin Zhao <zhaoxiaolin@loongson.cn>
Wed, 9 Apr 2025 02:10:16 +0000 (10:10 +0800)
committer abner chenc <chenguoqi@loongson.cn>
Tue, 15 Apr 2025 11:54:59 +0000 (04:54 -0700)
diff --git a/src/go/build/deps_test.go b/src/go/build/deps_test.go

index c355cb42f7ec3bfe02a0272002da2fa58073d552..a1a47487e131ca582790addf50b2ed69a4c5267e 100644 (file)
--- a/src/go/build/deps_test.go
+++ b/src/go/build/deps_test.go
@@ -68,7 +68,7 @@ var depsRules = `
           unicode/utf16;
  
         internal/goarch < internal/abi;
-       internal/byteorder, internal/goarch < internal/chacha8rand;
+       internal/byteorder, internal/cpu, internal/goarch < internal/chacha8rand;
  
         # RUNTIME is the core runtime group of packages, all of them very light-weight.
         internal/abi,
diff --git a/src/internal/chacha8rand/chacha8.go b/src/internal/chacha8rand/chacha8.go

index 96fb8726cb367930b0622e835ab2851214fbd122..14a3c04d018420a926376652007cc97dbac743a5 100644 (file)
--- a/src/internal/chacha8rand/chacha8.go
+++ b/src/internal/chacha8rand/chacha8.go
@@ -7,7 +7,16 @@
  // and must have minimal dependencies.
  package chacha8rand
  
-import "internal/byteorder"
+import (
+       "internal/byteorder"
+       "internal/cpu"
+       "unsafe"
+)
+
+// Offsets into internal/cpu records for use in assembly.
+const (
+       offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX) // referenced from chacha8_loong64.s as const_offsetLOONG64HasLSX
+)
  
  const (
         ctrInc = 4  // increment counter by 4 between block calls
diff --git a/src/internal/chacha8rand/chacha8_loong64.s b/src/internal/chacha8rand/chacha8_loong64.s

new file mode 100644 (file)

index 0000000..caa1426
--- /dev/null
+++ b/src/internal/chacha8rand/chacha8_loong64.s
@@ -0,0 +1,145 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+DATA   ·chachaConst+0x00(SB)/4, $0x61707865 // "expa" when the word is stored little-endian
+DATA   ·chachaConst+0x04(SB)/4, $0x3320646e // "nd 3"
+DATA   ·chachaConst+0x08(SB)/4, $0x79622d32 // "2-by"
+DATA   ·chachaConst+0x0c(SB)/4, $0x6b206574 // "te k"
+GLOBL  ·chachaConst(SB), NOPTR|RODATA, $32 // NOTE(review): declared as 32 bytes but only the first 16 are initialized above; confirm the size is intentional
+
+DATA   ·chachaIncRot+0x00(SB)/4, $0x00000000 // four 32-bit values 0, 1, 2, 3; block loads them as one vector and adds them to the counter
+DATA   ·chachaIncRot+0x04(SB)/4, $0x00000001
+DATA   ·chachaIncRot+0x08(SB)/4, $0x00000002
+DATA   ·chachaIncRot+0x0c(SB)/4, $0x00000003
+GLOBL  ·chachaIncRot(SB), NOPTR|RODATA, $32 // NOTE(review): declared as 32 bytes but only the first 16 are initialized above; confirm the size is intentional
+
+// QR is the ChaCha8 quarter-round on vector registers a, b, c, and d. VROTRW rotates each 32-bit lane right, so $16, $20, $24, $25 equal left-rotations by 16, 12, 8, 7.
+#define QR(a, b, c, d) \
+       VADDW   a, b, a; \
+       VXORV   d, a, d; \
+       VROTRW  $16, d; \
+       VADDW   c, d, c; \
+       VXORV   b, c, b; \
+       VROTRW  $20, b; \
+       VADDW   a, b, a; \
+       VXORV   d, a, d; \
+       VROTRW  $24, d; \
+       VADDW   c, d, c; \
+       VXORV   b, c, b; \
+       VROTRW  $25, b
+
+
+// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32)
+TEXT ·block<ABIInternal>(SB), NOSPLIT, $0
+       // seed in R4 (pointer to the 8 seed words)
+       // blocks in R5 (pointer to the 4x16-word output buffer)
+       // counter in R6 (32-bit block counter)
+
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7 // R7 = cpu.Loong64.HasLSX
+       BNE     R7, lsx_chacha8 // HasLSX is non-zero: use the LSX vector path below
+       JMP     ·block_generic<ABIInternal>(SB) // otherwise hand off to block_generic with the argument registers untouched
+       RET // not reached: the JMP above is unconditional
+
+lsx_chacha8:
+       MOVV    $·chachaConst(SB), R10
+       MOVV    $·chachaIncRot(SB), R11
+
+       // load constants: each instruction below is emitted as a raw WORD, with its mnemonic given in the comment above it
+       // VLDREPL.W  $0, R10, V0
+       WORD    $0x30200140
+       // VLDREPL.W  $1, R10, V1
+       WORD    $0x30200541
+       // VLDREPL.W  $2, R10, V2
+       WORD    $0x30200942
+       // VLDREPL.W  $3, R10, V3
+       WORD    $0x30200d43
+
+       // load the four 32-bit lane offsets {0, 1, 2, 3} from chachaIncRot; they are added to the counter below
+       VMOVQ   (R11), V30
+
+       // load seed: seed words 0-7 go into V4-V11, one word per register
+       // VLDREPL.W  $0, R4, V4
+       WORD    $0x30200084
+       // VLDREPL.W  $1, R4, V5
+       WORD    $0x30200485
+       // VLDREPL.W  $2, R4, V6
+       WORD    $0x30200886
+       // VLDREPL.W  $3, R4, V7
+       WORD    $0x30200c87
+       // VLDREPL.W  $4, R4, V8
+       WORD    $0x30201088
+       // VLDREPL.W  $5, R4, V9
+       WORD    $0x30201489
+       // VLDREPL.W  $6, R4, V10
+       WORD    $0x3020188a
+       // VLDREPL.W  $7, R4, V11
+       WORD    $0x30201c8b
+
+       // copy counter into all four 32-bit lanes of V12, then add the lane offsets so the lanes hold counter+0 .. counter+3
+       VMOVQ   R6, V12.W4
+       VADDW   V12, V30, V12
+
+       // zeros for remaining three matrix entries
+       VXORV   V13, V13, V13
+       VXORV   V14, V14, V14
+       VXORV   V15, V15, V15
+
+       // save seed state for adding back later (OR with the zeroed V13 is a plain register copy)
+       VORV    V4, V13, V20
+       VORV    V5, V13, V21
+       VORV    V6, V13, V22
+       VORV    V7, V13, V23
+       VORV    V8, V13, V24
+       VORV    V9, V13, V25
+       VORV    V10, V13, V26
+       VORV    V11, V13, V27
+
+       // 4 iterations. Each iteration is 8 quarter-rounds: four on the columns, then four on the diagonals.
+       MOVV    $4, R7
+loop:
+       QR(V0, V4, V8, V12)
+       QR(V1, V5, V9, V13)
+       QR(V2, V6, V10, V14)
+       QR(V3, V7, V11, V15)
+
+       QR(V0, V5, V10, V15)
+       QR(V1, V6, V11, V12)
+       QR(V2, V7, V8, V13)
+       QR(V3, V4, V9, V14)
+
+       SUBV    $1, R7
+       BNE     R7, R0, loop
+
+       // add the saved seed words (V20-V27) back into V4-V11
+       VADDW   V4, V20, V4
+       VADDW   V5, V21, V5
+       VADDW   V6, V22, V6
+       VADDW   V7, V23, V7
+       VADDW   V8, V24, V8
+       VADDW   V9, V25, V9
+       VADDW   V10, V26, V10
+       VADDW   V11, V27, V11
+
+       // store V0-V15 to the output buffer, 16 bytes each at consecutive offsets (256 bytes in total)
+       VMOVQ   V0, (R5)
+       VMOVQ   V1, 16(R5)
+       VMOVQ   V2, 32(R5)
+       VMOVQ   V3, 48(R5)
+       VMOVQ   V4, 64(R5)
+       VMOVQ   V5, 80(R5)
+       VMOVQ   V6, 96(R5)
+       VMOVQ   V7, 112(R5)
+       VMOVQ   V8, 128(R5)
+       VMOVQ   V9, 144(R5)
+       VMOVQ   V10, 160(R5)
+       VMOVQ   V11, 176(R5)
+       VMOVQ   V12, 192(R5)
+       VMOVQ   V13, 208(R5)
+       VMOVQ   V14, 224(R5)
+       VMOVQ   V15, 240(R5)
+
+       RET
diff --git a/src/internal/chacha8rand/chacha8_stub.s b/src/internal/chacha8rand/chacha8_stub.s

index 09be558fcb5d64bdc86ae903c9da3b591a3fd868..92858c118f4c0d41316cb4a8b13a4ef998fe392f 100644 (file)
--- a/src/internal/chacha8rand/chacha8_stub.s
+++ b/src/internal/chacha8rand/chacha8_stub.s
@@ -2,7 +2,7 @@
  // Use of this source code is governed by a BSD-style
  // license that can be found in the LICENSE file.
  
-//go:build !amd64 && !arm64
+//go:build !amd64 && !arm64 && !loong64
  
  #include "textflag.h"
author	Xiaolin Zhao <zhaoxiaolin@loongson.cn>
	Wed, 9 Apr 2025 02:10:16 +0000 (10:10 +0800)
committer	abner chenc <chenguoqi@loongson.cn>
	Tue, 15 Apr 2025 11:54:59 +0000 (04:54 -0700)
src/go/build/deps_test.go		patch \| blob \| history
src/internal/chacha8rand/chacha8.go		patch \| blob \| history
src/internal/chacha8rand/chacha8_loong64.s	[new file with mode: 0644]	patch \| blob
src/internal/chacha8rand/chacha8_stub.s		patch \| blob \| history