]> Cypherpunks repositories - gostls13.git/commitdiff
internal/chacha8rand: implement func block in assembly
authorXiaolin Zhao <zhaoxiaolin@loongson.cn>
Wed, 9 Apr 2025 02:10:16 +0000 (10:10 +0800)
committerabner chenc <chenguoqi@loongson.cn>
Tue, 15 Apr 2025 11:54:59 +0000 (04:54 -0700)
Benchmark result on Loongson-3A6000:
goos: linux
goarch: loong64
pkg: math/rand/v2 + internal/chacha8rand
cpu: Loongson-3A6000-HV @ 2500.00MHz
                         |  bench.old   |              bench.new              |
                         |    sec/op    |   sec/op     vs base                |
ChaCha8MarshalBinary        67.39n ± 0%   65.96n ± 0%   -2.12% (p=0.000 n=10)
ChaCha8MarshalBinaryRead    80.93n ± 0%   78.31n ± 0%   -3.23% (p=0.000 n=10)
ChaCha8                    10.610n ± 0%   5.129n ± 0%  -51.66% (p=0.000 n=10)
ChaCha8Read                 51.30n ± 0%   28.05n ± 0%  -45.31% (p=0.000 n=10)
Block                      218.50n ± 0%   45.48n ± 0%  -79.19% (p=0.000 n=10)
geomean                     57.86n        32.05n       -44.62%

Benchmark result on Loongson-3A5000:
goos: linux
goarch: loong64
pkg: math/rand/v2 + internal/chacha8rand
cpu: Loongson-3A5000 @ 2500.00MHz
                         |  bench.old   |              bench.new              |
                         |    sec/op    |   sec/op     vs base                |
ChaCha8MarshalBinary        116.3n ± 0%   116.6n ± 0%   +0.26% (p=0.015 n=10)
ChaCha8MarshalBinaryRead    142.6n ± 0%   142.0n ± 0%   -0.39% (p=0.000 n=10)
ChaCha8                    16.270n ± 0%   6.848n ± 0%  -57.91% (p=0.000 n=10)
ChaCha8Read                 78.65n ± 0%   47.39n ± 1%  -39.74% (p=0.000 n=10)
Block                      301.50n ± 0%   91.85n ± 0%  -69.53% (p=0.000 n=10)
geomean                     91.45n        54.79n       -40.09%

Change-Id: I64d80c81d2df288fecff80ae23ef89f0fb54cdfa
Reviewed-on: https://go-review.googlesource.com/c/go/+/664035
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/go/build/deps_test.go
src/internal/chacha8rand/chacha8.go
src/internal/chacha8rand/chacha8_loong64.s [new file with mode: 0644]
src/internal/chacha8rand/chacha8_stub.s

index c355cb42f7ec3bfe02a0272002da2fa58073d552..a1a47487e131ca582790addf50b2ed69a4c5267e 100644 (file)
@@ -68,7 +68,7 @@ var depsRules = `
          unicode/utf16;
 
        internal/goarch < internal/abi;
-       internal/byteorder, internal/goarch < internal/chacha8rand;
+       internal/byteorder, internal/cpu, internal/goarch < internal/chacha8rand;
 
        # RUNTIME is the core runtime group of packages, all of them very light-weight.
        internal/abi,
index 96fb8726cb367930b0622e835ab2851214fbd122..14a3c04d018420a926376652007cc97dbac743a5 100644 (file)
@@ -7,7 +7,16 @@
 // and must have minimal dependencies.
 package chacha8rand
 
-import "internal/byteorder"
+import (
+       "internal/byteorder"
+       "internal/cpu"
+       "unsafe"
+)
+
+// Offsets into internal/cpu records for use in assembly.
+const (
+       offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX)
+)
 
 const (
        ctrInc = 4  // increment counter by 4 between block calls
diff --git a/src/internal/chacha8rand/chacha8_loong64.s b/src/internal/chacha8rand/chacha8_loong64.s
new file mode 100644 (file)
index 0000000..caa1426
--- /dev/null
@@ -0,0 +1,145 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+DATA   ·chachaConst+0x00(SB)/4, $0x61707865
+DATA   ·chachaConst+0x04(SB)/4, $0x3320646e
+DATA   ·chachaConst+0x08(SB)/4, $0x79622d32
+DATA   ·chachaConst+0x0c(SB)/4, $0x6b206574
+GLOBL  ·chachaConst(SB), NOPTR|RODATA, $32
+
+DATA   ·chachaIncRot+0x00(SB)/4, $0x00000000
+DATA   ·chachaIncRot+0x04(SB)/4, $0x00000001
+DATA   ·chachaIncRot+0x08(SB)/4, $0x00000002
+DATA   ·chachaIncRot+0x0c(SB)/4, $0x00000003
+GLOBL  ·chachaIncRot(SB), NOPTR|RODATA, $32
+
+// QR is the ChaCha8 quarter-round on a, b, c, and d.
+#define QR(a, b, c, d) \
+       VADDW   a, b, a; \
+       VXORV   d, a, d; \
+       VROTRW  $16, d; \
+       VADDW   c, d, c; \
+       VXORV   b, c, b; \
+       VROTRW  $20, b; \
+       VADDW   a, b, a; \
+       VXORV   d, a, d; \
+       VROTRW  $24, d; \
+       VADDW   c, d, c; \
+       VXORV   b, c, b; \
+       VROTRW  $25, b
+
+
+// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32)
+TEXT ·block<ABIInternal>(SB), NOSPLIT, $0
+       // seed in R4
+       // blocks in R5
+       // counter in R6
+
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
+       BNE     R7, lsx_chacha8
+       JMP     ·block_generic<ABIInternal>(SB)
+       RET
+
+lsx_chacha8:
+       MOVV    $·chachaConst(SB), R10
+       MOVV    $·chachaIncRot(SB), R11
+
+       // load contants
+       // VLDREPL.W  $0, R10, V0
+       WORD    $0x30200140
+       // VLDREPL.W  $1, R10, V1
+       WORD    $0x30200541
+       // VLDREPL.W  $2, R10, V2
+       WORD    $0x30200942
+       // VLDREPL.W  $3, R10, V3
+       WORD    $0x30200d43
+
+       // load 4-32bit data from incRotMatrix added to counter
+       VMOVQ   (R11), V30
+
+       // load seed
+       // VLDREPL.W  $0, R4, V4
+       WORD    $0x30200084
+       // VLDREPL.W  $1, R4, V5
+       WORD    $0x30200485
+       // VLDREPL.W  $2, R4, V6
+       WORD    $0x30200886
+       // VLDREPL.W  $3, R4, V7
+       WORD    $0x30200c87
+       // VLDREPL.W  $4, R4, V8
+       WORD    $0x30201088
+       // VLDREPL.W  $5, R4, V9
+       WORD    $0x30201489
+       // VLDREPL.W  $6, R4, V10
+       WORD    $0x3020188a
+       // VLDREPL.W  $7, R4, V11
+       WORD    $0x30201c8b
+
+       // load counter and update counter
+       VMOVQ   R6, V12.W4
+       VADDW   V12, V30, V12
+
+       // zeros for remaining three matrix entries
+       VXORV   V13, V13, V13
+       VXORV   V14, V14, V14
+       VXORV   V15, V15, V15
+
+       // save seed state for adding back later
+       VORV    V4, V13, V20
+       VORV    V5, V13, V21
+       VORV    V6, V13, V22
+       VORV    V7, V13, V23
+       VORV    V8, V13, V24
+       VORV    V9, V13, V25
+       VORV    V10, V13, V26
+       VORV    V11, V13, V27
+
+       // 4 iterations. Each iteration is 8 quarter-rounds.
+       MOVV    $4, R7
+loop:
+       QR(V0, V4, V8, V12)
+       QR(V1, V5, V9, V13)
+       QR(V2, V6, V10, V14)
+       QR(V3, V7, V11, V15)
+
+       QR(V0, V5, V10, V15)
+       QR(V1, V6, V11, V12)
+       QR(V2, V7, V8, V13)
+       QR(V3, V4, V9, V14)
+
+       SUBV    $1, R7
+       BNE     R7, R0, loop
+
+       // add seed back
+       VADDW   V4, V20, V4
+       VADDW   V5, V21, V5
+       VADDW   V6, V22, V6
+       VADDW   V7, V23, V7
+       VADDW   V8, V24, V8
+       VADDW   V9, V25, V9
+       VADDW   V10, V26, V10
+       VADDW   V11, V27, V11
+
+       // store blocks back to output buffer
+       VMOVQ   V0, (R5)
+       VMOVQ   V1, 16(R5)
+       VMOVQ   V2, 32(R5)
+       VMOVQ   V3, 48(R5)
+       VMOVQ   V4, 64(R5)
+       VMOVQ   V5, 80(R5)
+       VMOVQ   V6, 96(R5)
+       VMOVQ   V7, 112(R5)
+       VMOVQ   V8, 128(R5)
+       VMOVQ   V9, 144(R5)
+       VMOVQ   V10, 160(R5)
+       VMOVQ   V11, 176(R5)
+       VMOVQ   V12, 192(R5)
+       VMOVQ   V13, 208(R5)
+       VMOVQ   V14, 224(R5)
+       VMOVQ   V15, 240(R5)
+
+       RET
index 09be558fcb5d64bdc86ae903c9da3b591a3fd868..92858c118f4c0d41316cb4a8b13a4ef998fe392f 100644 (file)
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build !amd64 && !arm64
+//go:build !amd64 && !arm64 && !loong64
 
 #include "textflag.h"