From b8ed752d6f2bd4d7edb48c8614a133182e7646ee Mon Sep 17 00:00:00 2001 From: Xiaolin Zhao Date: Wed, 9 Apr 2025 10:10:16 +0800 Subject: [PATCH] internal/chacha8rand: implement func block in assembly MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Benchmark result on Loongson-3A6000: goos: linux goarch: loong64 pkg: math/rand/v2 + internal/chacha8rand cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | ChaCha8MarshalBinary 67.39n ± 0% 65.96n ± 0% -2.12% (p=0.000 n=10) ChaCha8MarshalBinaryRead 80.93n ± 0% 78.31n ± 0% -3.23% (p=0.000 n=10) ChaCha8 10.610n ± 0% 5.129n ± 0% -51.66% (p=0.000 n=10) ChaCha8Read 51.30n ± 0% 28.05n ± 0% -45.31% (p=0.000 n=10) Block 218.50n ± 0% 45.48n ± 0% -79.19% (p=0.000 n=10) geomean 57.86n 32.05n -44.62% Benchmark result on Loongson-3A5000: goos: linux goarch: loong64 pkg: math/rand/v2 + internal/chacha8rand cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | ChaCha8MarshalBinary 116.3n ± 0% 116.6n ± 0% +0.26% (p=0.015 n=10) ChaCha8MarshalBinaryRead 142.6n ± 0% 142.0n ± 0% -0.39% (p=0.000 n=10) ChaCha8 16.270n ± 0% 6.848n ± 0% -57.91% (p=0.000 n=10) ChaCha8Read 78.65n ± 0% 47.39n ± 1% -39.74% (p=0.000 n=10) Block 301.50n ± 0% 91.85n ± 0% -69.53% (p=0.000 n=10) geomean 91.45n 54.79n -40.09% Change-Id: I64d80c81d2df288fecff80ae23ef89f0fb54cdfa Reviewed-on: https://go-review.googlesource.com/c/go/+/664035 Reviewed-by: Michael Pratt Reviewed-by: abner chenc Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI --- src/go/build/deps_test.go | 2 +- src/internal/chacha8rand/chacha8.go | 11 +- src/internal/chacha8rand/chacha8_loong64.s | 145 +++++++++++++++++++++ src/internal/chacha8rand/chacha8_stub.s | 2 +- 4 files changed, 157 insertions(+), 3 deletions(-) create mode 100644 src/internal/chacha8rand/chacha8_loong64.s diff --git a/src/go/build/deps_test.go b/src/go/build/deps_test.go index c355cb42f7..a1a47487e1 100644 --- a/src/go/build/deps_test.go +++ b/src/go/build/deps_test.go @@ -68,7 +68,7 @@ var depsRules = ` unicode/utf16; internal/goarch < internal/abi; - internal/byteorder, internal/goarch < internal/chacha8rand; + internal/byteorder, internal/cpu, internal/goarch < internal/chacha8rand; # RUNTIME is the core runtime group of packages, all of them very light-weight. internal/abi, diff --git a/src/internal/chacha8rand/chacha8.go b/src/internal/chacha8rand/chacha8.go index 96fb8726cb..14a3c04d01 100644 --- a/src/internal/chacha8rand/chacha8.go +++ b/src/internal/chacha8rand/chacha8.go @@ -7,7 +7,16 @@ // and must have minimal dependencies. package chacha8rand -import "internal/byteorder" +import ( + "internal/byteorder" + "internal/cpu" + "unsafe" +) + +// Offsets into internal/cpu records for use in assembly. +const ( + offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX) +) const ( ctrInc = 4 // increment counter by 4 between block calls diff --git a/src/internal/chacha8rand/chacha8_loong64.s b/src/internal/chacha8rand/chacha8_loong64.s new file mode 100644 index 0000000000..caa1426a05 --- /dev/null +++ b/src/internal/chacha8rand/chacha8_loong64.s @@ -0,0 +1,145 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +DATA ·chachaConst+0x00(SB)/4, $0x61707865 +DATA ·chachaConst+0x04(SB)/4, $0x3320646e +DATA ·chachaConst+0x08(SB)/4, $0x79622d32 +DATA ·chachaConst+0x0c(SB)/4, $0x6b206574 +GLOBL ·chachaConst(SB), NOPTR|RODATA, $32 + +DATA ·chachaIncRot+0x00(SB)/4, $0x00000000 +DATA ·chachaIncRot+0x04(SB)/4, $0x00000001 +DATA ·chachaIncRot+0x08(SB)/4, $0x00000002 +DATA ·chachaIncRot+0x0c(SB)/4, $0x00000003 +GLOBL ·chachaIncRot(SB), NOPTR|RODATA, $32 + +// QR is the ChaCha8 quarter-round on a, b, c, and d. +#define QR(a, b, c, d) \ + VADDW a, b, a; \ + VXORV d, a, d; \ + VROTRW $16, d; \ + VADDW c, d, c; \ + VXORV b, c, b; \ + VROTRW $20, b; \ + VADDW a, b, a; \ + VXORV d, a, d; \ + VROTRW $24, d; \ + VADDW c, d, c; \ + VXORV b, c, b; \ + VROTRW $25, b + + +// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32) +TEXT ·block(SB), NOSPLIT, $0 + // seed in R4 + // blocks in R5 + // counter in R6 + + MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7 + BNE R7, lsx_chacha8 + JMP ·block_generic(SB) + RET + +lsx_chacha8: + MOVV $·chachaConst(SB), R10 + MOVV $·chachaIncRot(SB), R11 + + // load contants + // VLDREPL.W $0, R10, V0 + WORD $0x30200140 + // VLDREPL.W $1, R10, V1 + WORD $0x30200541 + // VLDREPL.W $2, R10, V2 + WORD $0x30200942 + // VLDREPL.W $3, R10, V3 + WORD $0x30200d43 + + // load 4-32bit data from incRotMatrix added to counter + VMOVQ (R11), V30 + + // load seed + // VLDREPL.W $0, R4, V4 + WORD $0x30200084 + // VLDREPL.W $1, R4, V5 + WORD $0x30200485 + // VLDREPL.W $2, R4, V6 + WORD $0x30200886 + // VLDREPL.W $3, R4, V7 + WORD $0x30200c87 + // VLDREPL.W $4, R4, V8 + WORD $0x30201088 + // VLDREPL.W $5, R4, V9 + WORD $0x30201489 + // VLDREPL.W $6, R4, V10 + WORD $0x3020188a + // VLDREPL.W $7, R4, V11 + WORD $0x30201c8b + + // load counter and update counter + VMOVQ R6, V12.W4 + VADDW V12, V30, V12 + + // zeros for remaining three matrix entries + VXORV V13, V13, V13 + VXORV V14, V14, V14 + VXORV V15, V15, V15 + + // save seed state for adding back later + VORV V4, V13, V20 + VORV V5, V13, V21 + VORV V6, V13, V22 + VORV V7, V13, V23 + VORV V8, V13, V24 + VORV V9, V13, V25 + VORV V10, V13, V26 + VORV V11, V13, V27 + + // 4 iterations. Each iteration is 8 quarter-rounds. + MOVV $4, R7 +loop: + QR(V0, V4, V8, V12) + QR(V1, V5, V9, V13) + QR(V2, V6, V10, V14) + QR(V3, V7, V11, V15) + + QR(V0, V5, V10, V15) + QR(V1, V6, V11, V12) + QR(V2, V7, V8, V13) + QR(V3, V4, V9, V14) + + SUBV $1, R7 + BNE R7, R0, loop + + // add seed back + VADDW V4, V20, V4 + VADDW V5, V21, V5 + VADDW V6, V22, V6 + VADDW V7, V23, V7 + VADDW V8, V24, V8 + VADDW V9, V25, V9 + VADDW V10, V26, V10 + VADDW V11, V27, V11 + + // store blocks back to output buffer + VMOVQ V0, (R5) + VMOVQ V1, 16(R5) + VMOVQ V2, 32(R5) + VMOVQ V3, 48(R5) + VMOVQ V4, 64(R5) + VMOVQ V5, 80(R5) + VMOVQ V6, 96(R5) + VMOVQ V7, 112(R5) + VMOVQ V8, 128(R5) + VMOVQ V9, 144(R5) + VMOVQ V10, 160(R5) + VMOVQ V11, 176(R5) + VMOVQ V12, 192(R5) + VMOVQ V13, 208(R5) + VMOVQ V14, 224(R5) + VMOVQ V15, 240(R5) + + RET diff --git a/src/internal/chacha8rand/chacha8_stub.s b/src/internal/chacha8rand/chacha8_stub.s index 09be558fcb..92858c118f 100644 --- a/src/internal/chacha8rand/chacha8_stub.s +++ b/src/internal/chacha8rand/chacha8_stub.s @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build !amd64 && !arm64 +//go:build !amd64 && !arm64 && !loong64 #include "textflag.h" -- 2.50.0