From 6ba91df153dfe955f38e326c18ee38999571837b Mon Sep 17 00:00:00 2001 From: Xiaolin Zhao Date: Tue, 31 Dec 2024 21:02:47 +0800 Subject: [PATCH] math: implement func archExp and archExp2 in assembly on loong64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit goos: linux goarch: loong64 pkg: math cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Exp 26.30n ± 0% 12.93n ± 0% -50.85% (p=0.000 n=10) ExpGo 26.86n ± 0% 26.92n ± 0% +0.22% (p=0.000 n=10) Expm1 16.76n ± 0% 16.75n ± 0% ~ (p=0.060 n=10) Exp2 23.05n ± 0% 12.12n ± 0% -47.42% (p=0.000 n=10) Exp2Go 23.41n ± 0% 23.47n ± 0% +0.28% (p=0.000 n=10) geomean 22.97n 17.54n -23.64% goos: linux goarch: loong64 pkg: math/cmplx cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Exp 51.32n ± 0% 35.41n ± 0% -30.99% (p=0.000 n=10) goos: linux goarch: loong64 pkg: math cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Exp 50.27n ± 0% 48.75n ± 1% -3.01% (p=0.000 n=10) ExpGo 50.72n ± 0% 50.44n ± 0% -0.55% (p=0.000 n=10) Expm1 28.40n ± 0% 28.32n ± 0% ~ (p=0.360 n=10) Exp2 50.09n ± 0% 21.49n ± 1% -57.10% (p=0.000 n=10) Exp2Go 50.05n ± 0% 49.69n ± 0% -0.72% (p=0.000 n=10) geomean 44.85n 37.52n -16.35% goos: linux goarch: loong64 pkg: math/cmplx cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Exp 88.56n ± 0% 67.29n ± 0% -24.03% (p=0.000 n=10) Change-Id: I89e456d26fc075d83335ee4a31227d2aface5714 Reviewed-on: https://go-review.googlesource.com/c/go/+/653935 Reviewed-by: Junyang Shao Reviewed-by: abner chenc LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Pratt --- src/math/exp2_asm.go | 2 +- src/math/exp2_noasm.go | 2 +- src/math/exp_asm.go | 2 +- src/math/exp_loong64.s | 236 +++++++++++++++++++++++++++++++++++++++++ src/math/exp_noasm.go | 2 +- 5 files changed, 240 insertions(+), 4 deletions(-) create mode 100644 src/math/exp_loong64.s diff --git a/src/math/exp2_asm.go b/src/math/exp2_asm.go index c26b2c3fab..1e78759374 100644 --- a/src/math/exp2_asm.go +++ b/src/math/exp2_asm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build arm64 +//go:build arm64 || loong64 package math diff --git a/src/math/exp2_noasm.go b/src/math/exp2_noasm.go index c2b409329f..847138b622 100644 --- a/src/math/exp2_noasm.go +++ b/src/math/exp2_noasm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build !arm64 +//go:build !arm64 && !loong64 package math diff --git a/src/math/exp_asm.go b/src/math/exp_asm.go index 424442845b..125529fca3 100644 --- a/src/math/exp_asm.go +++ b/src/math/exp_asm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build amd64 || arm64 || s390x +//go:build amd64 || arm64 || loong64 || s390x package math diff --git a/src/math/exp_loong64.s b/src/math/exp_loong64.s new file mode 100644 index 0000000000..bf2823f888 --- /dev/null +++ b/src/math/exp_loong64.s @@ -0,0 +1,236 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +#define NearZero 0x3e30000000000000 // 2**-28 +#define PosInf 0x7ff0000000000000 +#define FracMask 0x000fffffffffffff +#define C1 0x3cb0000000000000 // 2**-52 + +DATA exprodata<>+0(SB)/8, $0.0 +DATA exprodata<>+8(SB)/8, $0.5 +DATA exprodata<>+16(SB)/8, $1.0 +DATA exprodata<>+24(SB)/8, $2.0 +DATA exprodata<>+32(SB)/8, $6.93147180369123816490e-01 // Ln2Hi +DATA exprodata<>+40(SB)/8, $1.90821492927058770002e-10 // Ln2Lo +DATA exprodata<>+48(SB)/8, $1.44269504088896338700e+00 // Log2e +DATA exprodata<>+56(SB)/8, $7.09782712893383973096e+02 // Overflow +DATA exprodata<>+64(SB)/8, $-7.45133219101941108420e+02 // Underflow +DATA exprodata<>+72(SB)/8, $1.0239999999999999e+03 // Overflow2 +DATA exprodata<>+80(SB)/8, $-1.0740e+03 // Underflow2 +DATA exprodata<>+88(SB)/8, $3.7252902984619141e-09 // NearZero +GLOBL exprodata<>+0(SB), NOPTR|RODATA, $96 + +DATA expmultirodata<>+0(SB)/8, $1.66666666666666657415e-01 // P1 +DATA expmultirodata<>+8(SB)/8, $-2.77777777770155933842e-03 // P2 +DATA expmultirodata<>+16(SB)/8, $6.61375632143793436117e-05 // P3 +DATA expmultirodata<>+24(SB)/8, $-1.65339022054652515390e-06 // P4 +DATA expmultirodata<>+32(SB)/8, $4.13813679705723846039e-08 // P5 +GLOBL expmultirodata<>+0(SB), NOPTR|RODATA, $40 + +// Exp returns e**x, the base-e exponential of x. +// This is an assembly implementation of the method used for function Exp in file exp.go. +// +// func Exp(x float64) float64 +TEXT ·archExp(SB),$0-16 + MOVD x+0(FP), F0 // F0 = x + + MOVV $exprodata<>+0(SB), R10 + MOVD 56(R10), F1 // Overflow + MOVD 64(R10), F2 // Underflow + MOVD 88(R10), F3 // NearZero + MOVD 16(R10), F17 // 1.0 + + CMPEQD F0, F0, FCC0 + BFPF isNaN // x = NaN, return NaN + + CMPGTD F0, F1, FCC0 + BFPT overflow // x > Overflow, return PosInf + + CMPGTD F2, F0, FCC0 + BFPT underflow // x < Underflow, return 0 + + ABSD F0, F5 + CMPGTD F3, F5, FCC0 + BFPT nearzero // fabs(x) < NearZero, return 1 + x + + // argument reduction, x = k*ln2 + r, |r| <= 0.5*ln2 + // computed as r = hi - lo for extra precision. + MOVD 0(R10), F5 + MOVD 8(R10), F3 + MOVD 48(R10), F2 + CMPGTD F0, F5, FCC0 + BFPT add // x > 0 +sub: + FMSUBD F3, F2, F0, F3 // Log2e*x - 0.5 + JMP 2(PC) +add: + FMADDD F3, F2, F0, F3 // Log2e*x + 0.5 + + FTINTRZVD F3, F4 // float64 -> int64 + MOVV F4, R5 // R5 = int(k) + FFINTDV F4, F3 // int64 -> float64 + + MOVD 32(R10), F4 + MOVD 40(R10), F5 + FNMSUBD F0, F3, F4, F4 + MULD F3, F5, F5 + SUBD F5, F4, F6 + MULD F6, F6, F7 + + // compute c + MOVV $expmultirodata<>+0(SB), R11 + MOVD 32(R11), F8 + MOVD 24(R11), F9 + FMADDD F9, F8, F7, F13 + MOVD 16(R11), F10 + FMADDD F10, F13, F7, F13 + MOVD 8(R11), F11 + FMADDD F11, F13, F7, F13 + MOVD 0(R11), F12 + FMADDD F12, F13, F7, F13 + FNMSUBD F6, F13, F7, F13 + + // compute y + MOVD 24(R10), F14 + SUBD F13, F14, F14 + MULD F6, F13, F15 + DIVD F14, F15, F15 + SUBD F15, F5, F15 + SUBD F4, F15, F15 + SUBD F15, F17, F16 + + // inline Ldexp(y, k), benefit: + // 1, no parameter pass overhead. + // 2, skip unnecessary checks for Inf/NaN/Zero + MOVV F16, R4 + MOVV $FracMask, R9 + AND R9, R4, R6 // fraction + SRLV $52, R4, R7 // exponent + ADDV R5, R7 + MOVV $1, R12 + BGE R7, R12, normal + ADDV $52, R7 // denormal + MOVV $C1, R8 + MOVV R8, F17 +normal: + SLLV $52, R7 + OR R7, R6, R4 + MOVV R4, F0 + MULD F17, F0 // return m * x + MOVD F0, ret+8(FP) + RET +nearzero: + ADDD F17, F0, F0 +isNaN: + MOVD F0, ret+8(FP) + RET +underflow: + MOVV R0, ret+8(FP) + RET +overflow: + MOVV $PosInf, R4 + MOVV R4, ret+8(FP) + RET + + +// Exp2 returns 2**x, the base-2 exponential of x. +// This is an assembly implementation of the method used for function Exp2 in file exp.go. +// +// func Exp2(x float64) float64 +TEXT ·archExp2(SB),$0-16 + MOVD x+0(FP), F0 // F0 = x + + MOVV $exprodata<>+0(SB), R10 + MOVD 72(R10), F1 // Overflow2 + MOVD 80(R10), F2 // Underflow2 + MOVD 88(R10), F3 // NearZero + + CMPEQD F0, F0, FCC0 + BFPF isNaN // x = NaN, return NaN + + CMPGTD F0, F1, FCC0 + BFPT overflow // x > Overflow, return PosInf + + CMPGTD F2, F0, FCC0 + BFPT underflow // x < Underflow, return 0 + + // argument reduction; x = r*lg(e) + k with |r| <= ln(2)/2 + // computed as r = hi - lo for extra precision. + MOVD 0(R10), F10 + MOVD 8(R10), F2 + CMPGTD F0, F10, FCC0 + BFPT add +sub: + SUBD F2, F0, F3 // x - 0.5 + JMP 2(PC) +add: + ADDD F2, F0, F3 // x + 0.5 + + FTINTRZVD F3, F4 + MOVV F4, R5 + FFINTDV F4, F3 + + MOVD 32(R10), F4 + MOVD 40(R10), F5 + SUBD F3, F0, F3 + MULD F3, F4 + FNMSUBD F10, F3, F5, F5 + SUBD F5, F4, F6 + MULD F6, F6, F7 + + // compute c + MOVV $expmultirodata<>+0(SB), R11 + MOVD 32(R11), F8 + MOVD 24(R11), F9 + FMADDD F9, F8, F7, F13 + MOVD 16(R11), F10 + FMADDD F10, F13, F7, F13 + MOVD 8(R11), F11 + FMADDD F11, F13, F7, F13 + MOVD 0(R11), F12 + FMADDD F12, F13, F7, F13 + FNMSUBD F6, F13, F7, F13 + + // compute y + MOVD 24(R10), F14 + SUBD F13, F14, F14 + MULD F6, F13, F15 + DIVD F14, F15 + + MOVD 16(R10), F17 + SUBD F15, F5, F15 + SUBD F4, F15, F15 + SUBD F15, F17, F16 + + // inline Ldexp(y, k), benefit: + // 1, no parameter pass overhead. + // 2, skip unnecessary checks for Inf/NaN/Zero + MOVV F16, R4 + MOVV $FracMask, R9 + SRLV $52, R4, R7 // exponent + AND R9, R4, R6 // fraction + ADDV R5, R7 + MOVV $1, R12 + BGE R7, R12, normal + + ADDV $52, R7 // denormal + MOVV $C1, R8 + MOVV R8, F17 +normal: + SLLV $52, R7 + OR R7, R6, R4 + MOVV R4, F0 + MULD F17, F0 +isNaN: + MOVD F0, ret+8(FP) + RET +underflow: + MOVV R0, ret+8(FP) + RET +overflow: + MOVV $PosInf, R4 + MOVV R4, ret+8(FP) + RET diff --git a/src/math/exp_noasm.go b/src/math/exp_noasm.go index bd3f02412a..bf5e84b736 100644 --- a/src/math/exp_noasm.go +++ b/src/math/exp_noasm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build !amd64 && !arm64 && !s390x +//go:build !amd64 && !arm64 && !loong64 && !s390x package math -- 2.51.0