From 4d10aba35eebe9cb4a0b6627815dc1fbddc97100 Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Sun, 3 Mar 2019 14:48:50 -0800 Subject: [PATCH] math/big: add fast path for amd64 addVW for large z MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This matches the pure Go fast path added in the previous commit. I will leave other architectures to those with ready access to hardware. name old time/op new time/op delta AddVW/1-8 3.60ns ± 3% 3.59ns ± 1% ~ (p=0.147 n=91+86) AddVW/2-8 3.92ns ± 1% 3.91ns ± 2% -0.36% (p=0.000 n=86+92) AddVW/3-8 4.33ns ± 5% 4.46ns ± 5% +2.94% (p=0.000 n=96+97) AddVW/4-8 4.76ns ± 5% 4.82ns ± 5% +1.28% (p=0.000 n=95+92) AddVW/5-8 5.40ns ± 1% 5.42ns ± 0% +0.47% (p=0.000 n=76+71) AddVW/10-8 8.03ns ± 1% 7.80ns ± 5% -2.90% (p=0.000 n=73+96) AddVW/100-8 43.8ns ± 5% 17.9ns ± 1% -59.12% (p=0.000 n=94+81) AddVW/1000-8 428ns ± 4% 85ns ± 6% -80.20% (p=0.000 n=96+99) AddVW/10000-8 4.22µs ± 2% 1.80µs ± 3% -57.32% (p=0.000 n=69+92) AddVW/100000-8 44.8µs ± 8% 31.5µs ± 3% -29.76% (p=0.000 n=99+90) name old time/op new time/op delta SubVW/1-8 3.53ns ± 2% 3.63ns ± 5% +2.97% (p=0.000 n=94+93) SubVW/2-8 4.33ns ± 5% 4.01ns ± 2% -7.36% (p=0.000 n=90+85) SubVW/3-8 4.32ns ± 2% 4.32ns ± 5% ~ (p=0.084 n=87+97) SubVW/4-8 4.70ns ± 2% 4.83ns ± 6% +2.77% (p=0.000 n=85+96) SubVW/5-8 5.84ns ± 1% 5.35ns ± 1% -8.35% (p=0.000 n=87+87) SubVW/10-8 8.01ns ± 4% 7.54ns ± 4% -5.84% (p=0.000 n=98+97) SubVW/100-8 43.9ns ± 5% 17.9ns ± 1% -59.20% (p=0.000 n=98+76) SubVW/1000-8 426ns ± 2% 85ns ± 3% -80.13% (p=0.000 n=90+98) SubVW/10000-8 4.24µs ± 2% 1.81µs ± 3% -57.28% (p=0.000 n=74+91) SubVW/100000-8 44.5µs ± 4% 31.5µs ± 2% -29.33% (p=0.000 n=84+91) Change-Id: I10dd361cbaca22197c27e7734c0f50065292afbb Reviewed-on: https://go-review.googlesource.com/c/go/+/164969 Run-TryBot: Josh Bleecher Snyder TryBot-Result: Gobot Gobot Reviewed-by: Robert Griesemer --- src/math/big/arith_amd64.s | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s index e9c8887523..a0d1660f51 100644 --- a/src/math/big/arith_amd64.s +++ b/src/math/big/arith_amd64.s @@ -143,6 +143,8 @@ E2: NEGQ CX // func addVW(z, x []Word, y Word) (c Word) TEXT ·addVW(SB),NOSPLIT,$0 MOVQ z_len+8(FP), DI + CMPQ DI, $32 + JG large MOVQ x+24(FP), R8 MOVQ y+48(FP), CX // c = y MOVQ z+0(FP), R10 @@ -189,12 +191,16 @@ L3: // n > 0 E3: MOVQ CX, c+56(FP) // return c RET +large: + JMP ·addVWlarge(SB) // func subVW(z, x []Word, y Word) (c Word) // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names) TEXT ·subVW(SB),NOSPLIT,$0 MOVQ z_len+8(FP), DI + CMPQ DI, $32 + JG large MOVQ x+24(FP), R8 MOVQ y+48(FP), CX // c = y MOVQ z+0(FP), R10 @@ -242,6 +248,8 @@ L4: // n > 0 E4: MOVQ CX, c+56(FP) // return c RET +large: + JMP ·subVWlarge(SB) // func shlVU(z, x []Word, s uint) (c Word) -- 2.50.0