]> Cypherpunks repositories - gostls13.git/commitdiff
hash/crc32: improve performance of ppc64SlicingUpdateBy8 on ppc64le
authorArchana R <aravind5@in.ibm.com>
Mon, 20 Sep 2021 04:50:02 +0000 (23:50 -0500)
committerLynn Boger <laboger@linux.vnet.ibm.com>
Wed, 22 Sep 2021 16:53:49 +0000 (16:53 +0000)
Reduce the number of instructions in the short loop of
ppc64SlicingUpdateBy8 function by combining MOVWZ and SRD into a SRD
with appropriate parameters performing the same operation and remove
MOVWZ R7,R7 from the loop
This change produces the following improvements on POWER9. None of the
other tests regress. Improvments on other POWERPC platforms similar.

name                                     old time/op    new time/op
delta

CRC32/poly=IEEE/size=15/align=0            80.5ns ± 0%    70.6ns ± 0%
-12%
CRC32/poly=IEEE/size=15/align=1            80.5ns ± 0%    70.6ns ± 0%
-12%
CRC32/poly=IEEE/size=512/align=1            151ns ± 0%     139ns ± 0%
-7%
CRC32/poly=IEEE/size=1kB/align=1            167ns ± 0%     155ns ± 0%
-7%
CRC32/poly=Castagnoli/size=15/align=0      80.2ns ± 0%    70.5ns ± 0%
-12%
CRC32/poly=Castagnoli/size=15/align=1      80.2ns ± 0%    70.5ns ± 0%
-12%
CRC32/poly=Castagnoli/size=512/align=1      150ns ± 0%     139ns ± 0%
-7%
CRC32/poly=Castagnoli/size=1kB/align=1      166ns ± 0%     155ns ± 0%
-6%

Change-Id: I424709041c30d1c637b595d0845e3ae78dc3e0a0
Reviewed-on: https://go-review.googlesource.com/c/go/+/350989
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Go Bot <gobot@golang.org>

src/hash/crc32/crc32_ppc64le.s

index 10d5dd61dbcb7791cb32377249c0bfaed77cc9fd..763d3270f30d264894acc9aa44c3c6b429cd33c9 100644 (file)
@@ -112,19 +112,17 @@ loop:
        ANDCC   $7,R6,R8        // any leftover bytes
        BEQ     done            // none --> done
        MOVD    R8,CTR          // byte count
-
+        PCALIGN $16             // align short loop
 short:
        MOVBZ   0(R5),R8        // get v
        MOVBZ   R7,R9           // byte(crc) -> R8 BE vs LE?
-       MOVWZ   R7,R14
-       SRD     $8,R14,R14      // crc>>8
+        SRD     $8,R7,R14       // crc>>8
        XOR     R8,R9,R8        // byte(crc)^v -> R8
        ADD     $1,R5           // ptr to next v
        SLD     $2,R8           // convert index-> bytes
        ADD     R8,R4,R9        // &tab[byte(crc)^v]
        MOVWZ   0(R9),R10       // tab[byte(crc)^v]
        XOR     R10,R14,R7       // loop crc in R7
-       MOVWZ   R7,R7           // 32 bits
        BC      16,0,short
 done:
        NOR     R7,R7,R7        // ^crc