]> Cypherpunks repositories - gostls13.git/commitdiff
hash/crc32: improve asm for ppc64SlicingUpdateBy8
author Jayanth Krishnamurthy <jayanth.krishnamurthy@ibm.com>
Mon, 29 Apr 2024 17:37:27 +0000 (12:37 -0500)
committer Lynn Boger <laboger@linux.vnet.ibm.com>
Mon, 6 May 2024 12:09:50 +0000 (12:09 +0000)
Improvements are made to the assembler code that improve run time and
throughput by about 9-10% (geomean).
1. An ANDCC followed by an SLD is combined into a single CLRLSLDI.
2. MOVWZ can use an indexed load and eliminate an ADD instruction in some cases.
Example: ADD R7,R10,R7 followed by MOVWZ 0(R7),R5 can be replaced with just MOVWZ (R7)(R10),R5.
3. Optimizations for the block after the "short" label include the same MOVWZ use of indexed load, as well as other improvements.

The gain from the code changes is shown below, as generated by
benchstat:

goos: linux
goarch: ppc64le
pkg: hash/crc32
cpu: POWER10
                                     |  oldCrc.out   |  newCrc.out                       |
                                     |    sec/op     |   sec/op     vs base              |
CRC32/poly=IEEE/size=15/align=0-12      50.19n ±  1%   39.85n ± 0%  -20.59% (p=0.002 n=6)
CRC32/poly=IEEE/size=15/align=1-12      50.18n ±  1%   39.87n ± 0%  -20.54% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=0-12      40.25n ±  0%   36.95n ± 0%   -8.19% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=1-12      40.31n ±  0%   36.95n ± 0%   -8.36% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=0-12     38.03n ±  0%   38.17n ± 0%   +0.37% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=1-12     89.19n ±  1%   73.65n ± 0%  -17.43% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=0-12     50.73n ±  7%   50.14n ± 0%   -1.18% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=1-12    101.00n ± 37%   81.58n ± 0%  -19.23% (p=0.002 n=6)
CRC32/poly=IEEE/size=4kB/align=0-12     98.30n ± 45%   93.05n ± 0%   -5.34% (p=0.043 n=6)
CRC32/poly=IEEE/size=4kB/align=1-12     140.8n ±  0%   125.8n ± 0%  -10.65% (p=0.002 n=6)
CRC32/poly=IEEE/size=32kB/align=0-12    525.8n ±  0%   528.5n ± 0%   +0.52% (p=0.011 n=6)
CRC32/poly=IEEE/size=32kB/align=1-12    584.4n ±  1%   576.3n ± 0%   -1.39% (p=0.002 n=6)
geomean                                 90.51n         81.74n        -9.69%

                                     |  oldCrc.out   |  newCrc.out                 |
                                     |      B/s      |     B/s       vs base       |
CRC32/poly=IEEE/size=15/align=0-12     285.0Mi ±  1%    359.0Mi ± 0%  +25.94% (p=0.002 n=6)
CRC32/poly=IEEE/size=15/align=1-12     285.1Mi ±  1%    358.8Mi ± 0%  +25.86% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=0-12     947.8Mi ±  0%   1032.3Mi ± 0%   +8.91% (p=0.002 n=6)
CRC32/poly=IEEE/size=40/align=1-12     946.2Mi ±  0%   1032.5Mi ± 0%   +9.12% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=0-12    12.54Gi ±  0%    12.49Gi ± 0%   -0.37% (p=0.002 n=6)
CRC32/poly=IEEE/size=512/align=1-12    5.346Gi ±  1%    6.475Gi ± 0%  +21.12% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=0-12    18.80Gi ±  7%    19.02Gi ± 0%   +1.20% (p=0.002 n=6)
CRC32/poly=IEEE/size=1kB/align=1-12    9.454Gi ± 27%   11.690Gi ± 0%  +23.66% (p=0.002 n=6)
CRC32/poly=IEEE/size=4kB/align=0-12    38.86Gi ± 31%    41.00Gi ± 0%   +5.49% (p=0.041 n=6)
CRC32/poly=IEEE/size=4kB/align=1-12    27.10Gi ±  0%    30.32Gi ± 0%  +11.89% (p=0.002 n=6)
CRC32/poly=IEEE/size=32kB/align=0-12   58.05Gi ±  0%    57.74Gi ± 0%   -0.53% (p=0.009 n=6)
CRC32/poly=IEEE/size=32kB/align=1-12   52.22Gi ±  1%    52.95Gi ± 0%   +1.41% (p=0.002 n=6)
geomean                                6.074Gi          6.724Gi       +10.70%

Change-Id: I378c0e84e798656384a8009f4ac48b51614489b2
Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-ppc64_power8,gotip-linux-ppc64le_power8,gotip-linux-ppc64le_power9,gotip-linux-ppc64le_power10
Reviewed-on: https://go-review.googlesource.com/c/go/+/582395
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Eli Bendersky <eliben@google.com>
src/hash/crc32/crc32_ppc64le.s

index 84ef2133127dcf40ac8558da7dab0e65c6bafbef..fb7c783f93d6e1ae05053b980001b133ae09faec 100644 (file)
@@ -63,67 +63,56 @@ loop:
        RLDICL  $40,R9,$56,R17  // p[7]
        SLD     $2,R17,R17      // p[7]*4
        RLDICL  $40,R7,$56,R8   // crc>>24
-       ADD     R17,R10,R17     // &tab[0][p[7]]
        SLD     $2,R8,R8        // crc>>24*4
        RLDICL  $48,R9,$56,R18  // p[6]
        SLD     $2,R18,R18      // p[6]*4
+       MOVWZ   (R10)(R17),R21  // tab[0][p[7]]
        ADD     $1024,R10,R10   // tab[1]
-       MOVWZ   0(R17),R21      // tab[0][p[7]]
        RLDICL  $56,R9,$56,R19  // p[5]
-       ADD     R10,R18,R18     // &tab[1][p[6]]
        SLD     $2,R19,R19      // p[5]*4:1
-       MOVWZ   0(R18),R22      // tab[1][p[6]]
+       MOVWZ   (R10)(R18),R22  // tab[1][p[6]]
        ADD     $1024,R10,R10   // tab[2]
        XOR     R21,R22,R21     // xor done R22
-       ADD     R19,R10,R19     // &tab[2][p[5]]
-       ANDCC   $255,R9,R20     // p[4] ??
-       SLD     $2,R20,R20      // p[4]*4
-       MOVWZ   0(R19),R23      // tab[2][p[5]]
+       CLRLSLDI $56,R9,$2,R20
+       MOVWZ   (R10)(R19),R23  // tab[2][p[5]]
        ADD     $1024,R10,R10   // &tab[3]
-       ADD     R20,R10,R20     // tab[3][p[4]]
        XOR     R21,R23,R21     // xor done R23
-       ADD     $1024,R10,R10   // &tab[4]
-       MOVWZ   0(R20),R24      // tab[3][p[4]]
-       ADD     R10,R8,R23      // &tab[4][crc>>24]
+       MOVWZ   (R10)(R20),R24  // tab[3][p[4]]
+       ADD     $1024,R10,R10   // &tab[4]
        XOR     R21,R24,R21     // xor done R24
-       MOVWZ   0(R23),R25      // tab[4][crc>>24]
+       MOVWZ   (R10)(R8),R25   // tab[4][crc>>24]
        RLDICL  $48,R7,$56,R24  // crc>>16&0xFF
        XOR     R21,R25,R21     // xor done R25
        ADD     $1024,R10,R10   // &tab[5]
        SLD     $2,R24,R24      // crc>>16&0xFF*4
-       ADD     R24,R10,R24     // &tab[5][crc>>16&0xFF]
-       MOVWZ   0(R24),R26      // tab[5][crc>>16&0xFF]
+       MOVWZ   (R10)(R24),R26  // tab[5][crc>>16&0xFF]
        XOR     R21,R26,R21     // xor done R26
        RLDICL  $56,R7,$56,R25  // crc>>8
        ADD     $1024,R10,R10   // &tab[6]
        SLD     $2,R25,R25      // crc>>8&FF*2
-       ADD     R25,R10,R25     // &tab[6][crc>>8&0xFF]
        MOVBZ   R7,R26          // crc&0xFF
-       ADD     $1024,R10,R10   // &tab[7]
-       MOVWZ   0(R25),R27      // tab[6][crc>>8&0xFF]
+       MOVWZ   (R10)(R25),R27  // tab[6][crc>>8&0xFF]
+       ADD     $1024,R10,R10   // &tab[7]
        SLD     $2,R26,R26      // crc&0xFF*2
        XOR     R21,R27,R21     // xor done R27
-       ADD     R26,R10,R26     // &tab[7][crc&0xFF]
        ADD     $8,R5           // p = p[8:]
-       MOVWZ   0(R26),R28      // tab[7][crc&0xFF]
+       MOVWZ   (R10)(R26),R28  // tab[7][crc&0xFF]
        XOR     R21,R28,R21     // xor done R28
        MOVWZ   R21,R7          // crc for next round
-       BC      16,0,loop       // next 8 bytes
+       BDNZ    loop
        ANDCC   $7,R6,R8        // any leftover bytes
        BEQ     done            // none --> done
        MOVD    R8,CTR          // byte count
        PCALIGN $16             // align short loop
 short:
-       MOVBZ   0(R5),R8        // get v
-       MOVBZ   R7,R9           // byte(crc) -> R8 BE vs LE?
-       SRD     $8,R7,R14       // crc>>8
-       XOR     R8,R9,R8        // byte(crc)^v -> R8
-       ADD     $1,R5           // ptr to next v
-       SLD     $2,R8           // convert index-> bytes
-       ADD     R8,R4,R9        // &tab[byte(crc)^v]
-       MOVWZ   0(R9),R10       // tab[byte(crc)^v]
-       XOR     R10,R14,R7       // loop crc in R7
-       BC      16,0,short
+       MOVBZ   0(R5),R8        // get v
+       XOR     R8,R7,R8        // byte(crc)^v -> R8
+       RLDIC   $2,R8,$54,R8    // rldicl r8,r8,2,22
+       SRD     $8,R7,R14       // crc>>8
+       MOVWZ   (R4)(R8),R10
+       ADD     $1,R5
+       XOR     R10,R14,R7      // loop crc in R7
+       BDNZ    short
 done:
        NOR     R7,R7,R7        // ^crc
        MOVW    R7,ret+40(FP)   // return crc
@@ -333,7 +322,7 @@ cool_top:
        LVX     (R4+off112),V23 // next in buffer
 
        ADD     $128,R4         // bump up buffer pointer
-       BC      16,0,cool_top   // are we done?
+       BDNZ    cool_top        // are we done?
 
 first_cool_down: