]> Cypherpunks repositories - gostls13.git/commitdiff
hash/crc32: use LDP instead of LDR in crc32 computation
authorruinan <ruinan.sun@arm.com>
Mon, 16 May 2022 01:53:39 +0000 (01:53 +0000)
committerEric Fang <eric.fang@arm.com>
Wed, 10 Aug 2022 08:57:33 +0000 (08:57 +0000)
Since recent ARM CPUs support CRC late forwarding of results from a
producer to a consumer, the second CRC instruction can be executed
before the first CRC instruction complete. By loading 16 bytes once with
LDP and ordering two CRC instructions here we can make the data
forwarding happen to get better performance.

Benchmarks                             old         ThisRun    delta
CRC32/poly=IEEE/size=15/align=0-160    1.24GB/s    1.26GB/s   +1.92%
CRC32/poly=IEEE/size=15/align=1-160    1.24GB/s    1.26GB/s   +1.92%
CRC32/poly=IEEE/size=40/align=0-160    2.89GB/s    2.87GB/s   -0.72%
CRC32/poly=IEEE/size=40/align=1-160    2.86GB/s    2.87GB/s     ~
CRC32/poly=IEEE/size=512/align=0-160   9.29GB/s   14.47GB/s  +55.69%
CRC32/poly=IEEE/size=512/align=1-160   9.26GB/s   13.88GB/s  +49.89%
CRC32/poly=IEEE/size=1kB/align=0-160   10.2GB/s    17.6GB/s  +72.97%
CRC32/poly=IEEE/size=1kB/align=1-160   10.1GB/s    17.2GB/s  +69.29%
CRC32/poly=IEEE/size=4kB/align=0-160   10.5GB/s    20.9GB/s  +99.01%
CRC32/poly=IEEE/size=4kB/align=1-160   10.5GB/s    20.5GB/s  +95.02%
CRC32/poly=IEEE/size=32kB/align=0-160  11.1GB/s    22.0GB/s  +98.40%
CRC32/poly=IEEE/size=32kB/align=1-160  11.1GB/s    21.6GB/s  +94.80%

CRC32/poly=Castagnoli/size=15/align=0-160   1.26GB/s   1.26GB/s     ~
CRC32/poly=Castagnoli/size=15/align=1-160   1.25GB/s   1.26GB/s     ~
CRC32/poly=Castagnoli/size=40/align=0-160   2.88GB/s   3.02GB/s   +5.18%
CRC32/poly=Castagnoli/size=40/align=1-160   2.85GB/s   3.01GB/s   +5.57%
CRC32/poly=Castagnoli/size=512/align=0-160  9.24GB/s  14.71GB/s  +59.29%
CRC32/poly=Castagnoli/size=512/align=1-160  9.21GB/s  13.45GB/s  +45.92%
CRC32/poly=Castagnoli/size=1kB/align=0-160  10.1GB/s   17.8GB/s  +75.81%
CRC32/poly=Castagnoli/size=1kB/align=1-160  10.1GB/s   17.0GB/s  +67.80%
CRC32/poly=Castagnoli/size=4kB/align=0-160  10.5GB/s   21.0GB/s  +99.67%
CRC32/poly=Castagnoli/size=4kB/align=1-160  10.5GB/s   20.5GB/s  +94.26%
CRC32/poly=Castagnoli/size=32kB/align=0-160 11.1GB/s   22.0GB/s  +98.39%
CRC32/poly=Castagnoli/size=32kB/align=1-160 11.1GB/s   21.7GB/s  +95.63%

Change-Id: Ifc7be5048cafac242e7b75f652e4aafa9aeae844
Reviewed-on: https://go-review.googlesource.com/c/go/+/407854
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Eric Fang <eric.fang@arm.com>
Reviewed-by: Keith Randall <khr@golang.org>
src/hash/crc32/crc32_arm64.s

index 53274c56235ed04de0159ae64c77a2ed290cbf4e..85a113f9de7114522a56bf37c29a6316f5a030a0 100644 (file)
@@ -12,19 +12,22 @@ TEXT ·castagnoliUpdate(SB),NOSPLIT,$0-36
        MOVD    p+8(FP), R13  // data pointer
        MOVD    p_len+16(FP), R11  // len(p)
 
-       CMP     $8, R11
-       BLT     less_than_8
-
 update:
-       MOVD.P  8(R13), R10
+       CMP     $16, R11
+       BLT     less_than_16
+       LDP.P   16(R13), (R8, R10)
+       CRC32CX R8, R9
        CRC32CX R10, R9
-       SUB     $8, R11
-
-       CMP     $8, R11
-       BLT     less_than_8
+       SUB     $16, R11
 
        JMP     update
 
+less_than_16:
+       TBZ     $3, R11, less_than_8
+
+       MOVD.P  8(R13), R10
+       CRC32CX R10, R9
+
 less_than_8:
        TBZ     $2, R11, less_than_4
 
@@ -55,19 +58,22 @@ TEXT ·ieeeUpdate(SB),NOSPLIT,$0-36
        MOVD    p+8(FP), R13  // data pointer
        MOVD    p_len+16(FP), R11  // len(p)
 
-       CMP     $8, R11
-       BLT     less_than_8
-
 update:
-       MOVD.P  8(R13), R10
+       CMP     $16, R11
+       BLT     less_than_16
+       LDP.P   16(R13), (R8, R10)
+       CRC32X  R8, R9
        CRC32X  R10, R9
-       SUB     $8, R11
-
-       CMP     $8, R11
-       BLT     less_than_8
+       SUB     $16, R11
 
        JMP     update
 
+less_than_16:
+       TBZ $3, R11, less_than_8
+
+       MOVD.P  8(R13), R10
+       CRC32X  R10, R9
+
 less_than_8:
        TBZ     $2, R11, less_than_4