From c81dfdd47aa44bb1da0f60e36742cc0103da4837 Mon Sep 17 00:00:00 2001 From: ruinan Date: Mon, 16 May 2022 01:53:39 +0000 Subject: [PATCH] hash/crc32: use LDP instead of LDR in crc32 computation Since recent ARM CPUs support CRC late forwarding of results from a producer to a consumer, the second CRC instruction can be executed before the first CRC instruction complete. By loading 16 bytes once with LDP and ordering two CRC instructions here we can make the data forwarding happen to get better performance. Benchmarks old ThisRun delta CRC32/poly=IEEE/size=15/align=0-160 1.24GB/s 1.26GB/s +1.92% CRC32/poly=IEEE/size=15/align=1-160 1.24GB/s 1.26GB/s +1.92% CRC32/poly=IEEE/size=40/align=0-160 2.89GB/s 2.87GB/s -0.72% CRC32/poly=IEEE/size=40/align=1-160 2.86GB/s 2.87GB/s ~ CRC32/poly=IEEE/size=512/align=0-160 9.29GB/s 14.47GB/s +55.69% CRC32/poly=IEEE/size=512/align=1-160 9.26GB/s 13.88GB/s +49.89% CRC32/poly=IEEE/size=1kB/align=0-160 10.2GB/s 17.6GB/s +72.97% CRC32/poly=IEEE/size=1kB/align=1-160 10.1GB/s 17.2GB/s +69.29% CRC32/poly=IEEE/size=4kB/align=0-160 10.5GB/s 20.9GB/s +99.01% CRC32/poly=IEEE/size=4kB/align=1-160 10.5GB/s 20.5GB/s +95.02% CRC32/poly=IEEE/size=32kB/align=0-160 11.1GB/s 22.0GB/s +98.40% CRC32/poly=IEEE/size=32kB/align=1-160 11.1GB/s 21.6GB/s +94.80% CRC32/poly=Castagnoli/size=15/align=0-160 1.26GB/s 1.26GB/s ~ CRC32/poly=Castagnoli/size=15/align=1-160 1.25GB/s 1.26GB/s ~ CRC32/poly=Castagnoli/size=40/align=0-160 2.88GB/s 3.02GB/s +5.18% CRC32/poly=Castagnoli/size=40/align=1-160 2.85GB/s 3.01GB/s +5.57% CRC32/poly=Castagnoli/size=512/align=0-160 9.24GB/s 14.71GB/s +59.29% CRC32/poly=Castagnoli/size=512/align=1-160 9.21GB/s 13.45GB/s +45.92% CRC32/poly=Castagnoli/size=1kB/align=0-160 10.1GB/s 17.8GB/s +75.81% CRC32/poly=Castagnoli/size=1kB/align=1-160 10.1GB/s 17.0GB/s +67.80% CRC32/poly=Castagnoli/size=4kB/align=0-160 10.5GB/s 21.0GB/s +99.67% CRC32/poly=Castagnoli/size=4kB/align=1-160 10.5GB/s 20.5GB/s +94.26% CRC32/poly=Castagnoli/size=32kB/align=0-160 11.1GB/s 22.0GB/s +98.39% CRC32/poly=Castagnoli/size=32kB/align=1-160 11.1GB/s 21.7GB/s +95.63% Change-Id: Ifc7be5048cafac242e7b75f652e4aafa9aeae844 Reviewed-on: https://go-review.googlesource.com/c/go/+/407854 Reviewed-by: Keith Randall Reviewed-by: Cherry Mui TryBot-Result: Gopher Robot Run-TryBot: Eric Fang Reviewed-by: Keith Randall --- src/hash/crc32/crc32_arm64.s | 38 +++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/src/hash/crc32/crc32_arm64.s b/src/hash/crc32/crc32_arm64.s index 53274c5623..85a113f9de 100644 --- a/src/hash/crc32/crc32_arm64.s +++ b/src/hash/crc32/crc32_arm64.s @@ -12,19 +12,22 @@ TEXT ·castagnoliUpdate(SB),NOSPLIT,$0-36 MOVD p+8(FP), R13 // data pointer MOVD p_len+16(FP), R11 // len(p) - CMP $8, R11 - BLT less_than_8 - update: - MOVD.P 8(R13), R10 + CMP $16, R11 + BLT less_than_16 + LDP.P 16(R13), (R8, R10) + CRC32CX R8, R9 CRC32CX R10, R9 - SUB $8, R11 - - CMP $8, R11 - BLT less_than_8 + SUB $16, R11 JMP update +less_than_16: + TBZ $3, R11, less_than_8 + + MOVD.P 8(R13), R10 + CRC32CX R10, R9 + less_than_8: TBZ $2, R11, less_than_4 @@ -55,19 +58,22 @@ TEXT ·ieeeUpdate(SB),NOSPLIT,$0-36 MOVD p+8(FP), R13 // data pointer MOVD p_len+16(FP), R11 // len(p) - CMP $8, R11 - BLT less_than_8 - update: - MOVD.P 8(R13), R10 + CMP $16, R11 + BLT less_than_16 + LDP.P 16(R13), (R8, R10) + CRC32X R8, R9 CRC32X R10, R9 - SUB $8, R11 - - CMP $8, R11 - BLT less_than_8 + SUB $16, R11 JMP update +less_than_16: + TBZ $3, R11, less_than_8 + + MOVD.P 8(R13), R10 + CRC32X R10, R9 + less_than_8: TBZ $2, R11, less_than_4 -- 2.48.1