Cypherpunks repositories - gostls13.git/commitdiff
hash/crc32: optimize arm64 crc32 implementation
author Wei Xiao <wei.xiao@arm.com>
Fri, 17 Mar 2017 03:31:09 +0000 (03:31 +0000)
committer Cherry Zhang <cherryyz@google.com>
Thu, 13 Apr 2017 12:44:10 +0000 (12:44 +0000)
ARMv8 defines crc32 instructions.

Compared to the original crc32 calculation, this patch makes use of
the crc32 instructions to do the crc32 calculation instead of the
multiple lookup table algorithms.

ARMv8 provides IEEE and Castagnoli polynomials for crc32 calculation,
so the performance of these two types of crc32 is significantly
improved.

name                                        old time/op   new time/op    delta
CRC32/poly=IEEE/size=15/align=0-32            117ns ± 0%      38ns ± 0%   -67.44%
CRC32/poly=IEEE/size=15/align=1-32            117ns ± 0%      38ns ± 0%   -67.52%
CRC32/poly=IEEE/size=40/align=0-32            129ns ± 0%      41ns ± 0%   -68.37%
CRC32/poly=IEEE/size=40/align=1-32            129ns ± 0%      41ns ± 0%   -68.29%
CRC32/poly=IEEE/size=512/align=0-32           828ns ± 0%     246ns ± 0%   -70.29%
CRC32/poly=IEEE/size=512/align=1-32           828ns ± 0%     132ns ± 0%   -84.06%
CRC32/poly=IEEE/size=1kB/align=0-32          1.58µs ± 0%    0.46µs ± 0%   -70.98%
CRC32/poly=IEEE/size=1kB/align=1-32          1.58µs ± 0%    0.46µs ± 0%   -70.92%
CRC32/poly=IEEE/size=4kB/align=0-32          6.06µs ± 0%    1.74µs ± 0%   -71.27%
CRC32/poly=IEEE/size=4kB/align=1-32          6.10µs ± 0%    1.74µs ± 0%   -71.44%
CRC32/poly=IEEE/size=32kB/align=0-32         48.3µs ± 0%    13.7µs ± 0%   -71.61%
CRC32/poly=IEEE/size=32kB/align=1-32         48.3µs ± 0%    13.7µs ± 0%   -71.60%
CRC32/poly=Castagnoli/size=15/align=0-32      116ns ± 0%      38ns ± 0%   -67.07%
CRC32/poly=Castagnoli/size=15/align=1-32      116ns ± 0%      38ns ± 0%   -66.90%
CRC32/poly=Castagnoli/size=40/align=0-32      127ns ± 0%      40ns ± 0%   -68.11%
CRC32/poly=Castagnoli/size=40/align=1-32      127ns ± 0%      40ns ± 0%   -68.11%
CRC32/poly=Castagnoli/size=512/align=0-32     828ns ± 0%     132ns ± 0%   -84.06%
CRC32/poly=Castagnoli/size=512/align=1-32     827ns ± 0%     132ns ± 0%   -84.04%
CRC32/poly=Castagnoli/size=1kB/align=0-32    1.59µs ± 0%    0.22µs ± 0%   -85.89%
CRC32/poly=Castagnoli/size=1kB/align=1-32    1.58µs ± 0%    0.22µs ± 0%   -85.79%
CRC32/poly=Castagnoli/size=4kB/align=0-32    6.14µs ± 0%    0.77µs ± 0%   -87.40%
CRC32/poly=Castagnoli/size=4kB/align=1-32    6.06µs ± 0%    0.77µs ± 0%   -87.25%
CRC32/poly=Castagnoli/size=32kB/align=0-32   48.3µs ± 0%     5.9µs ± 0%   -87.71%
CRC32/poly=Castagnoli/size=32kB/align=1-32   48.4µs ± 0%     6.0µs ± 0%   -87.69%
CRC32/poly=Koopman/size=15/align=0-32         104ns ± 0%     104ns ± 0%    +0.00%
CRC32/poly=Koopman/size=15/align=1-32         104ns ± 0%     104ns ± 0%    +0.00%
CRC32/poly=Koopman/size=40/align=0-32         235ns ± 0%     235ns ± 0%    +0.00%
CRC32/poly=Koopman/size=40/align=1-32         235ns ± 0%     235ns ± 0%    +0.00%
CRC32/poly=Koopman/size=512/align=0-32       2.71µs ± 0%    2.71µs ± 0%    -0.07%
CRC32/poly=Koopman/size=512/align=1-32       2.71µs ± 0%    2.71µs ± 0%    -0.04%
CRC32/poly=Koopman/size=1kB/align=0-32       5.40µs ± 0%    5.39µs ± 0%    -0.06%
CRC32/poly=Koopman/size=1kB/align=1-32       5.40µs ± 0%    5.40µs ± 0%    +0.02%
CRC32/poly=Koopman/size=4kB/align=0-32       21.5µs ± 0%    21.5µs ± 0%    -0.16%
CRC32/poly=Koopman/size=4kB/align=1-32       21.5µs ± 0%    21.5µs ± 0%    -0.05%
CRC32/poly=Koopman/size=32kB/align=0-32       172µs ± 0%     172µs ± 0%    -0.07%
CRC32/poly=Koopman/size=32kB/align=1-32       172µs ± 0%     172µs ± 0%    -0.01%

name                                        old speed     new speed      delta
CRC32/poly=IEEE/size=15/align=0-32          128MB/s ± 0%   394MB/s ± 0%  +207.95%
CRC32/poly=IEEE/size=15/align=1-32          128MB/s ± 0%   394MB/s ± 0%  +208.09%
CRC32/poly=IEEE/size=40/align=0-32          310MB/s ± 0%   979MB/s ± 0%  +216.07%
CRC32/poly=IEEE/size=40/align=1-32          310MB/s ± 0%   979MB/s ± 0%  +216.16%
CRC32/poly=IEEE/size=512/align=0-32         618MB/s ± 0%  2074MB/s ± 0%  +235.72%
CRC32/poly=IEEE/size=512/align=1-32         618MB/s ± 0%  3852MB/s ± 0%  +523.55%
CRC32/poly=IEEE/size=1kB/align=0-32         646MB/s ± 0%  2225MB/s ± 0%  +244.57%
CRC32/poly=IEEE/size=1kB/align=1-32         647MB/s ± 0%  2225MB/s ± 0%  +243.87%
CRC32/poly=IEEE/size=4kB/align=0-32         676MB/s ± 0%  2352MB/s ± 0%  +248.02%
CRC32/poly=IEEE/size=4kB/align=1-32         672MB/s ± 0%  2352MB/s ± 0%  +250.15%
CRC32/poly=IEEE/size=32kB/align=0-32        678MB/s ± 0%  2387MB/s ± 0%  +252.17%
CRC32/poly=IEEE/size=32kB/align=1-32        678MB/s ± 0%  2388MB/s ± 0%  +252.11%
CRC32/poly=Castagnoli/size=15/align=0-32    129MB/s ± 0%   393MB/s ± 0%  +205.51%
CRC32/poly=Castagnoli/size=15/align=1-32    129MB/s ± 0%   390MB/s ± 0%  +203.41%
CRC32/poly=Castagnoli/size=40/align=0-32    314MB/s ± 0%   988MB/s ± 0%  +215.04%
CRC32/poly=Castagnoli/size=40/align=1-32    314MB/s ± 0%   987MB/s ± 0%  +214.68%
CRC32/poly=Castagnoli/size=512/align=0-32   618MB/s ± 0%  3860MB/s ± 0%  +524.32%
CRC32/poly=Castagnoli/size=512/align=1-32   619MB/s ± 0%  3859MB/s ± 0%  +523.66%
CRC32/poly=Castagnoli/size=1kB/align=0-32   645MB/s ± 0%  4568MB/s ± 0%  +608.56%
CRC32/poly=Castagnoli/size=1kB/align=1-32   650MB/s ± 0%  4567MB/s ± 0%  +602.94%
CRC32/poly=Castagnoli/size=4kB/align=0-32   667MB/s ± 0%  5297MB/s ± 0%  +693.81%
CRC32/poly=Castagnoli/size=4kB/align=1-32   676MB/s ± 0%  5297MB/s ± 0%  +684.00%
CRC32/poly=Castagnoli/size=32kB/align=0-32  678MB/s ± 0%  5519MB/s ± 0%  +713.83%
CRC32/poly=Castagnoli/size=32kB/align=1-32  677MB/s ± 0%  5497MB/s ± 0%  +712.04%
CRC32/poly=Koopman/size=15/align=0-32       143MB/s ± 0%   144MB/s ± 0%    +0.27%
CRC32/poly=Koopman/size=15/align=1-32       143MB/s ± 0%   144MB/s ± 0%    +0.33%
CRC32/poly=Koopman/size=40/align=0-32       169MB/s ± 0%   170MB/s ± 0%    +0.12%
CRC32/poly=Koopman/size=40/align=1-32       170MB/s ± 0%   170MB/s ± 0%    +0.08%
CRC32/poly=Koopman/size=512/align=0-32      189MB/s ± 0%   189MB/s ± 0%    +0.07%
CRC32/poly=Koopman/size=512/align=1-32      189MB/s ± 0%   189MB/s ± 0%    +0.04%
CRC32/poly=Koopman/size=1kB/align=0-32      190MB/s ± 0%   190MB/s ± 0%    +0.05%
CRC32/poly=Koopman/size=1kB/align=1-32      190MB/s ± 0%   190MB/s ± 0%    -0.01%
CRC32/poly=Koopman/size=4kB/align=0-32      190MB/s ± 0%   190MB/s ± 0%    +0.15%
CRC32/poly=Koopman/size=4kB/align=1-32      190MB/s ± 0%   191MB/s ± 0%    +0.05%
CRC32/poly=Koopman/size=32kB/align=0-32     191MB/s ± 0%   191MB/s ± 0%    +0.06%
CRC32/poly=Koopman/size=32kB/align=1-32     191MB/s ± 0%   191MB/s ± 0%    +0.02%

Also fix a bug in the arm64 assembler: mask the register number to
5 bits (p.Reg & 31) when encoding the instruction.

The optimization was mainly contributed by Fangming.Fang <fangming.fang@arm.com>

Change-Id: I900678c2e445d7e8ad9e2a9ab3305d649230905f
Reviewed-on: https://go-review.googlesource.com/40074
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>

src/cmd/internal/obj/arm64/asm7.go
src/hash/crc32/crc32_arm64.go [new file with mode: 0644]
src/hash/crc32/crc32_arm64.s [new file with mode: 0644]
src/hash/crc32/crc32_otherarch.go
src/runtime/os_darwin_arm64.go
src/runtime/os_linux_arm64.go

index b9529dd163f3a893de41a745ad7a6010e7bd8d56..6950b0181e63a41009b5a225bf988e3a1b7877e9 100644 (file)
@@ -2564,7 +2564,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
                }
                o1 |= ((uint32(v) & 0x20) << (31 - 5)) | ((uint32(v) & 0x1F) << 19)
                o1 |= uint32(c.brdist(p, 0, 14, 2) << 5)
-               o1 |= uint32(p.Reg)
+               o1 |= uint32(p.Reg & 31)
 
        case 41: /* eret, nop, others with no operands */
                o1 = c.op0(p, p.As)
diff --git a/src/hash/crc32/crc32_arm64.go b/src/hash/crc32/crc32_arm64.go
new file mode 100644 (file)
index 0000000..2df3702
--- /dev/null
@@ -0,0 +1,51 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// ARM64-specific hardware-assisted CRC32 algorithms. See crc32.go for a
+// description of the interface that each architecture-specific file
+// implements.
+
+package crc32
+
+func supportsCRC32() bool
+func castagnoliUpdate(crc uint32, p []byte) uint32
+func ieeeUpdate(crc uint32, p []byte) uint32
+
+var hasCRC32 = supportsCRC32()
+
+func archAvailableCastagnoli() bool {
+    return hasCRC32
+}
+
+func archInitCastagnoli() {
+    if !hasCRC32 {
+        panic("arch-specific crc32 instruction for Catagnoli not available")
+    }
+}
+
+func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
+    if !hasCRC32 {
+        panic("arch-specific crc32 instruction for Castagnoli not available")
+    }
+
+    return ^castagnoliUpdate(^crc, p)
+}
+
+func archAvailableIEEE() bool {
+    return hasCRC32
+}
+
+func archInitIEEE() {
+    if !hasCRC32 {
+        panic("arch-specific crc32 instruction for IEEE not available")
+    }
+}
+
+func archUpdateIEEE(crc uint32, p []byte) uint32 {
+    if !hasCRC32 {
+        panic("arch-specific crc32 instruction for IEEE not available")
+    }
+
+    return ^ieeeUpdate(^crc, p)
+}
diff --git a/src/hash/crc32/crc32_arm64.s b/src/hash/crc32/crc32_arm64.s
new file mode 100644 (file)
index 0000000..26a86e4
--- /dev/null
@@ -0,0 +1,97 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// castagnoliUpdate updates the non-inverted crc with the given data.
+
+// func castagnoliUpdate(crc uint32, p []byte) uint32
+TEXT ·castagnoliUpdate(SB),NOSPLIT,$0-36
+       MOVWU   crc+0(FP), R9  // CRC value
+       MOVD    p+8(FP), R13  // data pointer
+       MOVD    p_len+16(FP), R11  // len(p)
+
+       CMP     $8, R11
+       BLT     less_than_8
+
+update:
+       MOVD.P  8(R13), R10
+       CRC32CX R10, R9
+       SUB     $8, R11
+
+       CMP     $8, R11
+       BLT     less_than_8
+
+       JMP     update
+
+less_than_8:
+       TBZ     $2, R11, less_than_4
+
+       MOVWU.P 4(R13), R10
+       CRC32CW R10, R9
+
+less_than_4:
+       TBZ     $1, R11, less_than_2
+
+       MOVHU.P 2(R13), R10
+       CRC32CH R10, R9
+
+less_than_2:
+       TBZ     $0, R11, done
+
+       MOVBU   (R13), R10
+       CRC32CB R10, R9
+
+done:
+       MOVWU   R9, ret+32(FP)
+       RET
+
+// ieeeUpdate updates the non-inverted crc with the given data.
+
+// func ieeeUpdate(crc uint32, p []byte) uint32
+TEXT ·ieeeUpdate(SB),NOSPLIT,$0-36
+       MOVWU   crc+0(FP), R9  // CRC value
+       MOVD    p+8(FP), R13  // data pointer
+       MOVD    p_len+16(FP), R11  // len(p)
+
+       CMP     $8, R11
+       BLT     less_than_8
+
+update:
+       MOVD.P  8(R13), R10
+       CRC32X  R10, R9
+       SUB     $8, R11
+
+       CMP     $8, R11
+       BLT     less_than_8
+
+       JMP     update
+
+less_than_8:
+       TBZ     $2, R11, less_than_4
+
+       MOVWU.P 4(R13), R10
+       CRC32W  R10, R9
+
+less_than_4:
+       TBZ     $1, R11, less_than_2
+
+       MOVHU.P 2(R13), R10
+       CRC32H  R10, R9
+
+less_than_2:
+       TBZ     $0, R11, done
+
+       MOVBU   (R13), R10
+       CRC32B  R10, R9
+
+done:
+       MOVWU   R9, ret+32(FP)
+       RET
+
+// func supportsCRC32() bool
+TEXT ·supportsCRC32(SB),NOSPLIT,$0-1
+       MOVB    runtime·supportCRC32(SB), R0
+       MOVB    R0, ret+0(FP)
+       RET
index 3565046c794d010200ecf8c83c575736fa63ba44..6f3510a279bf7c09d0bf3155ab8c7bb5bf418d42 100644 (file)
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64,!amd64p32,!s390x,!ppc64le
+// +build !amd64,!amd64p32,!s390x,!ppc64le,!arm64
 
 package crc32
 
index 8de132d8e2fab9419705cc1faefd250acb69d21f..01285afa190959b54efea59b9ab580461c9ede8a 100644 (file)
@@ -4,6 +4,8 @@
 
 package runtime
 
+var supportCRC32 = false
+
 //go:nosplit
 func cputicks() int64 {
        // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
index bdc341d962b383f38273806a8b8d81dec12dfbb9..f2a2916c37f90b899663fb5c4ba735d48bbd9b23 100644 (file)
@@ -4,7 +4,12 @@
 
 package runtime
 
+const (
+        _ARM64_FEATURE_HAS_CRC32 = 0x80
+)
+
 var randomNumber uint32
+var supportCRC32 bool
 
 func archauxv(tag, val uintptr) {
        switch tag {
@@ -14,6 +19,8 @@ func archauxv(tag, val uintptr) {
                // it as a byte array.
                randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
                        uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
+        case _AT_HWCAP:
+                supportCRC32 = val & _ARM64_FEATURE_HAS_CRC32 != 0
        }
 }