From 2027b00e63d9128eaba4a0164072380561c0fc9c Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sun, 30 Aug 2015 22:29:00 +0200 Subject: [PATCH] hash/crc32: add AMD64 optimized IEEE CRC calculation IEEE is the most commonly used CRC-32 polynomial, used by zip, gzip and others. Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf benchmark old ns/op new ns/op delta BenchmarkIEEECrc1KB-8 3193 352 -88.98% BenchmarkIEEECrc4KB-8 5025 1307 -73.99% BenchmarkCastagnoliCrc1KB-8 126 126 +0.00% benchmark old MB/s new MB/s speedup BenchmarkIEEECrc1KB-8 320.68 2901.92 9.05x BenchmarkIEEECrc4KB-8 815.08 3131.80 3.84x BenchmarkCastagnoliCrc1KB-8 8100.80 8109.78 1.00x Change-Id: I99c9a48365f631827f516e44f97e86155f03cb90 Reviewed-on: https://go-review.googlesource.com/14080 Reviewed-by: Keith Randall --- src/hash/crc32/crc32.go | 12 +- src/hash/crc32/crc32_amd64.go | 54 ++++++ src/hash/crc32/crc32_amd64.s | 169 ++++++++++++++++++ .../{crc32_amd64x.go => crc32_amd64p32.go} | 16 +- src/hash/crc32/crc32_generic.go | 11 ++ 5 files changed, 251 insertions(+), 11 deletions(-) create mode 100644 src/hash/crc32/crc32_amd64.go rename src/hash/crc32/{crc32_amd64x.go => crc32_amd64p32.go} (63%) diff --git a/src/hash/crc32/crc32.go b/src/hash/crc32/crc32.go index 1b5e0dbde0..d41755536e 100644 --- a/src/hash/crc32/crc32.go +++ b/src/hash/crc32/crc32.go @@ -151,15 +151,11 @@ func updateSlicingBy8(crc uint32, tab *slicing8Table, p []byte) uint32 { // Update returns the result of adding the bytes in p to the crc. func Update(crc uint32, tab *Table, p []byte) uint32 { - if tab == castagnoliTable { + switch tab { + case castagnoliTable: return updateCastagnoli(crc, p) - } - // only use slicing-by-8 when input is larger than 4KB - if tab == IEEETable && len(p) >= 4096 { - iEEETable8Once.Do(func() { - iEEETable8 = makeTable8(IEEE) - }) - return updateSlicingBy8(crc, iEEETable8, p) + case IEEETable: + return updateIEEE(crc, p) } return update(crc, tab, p) } diff --git a/src/hash/crc32/crc32_amd64.go b/src/hash/crc32/crc32_amd64.go new file mode 100644 index 0000000000..13e483db85 --- /dev/null +++ b/src/hash/crc32/crc32_amd64.go @@ -0,0 +1,54 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package crc32 + +// This file contains the code to call the SSE 4.2 version of the Castagnoli +// and IEEE CRC. + +// haveSSE41/haveSSE42/haveCLMUL are defined in crc_amd64.s and use +// CPUID to test for SSE 4.1, 4.2 and CLMUL support. +func haveSSE41() bool +func haveSSE42() bool +func haveCLMUL() bool + +// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32 +// instruction. +func castagnoliSSE42(crc uint32, p []byte) uint32 + +// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ +// instruction as well as SSE 4.1. +func ieeeCLMUL(crc uint32, p []byte) uint32 + +var sse42 = haveSSE42() +var useFastIEEE = haveCLMUL() && haveSSE41() + +func updateCastagnoli(crc uint32, p []byte) uint32 { + if sse42 { + return castagnoliSSE42(crc, p) + } + return update(crc, castagnoliTable, p) +} + +func updateIEEE(crc uint32, p []byte) uint32 { + if useFastIEEE && len(p) >= 64 { + left := len(p) & 15 + do := len(p) - left + crc = ^ieeeCLMUL(^crc, p[:do]) + if left > 0 { + crc = update(crc, IEEETable, p[do:]) + } + return crc + } + + // only use slicing-by-8 when input is >= 4KB + if len(p) >= 4096 { + iEEETable8Once.Do(func() { + iEEETable8 = makeTable8(IEEE) + }) + return updateSlicingBy8(crc, iEEETable8, p) + } + + return update(crc, IEEETable, p) +} diff --git a/src/hash/crc32/crc32_amd64.s b/src/hash/crc32/crc32_amd64.s index 30b0d0691c..11d9bb53d8 100644 --- a/src/hash/crc32/crc32_amd64.s +++ b/src/hash/crc32/crc32_amd64.s @@ -62,3 +62,172 @@ TEXT ·haveSSE42(SB),NOSPLIT,$0 MOVB CX, ret+0(FP) RET +// func haveCLMUL() bool +TEXT ·haveCLMUL(SB),NOSPLIT,$0 + XORQ AX, AX + INCL AX + CPUID + SHRQ $1, CX + ANDQ $1, CX + MOVB CX, ret+0(FP) + RET + +// func haveSSE41() bool +TEXT ·haveSSE41(SB),NOSPLIT,$0 + XORQ AX, AX + INCL AX + CPUID + SHRQ $19, CX + ANDQ $1, CX + MOVB CX, ret+0(FP) + RET + +// CRC32 polynomial data +// +// These constants are lifted from the +// Linux kernel, since they avoid the costly +// PSHUFB 16 byte reversal proposed in the +// original Intel paper. +DATA r2r1<>+0(SB)/8, $0x154442bd4 +DATA r2r1<>+8(SB)/8, $0x1c6e41596 +DATA r4r3<>+0(SB)/8, $0x1751997d0 +DATA r4r3<>+8(SB)/8, $0x0ccaa009e +DATA rupoly<>+0(SB)/8, $0x1db710641 +DATA rupoly<>+8(SB)/8, $0x1f7011641 +DATA r5<>+0(SB)/8, $0x163cd6124 + +GLOBL r2r1<>(SB),RODATA,$16 +GLOBL r4r3<>(SB),RODATA,$16 +GLOBL rupoly<>(SB),RODATA,$16 +GLOBL r5<>(SB),RODATA,$8 + +// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +// len(p) must be at least 64, and must be a multiple of 16. + +// func ieeeCLMUL(crc uint32, p []byte) uint32 +TEXT ·ieeeCLMUL(SB),NOSPLIT,$0 + MOVL crc+0(FP), X0 // Initial CRC value + MOVQ p+8(FP), SI // data pointer + MOVQ p_len+16(FP), CX // len(p) + + MOVOU (SI), X1 + MOVOU 16(SI), X2 + MOVOU 32(SI), X3 + MOVOU 48(SI), X4 + PXOR X0, X1 + ADDQ $64, SI // buf+=64 + SUBQ $64, CX // len-=64 + CMPQ CX, $64 // Less than 64 bytes left + JB remain64 + + MOVOA r2r1<>+0(SB), X0 +loopback64: + MOVOA X1, X5 + MOVOA X2, X6 + MOVOA X3, X7 + MOVOA X4, X8 + + PCLMULQDQ $0, X0, X1 + PCLMULQDQ $0, X0, X2 + PCLMULQDQ $0, X0, X3 + PCLMULQDQ $0, X0, X4 + + /* Load next early */ + MOVOU (SI), X11 + MOVOU 16(SI), X12 + MOVOU 32(SI), X13 + MOVOU 48(SI), X14 + + PCLMULQDQ $0x11, X0, X5 + PCLMULQDQ $0x11, X0, X6 + PCLMULQDQ $0x11, X0, X7 + PCLMULQDQ $0x11, X0, X8 + + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + PXOR X8, X4 + + PXOR X11, X1 + PXOR X12, X2 + PXOR X13, X3 + PXOR X14, X4 + + ADDQ $0x40, DI + ADDQ $64, SI // buf+=64 + SUBQ $64, CX // len-=64 + CMPQ CX, $64 // Less than 64 bytes left? + JGE loopback64 + + /* Fold result into a single register (X1) */ +remain64: + MOVOA r4r3<>+0(SB), X0 + + MOVOA X1, X5 + PCLMULQDQ $0, X0, X1 + PCLMULQDQ $0x11, X0, X5 + PXOR X5, X1 + PXOR X2, X1 + + MOVOA X1, X5 + PCLMULQDQ $0, X0, X1 + PCLMULQDQ $0x11, X0, X5 + PXOR X5, X1 + PXOR X3, X1 + + MOVOA X1, X5 + PCLMULQDQ $0, X0, X1 + PCLMULQDQ $0x11, X0, X5 + PXOR X5, X1 + PXOR X4, X1 + + /* If there is less than 16 bytes left we are done */ + CMPQ CX, $16 + JB finish + + /* Encode 16 bytes */ +remain16: + MOVOU (SI), X10 + MOVOA X1, X5 + PCLMULQDQ $0, X0, X1 + PCLMULQDQ $0x11, X0, X5 + PXOR X5, X1 + PXOR X10, X1 + SUBQ $16, CX + ADDQ $16, SI + CMPQ CX, $16 + JGE remain16 + +finish: + /* Fold final result into 32 bits and return it */ + PCMPEQB X3, X3 + PCLMULQDQ $1, X1, X0 + PSRLDQ $8, X1 + PXOR X0, X1 + + MOVOA X1, X2 + MOVQ r5<>+0(SB), X0 + + /* Creates 32 bit mask. Note that we don't care about upper half. */ + PSRLQ $32, X3 + + PSRLDQ $4, X2 + PAND X3, X1 + PCLMULQDQ $0, X0, X1 + PXOR X2, X1 + + MOVOA rupoly<>+0(SB), X0 + + MOVOA X1, X2 + PAND X3, X1 + PCLMULQDQ $0x10, X0, X1 + PAND X3, X1 + PCLMULQDQ $0, X0, X1 + PXOR X2, X1 + + /* PEXTRD $1, X1, AX (SSE 4.1) */ + BYTE $0x66; BYTE $0x0f; BYTE $0x3a; + BYTE $0x16; BYTE $0xc8; BYTE $0x01; + MOVL AX, ret+32(FP) + + RET diff --git a/src/hash/crc32/crc32_amd64x.go b/src/hash/crc32/crc32_amd64p32.go similarity index 63% rename from src/hash/crc32/crc32_amd64x.go rename to src/hash/crc32/crc32_amd64p32.go index b7e359930a..40241c5835 100644 --- a/src/hash/crc32/crc32_amd64x.go +++ b/src/hash/crc32/crc32_amd64p32.go @@ -2,14 +2,12 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build amd64 amd64p32 - package crc32 // This file contains the code to call the SSE 4.2 version of the Castagnoli // CRC. -// haveSSE42 is defined in crc_amd64.s and uses CPUID to test for SSE 4.2 +// haveSSE42 is defined in crc_amd64p32.s and uses CPUID to test for SSE 4.2 // support. func haveSSE42() bool @@ -25,3 +23,15 @@ func updateCastagnoli(crc uint32, p []byte) uint32 { } return update(crc, castagnoliTable, p) } + +func updateIEEE(crc uint32, p []byte) uint32 { + // only use slicing-by-8 when input is >= 4KB + if len(p) >= 4096 { + iEEETable8Once.Do(func() { + iEEETable8 = makeTable8(IEEE) + }) + return updateSlicingBy8(crc, iEEETable8, p) + } + + return update(crc, IEEETable, p) +} diff --git a/src/hash/crc32/crc32_generic.go b/src/hash/crc32/crc32_generic.go index 416c1b7c55..d2355c83df 100644 --- a/src/hash/crc32/crc32_generic.go +++ b/src/hash/crc32/crc32_generic.go @@ -12,3 +12,14 @@ package crc32 func updateCastagnoli(crc uint32, p []byte) uint32 { return update(crc, castagnoliTable, p) } + +func updateIEEE(crc uint32, p []byte) uint32 { + // only use slicing-by-8 when input is >= 4KB + if len(p) >= 4096 { + iEEETable8Once.Do(func() { + iEEETable8 = makeTable8(IEEE) + }) + return updateSlicingBy8(crc, iEEETable8, p) + } + return update(crc, IEEETable, p) +} -- 2.48.1