// Update returns the result of adding the bytes in p to the crc.
func Update(crc uint32, tab *Table, p []byte) uint32 {
- if tab == castagnoliTable {
+ switch tab {
+ case castagnoliTable:
return updateCastagnoli(crc, p)
- }
- // only use slicing-by-8 when input is larger than 4KB
- if tab == IEEETable && len(p) >= 4096 {
- iEEETable8Once.Do(func() {
- iEEETable8 = makeTable8(IEEE)
- })
- return updateSlicingBy8(crc, iEEETable8, p)
+ case IEEETable:
+ return updateIEEE(crc, p)
}
return update(crc, tab, p)
}
--- /dev/null
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package crc32
+
+// This file contains the code to call the SSE 4.2 version of the Castagnoli
+// and IEEE CRC.
+
+// haveSSE41/haveSSE42/haveCLMUL are defined in crc_amd64.s and use
+// CPUID to test for SSE 4.1, 4.2 and CLMUL support.
+func haveSSE41() bool
+func haveSSE42() bool
+func haveCLMUL() bool
+
+// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32
+// instruction.
+func castagnoliSSE42(crc uint32, p []byte) uint32
+
+// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ
+// instruction as well as SSE 4.1.
+func ieeeCLMUL(crc uint32, p []byte) uint32
+
+var sse42 = haveSSE42()
+var useFastIEEE = haveCLMUL() && haveSSE41()
+
+func updateCastagnoli(crc uint32, p []byte) uint32 {
+ if sse42 {
+ return castagnoliSSE42(crc, p)
+ }
+ return update(crc, castagnoliTable, p)
+}
+
+func updateIEEE(crc uint32, p []byte) uint32 {
+ if useFastIEEE && len(p) >= 64 {
+ left := len(p) & 15
+ do := len(p) - left
+ crc = ^ieeeCLMUL(^crc, p[:do])
+ if left > 0 {
+ crc = update(crc, IEEETable, p[do:])
+ }
+ return crc
+ }
+
+ // only use slicing-by-8 when input is >= 4KB
+ if len(p) >= 4096 {
+ iEEETable8Once.Do(func() {
+ iEEETable8 = makeTable8(IEEE)
+ })
+ return updateSlicingBy8(crc, iEEETable8, p)
+ }
+
+ return update(crc, IEEETable, p)
+}
MOVB CX, ret+0(FP)
RET
+// func haveCLMUL() bool
+TEXT ·haveCLMUL(SB),NOSPLIT,$0
+ XORQ AX, AX
+ INCL AX
+ CPUID
+ SHRQ $1, CX
+ ANDQ $1, CX
+ MOVB CX, ret+0(FP)
+ RET
+
+// func haveSSE41() bool
+TEXT ·haveSSE41(SB),NOSPLIT,$0
+ XORQ AX, AX
+ INCL AX
+ CPUID
+ SHRQ $19, CX
+ ANDQ $1, CX
+ MOVB CX, ret+0(FP)
+ RET
+
+// CRC32 polynomial data
+//
+// These constants are lifted from the
+// Linux kernel, since they avoid the costly
+// PSHUFB 16 byte reversal proposed in the
+// original Intel paper.
+DATA r2r1<>+0(SB)/8, $0x154442bd4
+DATA r2r1<>+8(SB)/8, $0x1c6e41596
+DATA r4r3<>+0(SB)/8, $0x1751997d0
+DATA r4r3<>+8(SB)/8, $0x0ccaa009e
+DATA rupoly<>+0(SB)/8, $0x1db710641
+DATA rupoly<>+8(SB)/8, $0x1f7011641
+DATA r5<>+0(SB)/8, $0x163cd6124
+
+GLOBL r2r1<>(SB),RODATA,$16
+GLOBL r4r3<>(SB),RODATA,$16
+GLOBL rupoly<>(SB),RODATA,$16
+GLOBL r5<>(SB),RODATA,$8
+
+// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+// len(p) must be at least 64, and must be a multiple of 16.
+
+// func ieeeCLMUL(crc uint32, p []byte) uint32
+TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
+ MOVL crc+0(FP), X0 // Initial CRC value
+ MOVQ p+8(FP), SI // data pointer
+ MOVQ p_len+16(FP), CX // len(p)
+
+ MOVOU (SI), X1
+ MOVOU 16(SI), X2
+ MOVOU 32(SI), X3
+ MOVOU 48(SI), X4
+ PXOR X0, X1
+ ADDQ $64, SI // buf+=64
+ SUBQ $64, CX // len-=64
+ CMPQ CX, $64 // Less than 64 bytes left
+ JB remain64
+
+ MOVOA r2r1<>+0(SB), X0
+loopback64:
+ MOVOA X1, X5
+ MOVOA X2, X6
+ MOVOA X3, X7
+ MOVOA X4, X8
+
+ PCLMULQDQ $0, X0, X1
+ PCLMULQDQ $0, X0, X2
+ PCLMULQDQ $0, X0, X3
+ PCLMULQDQ $0, X0, X4
+
+ /* Load next early */
+ MOVOU (SI), X11
+ MOVOU 16(SI), X12
+ MOVOU 32(SI), X13
+ MOVOU 48(SI), X14
+
+ PCLMULQDQ $0x11, X0, X5
+ PCLMULQDQ $0x11, X0, X6
+ PCLMULQDQ $0x11, X0, X7
+ PCLMULQDQ $0x11, X0, X8
+
+ PXOR X5, X1
+ PXOR X6, X2
+ PXOR X7, X3
+ PXOR X8, X4
+
+ PXOR X11, X1
+ PXOR X12, X2
+ PXOR X13, X3
+ PXOR X14, X4
+
+ ADDQ $0x40, DI
+ ADDQ $64, SI // buf+=64
+ SUBQ $64, CX // len-=64
+ CMPQ CX, $64 // Less than 64 bytes left?
+ JGE loopback64
+
+ /* Fold result into a single register (X1) */
+remain64:
+ MOVOA r4r3<>+0(SB), X0
+
+ MOVOA X1, X5
+ PCLMULQDQ $0, X0, X1
+ PCLMULQDQ $0x11, X0, X5
+ PXOR X5, X1
+ PXOR X2, X1
+
+ MOVOA X1, X5
+ PCLMULQDQ $0, X0, X1
+ PCLMULQDQ $0x11, X0, X5
+ PXOR X5, X1
+ PXOR X3, X1
+
+ MOVOA X1, X5
+ PCLMULQDQ $0, X0, X1
+ PCLMULQDQ $0x11, X0, X5
+ PXOR X5, X1
+ PXOR X4, X1
+
+ /* If there is less than 16 bytes left we are done */
+ CMPQ CX, $16
+ JB finish
+
+ /* Encode 16 bytes */
+remain16:
+ MOVOU (SI), X10
+ MOVOA X1, X5
+ PCLMULQDQ $0, X0, X1
+ PCLMULQDQ $0x11, X0, X5
+ PXOR X5, X1
+ PXOR X10, X1
+ SUBQ $16, CX
+ ADDQ $16, SI
+ CMPQ CX, $16
+ JGE remain16
+
+finish:
+ /* Fold final result into 32 bits and return it */
+ PCMPEQB X3, X3
+ PCLMULQDQ $1, X1, X0
+ PSRLDQ $8, X1
+ PXOR X0, X1
+
+ MOVOA X1, X2
+ MOVQ r5<>+0(SB), X0
+
+ /* Creates 32 bit mask. Note that we don't care about upper half. */
+ PSRLQ $32, X3
+
+ PSRLDQ $4, X2
+ PAND X3, X1
+ PCLMULQDQ $0, X0, X1
+ PXOR X2, X1
+
+ MOVOA rupoly<>+0(SB), X0
+
+ MOVOA X1, X2
+ PAND X3, X1
+ PCLMULQDQ $0x10, X0, X1
+ PAND X3, X1
+ PCLMULQDQ $0, X0, X1
+ PXOR X2, X1
+
+ /* PEXTRD $1, X1, AX (SSE 4.1) */
+ BYTE $0x66; BYTE $0x0f; BYTE $0x3a;
+ BYTE $0x16; BYTE $0xc8; BYTE $0x01;
+ MOVL AX, ret+32(FP)
+
+ RET
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64 amd64p32
-
package crc32
// This file contains the code to call the SSE 4.2 version of the Castagnoli
// CRC.
-// haveSSE42 is defined in crc_amd64.s and uses CPUID to test for SSE 4.2
+// haveSSE42 is defined in crc_amd64p32.s and uses CPUID to test for SSE 4.2
// support.
func haveSSE42() bool
}
return update(crc, castagnoliTable, p)
}
+
+func updateIEEE(crc uint32, p []byte) uint32 {
+ // only use slicing-by-8 when input is >= 4KB
+ if len(p) >= 4096 {
+ iEEETable8Once.Do(func() {
+ iEEETable8 = makeTable8(IEEE)
+ })
+ return updateSlicingBy8(crc, iEEETable8, p)
+ }
+
+ return update(crc, IEEETable, p)
+}