From 2027b00e63d9128eaba4a0164072380561c0fc9c Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Sun, 30 Aug 2015 22:29:00 +0200
Subject: [PATCH] hash/crc32: add AMD64 optimized IEEE CRC calculation

IEEE is the most commonly used CRC-32 polynomial, used by zip, gzip and others.

Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf

benchmark                       old ns/op     new ns/op     delta
BenchmarkIEEECrc1KB-8           3193          352           -88.98%
BenchmarkIEEECrc4KB-8           5025          1307          -73.99%
BenchmarkCastagnoliCrc1KB-8     126           126           +0.00%

benchmark                       old MB/s     new MB/s     speedup
BenchmarkIEEECrc1KB-8           320.68       2901.92      9.05x
BenchmarkIEEECrc4KB-8           815.08       3131.80      3.84x
BenchmarkCastagnoliCrc1KB-8     8100.80      8109.78      1.00x

Change-Id: I99c9a48365f631827f516e44f97e86155f03cb90
Reviewed-on: https://go-review.googlesource.com/14080
Reviewed-by: Keith Randall <khr@golang.org>
---
 src/hash/crc32/crc32.go                       |  12 +-
 src/hash/crc32/crc32_amd64.go                 |  54 ++++++
 src/hash/crc32/crc32_amd64.s                  | 169 ++++++++++++++++++
 .../{crc32_amd64x.go => crc32_amd64p32.go}    |  16 +-
 src/hash/crc32/crc32_generic.go               |  11 ++
 5 files changed, 251 insertions(+), 11 deletions(-)
 create mode 100644 src/hash/crc32/crc32_amd64.go
 rename src/hash/crc32/{crc32_amd64x.go => crc32_amd64p32.go} (63%)

diff --git a/src/hash/crc32/crc32.go b/src/hash/crc32/crc32.go
index 1b5e0dbde0..d41755536e 100644
--- a/src/hash/crc32/crc32.go
+++ b/src/hash/crc32/crc32.go
@@ -151,15 +151,11 @@ func updateSlicingBy8(crc uint32, tab *slicing8Table, p []byte) uint32 {
 
 // Update returns the result of adding the bytes in p to the crc.
 func Update(crc uint32, tab *Table, p []byte) uint32 {
-	if tab == castagnoliTable {
+	switch tab {
+	case castagnoliTable:
 		return updateCastagnoli(crc, p)
-	}
-	// only use slicing-by-8 when input is larger than 4KB
-	if tab == IEEETable && len(p) >= 4096 {
-		iEEETable8Once.Do(func() {
-			iEEETable8 = makeTable8(IEEE)
-		})
-		return updateSlicingBy8(crc, iEEETable8, p)
+	case IEEETable:
+		return updateIEEE(crc, p)
 	}
 	return update(crc, tab, p)
 }
diff --git a/src/hash/crc32/crc32_amd64.go b/src/hash/crc32/crc32_amd64.go
new file mode 100644
index 0000000000..13e483db85
--- /dev/null
+++ b/src/hash/crc32/crc32_amd64.go
@@ -0,0 +1,54 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package crc32
+
+// This file contains the code to call the SSE 4.2 version of the Castagnoli
+// and IEEE CRC.
+
+// haveSSE41/haveSSE42/haveCLMUL are defined in crc_amd64.s and use
+// CPUID to test for SSE 4.1, 4.2 and CLMUL support.
+func haveSSE41() bool
+func haveSSE42() bool
+func haveCLMUL() bool
+
+// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32
+// instruction.
+func castagnoliSSE42(crc uint32, p []byte) uint32
+
+// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ
+// instruction as well as SSE 4.1.
+func ieeeCLMUL(crc uint32, p []byte) uint32
+
+var sse42 = haveSSE42()
+var useFastIEEE = haveCLMUL() && haveSSE41()
+
+func updateCastagnoli(crc uint32, p []byte) uint32 {
+	if sse42 {
+		return castagnoliSSE42(crc, p)
+	}
+	return update(crc, castagnoliTable, p)
+}
+
+func updateIEEE(crc uint32, p []byte) uint32 {
+	if useFastIEEE && len(p) >= 64 {
+		left := len(p) & 15
+		do := len(p) - left
+		crc = ^ieeeCLMUL(^crc, p[:do])
+		if left > 0 {
+			crc = update(crc, IEEETable, p[do:])
+		}
+		return crc
+	}
+
+	// only use slicing-by-8 when input is >= 4KB
+	if len(p) >= 4096 {
+		iEEETable8Once.Do(func() {
+			iEEETable8 = makeTable8(IEEE)
+		})
+		return updateSlicingBy8(crc, iEEETable8, p)
+	}
+
+	return update(crc, IEEETable, p)
+}
diff --git a/src/hash/crc32/crc32_amd64.s b/src/hash/crc32/crc32_amd64.s
index 30b0d0691c..11d9bb53d8 100644
--- a/src/hash/crc32/crc32_amd64.s
+++ b/src/hash/crc32/crc32_amd64.s
@@ -62,3 +62,172 @@ TEXT Â·haveSSE42(SB),NOSPLIT,$0
 	MOVB CX, ret+0(FP)
 	RET
 
+// func haveCLMUL() bool
+TEXT Â·haveCLMUL(SB),NOSPLIT,$0
+	XORQ AX, AX
+	INCL AX
+	CPUID
+	SHRQ $1, CX
+	ANDQ $1, CX
+	MOVB CX, ret+0(FP)
+	RET
+
+// func haveSSE41() bool
+TEXT Â·haveSSE41(SB),NOSPLIT,$0
+	XORQ AX, AX
+	INCL AX
+	CPUID
+	SHRQ $19, CX
+	ANDQ $1, CX
+	MOVB CX, ret+0(FP)
+	RET
+
+// CRC32 polynomial data
+//
+// These constants are lifted from the
+// Linux kernel, since they avoid the costly
+// PSHUFB 16 byte reversal proposed in the
+// original Intel paper.
+DATA r2r1<>+0(SB)/8, $0x154442bd4
+DATA r2r1<>+8(SB)/8, $0x1c6e41596
+DATA r4r3<>+0(SB)/8, $0x1751997d0
+DATA r4r3<>+8(SB)/8, $0x0ccaa009e
+DATA rupoly<>+0(SB)/8, $0x1db710641
+DATA rupoly<>+8(SB)/8, $0x1f7011641
+DATA r5<>+0(SB)/8, $0x163cd6124
+
+GLOBL r2r1<>(SB),RODATA,$16
+GLOBL r4r3<>(SB),RODATA,$16
+GLOBL rupoly<>(SB),RODATA,$16
+GLOBL r5<>(SB),RODATA,$8
+
+// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+// len(p) must be at least 64, and must be a multiple of 16.
+
+// func ieeeCLMUL(crc uint32, p []byte) uint32
+TEXT Â·ieeeCLMUL(SB),NOSPLIT,$0
+	MOVL   crc+0(FP), X0             // Initial CRC value
+	MOVQ   p+8(FP), SI  	         // data pointer
+	MOVQ   p_len+16(FP), CX          // len(p)
+
+	MOVOU  (SI), X1
+	MOVOU  16(SI), X2
+	MOVOU  32(SI), X3
+	MOVOU  48(SI), X4
+	PXOR   X0, X1
+	ADDQ   $64, SI                  // buf+=64
+	SUBQ   $64, CX                  // len-=64
+	CMPQ   CX, $64                  // Less than 64 bytes left
+	JB     remain64
+
+	MOVOA  r2r1<>+0(SB), X0
+loopback64:
+	MOVOA  X1, X5
+	MOVOA  X2, X6
+	MOVOA  X3, X7
+	MOVOA  X4, X8
+
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0, X0, X2
+	PCLMULQDQ $0, X0, X3
+	PCLMULQDQ $0, X0, X4
+
+	/* Load next early */
+	MOVOU    (SI), X11
+	MOVOU    16(SI), X12
+	MOVOU    32(SI), X13
+	MOVOU    48(SI), X14
+
+	PCLMULQDQ $0x11, X0, X5
+	PCLMULQDQ $0x11, X0, X6
+	PCLMULQDQ $0x11, X0, X7
+	PCLMULQDQ $0x11, X0, X8
+
+	PXOR     X5, X1
+	PXOR     X6, X2
+	PXOR     X7, X3
+	PXOR     X8, X4
+
+	PXOR     X11, X1
+	PXOR     X12, X2
+	PXOR     X13, X3
+	PXOR     X14, X4
+
+	ADDQ    $0x40, DI
+	ADDQ    $64, SI      // buf+=64
+	SUBQ    $64, CX      // len-=64
+	CMPQ    CX, $64      // Less than 64 bytes left?
+	JGE     loopback64
+
+	/* Fold result into a single register (X1) */
+remain64:
+	MOVOA       r4r3<>+0(SB), X0
+
+	MOVOA       X1, X5
+	PCLMULQDQ   $0, X0, X1
+	PCLMULQDQ   $0x11, X0, X5
+	PXOR        X5, X1
+	PXOR        X2, X1
+
+	MOVOA       X1, X5
+	PCLMULQDQ   $0, X0, X1
+	PCLMULQDQ   $0x11, X0, X5
+	PXOR        X5, X1
+	PXOR        X3, X1
+
+	MOVOA       X1, X5
+	PCLMULQDQ   $0, X0, X1
+	PCLMULQDQ   $0x11, X0, X5
+	PXOR        X5, X1
+	PXOR        X4, X1
+
+	/* If there is less than 16 bytes left we are done */
+	CMPQ        CX, $16
+	JB          finish
+
+	/* Encode 16 bytes */
+remain16:
+	MOVOU       (SI), X10
+	MOVOA       X1, X5
+	PCLMULQDQ   $0, X0, X1
+	PCLMULQDQ   $0x11, X0, X5
+	PXOR        X5, X1
+	PXOR        X10, X1
+	SUBQ        $16, CX
+	ADDQ        $16, SI
+	CMPQ        CX, $16
+	JGE         remain16
+
+finish:
+	/* Fold final result into 32 bits and return it */
+	PCMPEQB     X3, X3
+	PCLMULQDQ   $1, X1, X0
+	PSRLDQ      $8, X1
+	PXOR        X0, X1
+
+	MOVOA       X1, X2
+	MOVQ        r5<>+0(SB), X0
+
+	/* Creates 32 bit mask. Note that we don't care about upper half. */
+	PSRLQ       $32, X3
+
+	PSRLDQ      $4, X2
+	PAND        X3, X1
+	PCLMULQDQ   $0, X0, X1
+	PXOR        X2, X1
+
+	MOVOA       rupoly<>+0(SB), X0
+
+	MOVOA       X1, X2
+	PAND        X3, X1
+	PCLMULQDQ   $0x10, X0, X1
+	PAND        X3, X1
+	PCLMULQDQ   $0, X0, X1
+	PXOR        X2, X1
+
+	/* PEXTRD   $1, X1, AX  (SSE 4.1) */
+	BYTE $0x66; BYTE $0x0f; BYTE $0x3a;
+	BYTE $0x16; BYTE $0xc8; BYTE $0x01;
+	MOVL        AX, ret+32(FP)
+
+	RET
diff --git a/src/hash/crc32/crc32_amd64x.go b/src/hash/crc32/crc32_amd64p32.go
similarity index 63%
rename from src/hash/crc32/crc32_amd64x.go
rename to src/hash/crc32/crc32_amd64p32.go
index b7e359930a..40241c5835 100644
--- a/src/hash/crc32/crc32_amd64x.go
+++ b/src/hash/crc32/crc32_amd64p32.go
@@ -2,14 +2,12 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build amd64 amd64p32
-
 package crc32
 
 // This file contains the code to call the SSE 4.2 version of the Castagnoli
 // CRC.
 
-// haveSSE42 is defined in crc_amd64.s and uses CPUID to test for SSE 4.2
+// haveSSE42 is defined in crc_amd64p32.s and uses CPUID to test for SSE 4.2
 // support.
 func haveSSE42() bool
 
@@ -25,3 +23,15 @@ func updateCastagnoli(crc uint32, p []byte) uint32 {
 	}
 	return update(crc, castagnoliTable, p)
 }
+
+func updateIEEE(crc uint32, p []byte) uint32 {
+	// only use slicing-by-8 when input is >= 4KB
+	if len(p) >= 4096 {
+		iEEETable8Once.Do(func() {
+			iEEETable8 = makeTable8(IEEE)
+		})
+		return updateSlicingBy8(crc, iEEETable8, p)
+	}
+
+	return update(crc, IEEETable, p)
+}
diff --git a/src/hash/crc32/crc32_generic.go b/src/hash/crc32/crc32_generic.go
index 416c1b7c55..d2355c83df 100644
--- a/src/hash/crc32/crc32_generic.go
+++ b/src/hash/crc32/crc32_generic.go
@@ -12,3 +12,14 @@ package crc32
 func updateCastagnoli(crc uint32, p []byte) uint32 {
 	return update(crc, castagnoliTable, p)
 }
+
+func updateIEEE(crc uint32, p []byte) uint32 {
+	// only use slicing-by-8 when input is >= 4KB
+	if len(p) >= 4096 {
+		iEEETable8Once.Do(func() {
+			iEEETable8 = makeTable8(IEEE)
+		})
+		return updateSlicingBy8(crc, iEEETable8, p)
+	}
+	return update(crc, IEEETable, p)
+}
-- 
2.51.0