Cypherpunks repositories - gostls13.git/commitdiff
hash/crc32: add AVX512 IEEE CRC32 calculation
author Klaus Post <klauspost@gmail.com>
Wed, 23 Jul 2025 11:23:52 +0000 (11:23 +0000)
committer Gopher Robot <gobot@golang.org>
Thu, 24 Jul 2025 17:19:17 +0000 (10:19 -0700)
Benchmark:

goos: windows
goarch: amd64
pkg: hash/crc32
cpu: AMD Ryzen 9 9950X 16-Core Processor

benchmark                                               old MB/s     new MB/s     speedup
BenchmarkCRC32/poly=IEEE/size=15/align=0-32             1081.48      1089.42      1.01x
BenchmarkCRC32/poly=IEEE/size=15/align=1-32             1085.87      1082.61      1.00x
BenchmarkCRC32/poly=IEEE/size=40/align=0-32             2756.33      2752.37      1.00x
BenchmarkCRC32/poly=IEEE/size=40/align=1-32             2758.27      2756.99      1.00x
BenchmarkCRC32/poly=IEEE/size=512/align=0-32            18133.44     18076.52     1.00x
BenchmarkCRC32/poly=IEEE/size=512/align=1-32            18151.05     18055.41     0.99x
BenchmarkCRC32/poly=IEEE/size=1kB/align=0-32            19902.93     48581.07     2.44x
BenchmarkCRC32/poly=IEEE/size=1kB/align=1-32            19966.99     48393.25     2.42x
BenchmarkCRC32/poly=IEEE/size=4kB/align=0-32            21690.33     51679.25     2.38x
BenchmarkCRC32/poly=IEEE/size=4kB/align=1-32            21655.30     51731.22     2.39x
BenchmarkCRC32/poly=IEEE/size=32kB/align=0-32           22046.57     46406.90     2.10x
BenchmarkCRC32/poly=IEEE/size=32kB/align=1-32           21986.22     46250.66     2.10x

AVX512 is enabled for input sizes of 1KB and above.

This rather high limit is used because AVX512 may be slower to ramp up
than the regular SSE4 implementation for smaller inputs.

This is not reflected in the benchmarks,
since consecutive calls mean the CPU is "hot".

The 'HasAVX512VPCLMULQDQ' name mirrors the one in golang.org/x/sys/cpu.

Change-Id: Id23685d8e3cc412b6d397a7d70056844bdb79271

Change-Id: Id23685d8e3cc412b6d397a7d70056844bdb79271
GitHub-Last-Rev: 6639f07b9febc7c96a7f3b402a2fd60f7be5e154
GitHub-Pull-Request: golang/go#74701
Reviewed-on: https://go-review.googlesource.com/c/go/+/689435
Reviewed-by: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
src/hash/crc32/crc32_amd64.go
src/hash/crc32/crc32_amd64.s
src/internal/cpu/cpu.go
src/internal/cpu/cpu_x86.go

index 6be129f5ddd4c0a7d51edb62d32429b6785acbf6..105ce01a1e3c5e8bd44f958883553f96699fe764 100644 (file)
@@ -13,6 +13,11 @@ import (
        "unsafe"
 )
 
+// Offset into internal/cpu records for use in assembly.
+const (
+       offsetX86HasAVX512VPCLMULQDQL = unsafe.Offsetof(cpu.X86.HasAVX512VPCLMULQDQ)
+)
+
 // This file contains the code to call the SSE 4.2 version of the Castagnoli
 // and IEEE CRC.
 
index 6af6c253a79003316e23605da9af65a4c30ecc92..4c482dc4a7f1d357952d04421ee3810293018a67 100644 (file)
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include "textflag.h"
+#include "go_asm.h"
 
 // castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
 //
@@ -136,15 +137,23 @@ loop:
 // Linux kernel, since they avoid the costly
 // PSHUFB 16 byte reversal proposed in the
 // original Intel paper.
+// Splatted so it can be loaded with a single VMOVDQU64
 DATA r2r1<>+0(SB)/8, $0x154442bd4
 DATA r2r1<>+8(SB)/8, $0x1c6e41596
+DATA r2r1<>+16(SB)/8, $0x154442bd4
+DATA r2r1<>+24(SB)/8, $0x1c6e41596
+DATA r2r1<>+32(SB)/8, $0x154442bd4
+DATA r2r1<>+40(SB)/8, $0x1c6e41596
+DATA r2r1<>+48(SB)/8, $0x154442bd4
+DATA r2r1<>+56(SB)/8, $0x1c6e41596
+
 DATA r4r3<>+0(SB)/8, $0x1751997d0
 DATA r4r3<>+8(SB)/8, $0x0ccaa009e
 DATA rupoly<>+0(SB)/8, $0x1db710641
 DATA rupoly<>+8(SB)/8, $0x1f7011641
 DATA r5<>+0(SB)/8, $0x163cd6124
 
-GLOBL r2r1<>(SB),RODATA,$16
+GLOBL r2r1<>(SB), RODATA, $64
 GLOBL r4r3<>(SB),RODATA,$16
 GLOBL rupoly<>(SB),RODATA,$16
 GLOBL r5<>(SB),RODATA,$8
@@ -158,6 +167,43 @@ TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
        MOVQ   p+8(FP), SI               // data pointer
        MOVQ   p_len+16(FP), CX          // len(p)
 
+       // Check feature support and length to be >= 1024 bytes.
+       CMPB internal∕cpu·X86+const_offsetX86HasAVX512VPCLMULQDQL(SB), $1
+       JNE  useSSE42
+       CMPQ CX, $1024
+       JL   useSSE42
+
+       // Use AVX512
+       VPXORQ    Z0, Z0, Z0
+       VMOVQ     AX, X0
+       VMOVDQU64 (SI), Z1
+       VPXORQ    Z0, Z1, Z1 // Merge initial CRC value into Z1
+       ADDQ      $64, SI    // buf+=64
+       SUBQ      $64, CX    // len-=64
+
+       VMOVDQU64 r2r1<>+0(SB), Z0
+
+loopback64Avx512:
+       VMOVDQU64  (SI), Z11          // Load next
+       VPCLMULQDQ $0x11, Z0, Z1, Z5
+       VPCLMULQDQ $0, Z0, Z1, Z1
+       VPTERNLOGD $0x96, Z11, Z5, Z1 // Combine results with xor into Z1
+
+       ADDQ $0x40, DI
+       ADDQ $64, SI    // buf+=64
+       SUBQ $64, CX    // len-=64
+       CMPQ CX, $64    // Less than 64 bytes left?
+       JGE  loopback64Avx512
+
+       // Unfold result into XMM1-XMM4 to match SSE4 code.
+       VEXTRACTF32X4 $1, Z1, X2 // X2: Second 128-bit lane
+       VEXTRACTF32X4 $2, Z1, X3 // X3: Third 128-bit lane
+       VEXTRACTF32X4 $3, Z1, X4 // X4: Fourth 128-bit lane
+       VZEROUPPER
+       JMP remain64
+
+       PCALIGN $16
+useSSE42:
        MOVOU  (SI), X1
        MOVOU  16(SI), X2
        MOVOU  32(SI), X3
@@ -207,6 +253,7 @@ loopback64:
        CMPQ    CX, $64      // Less than 64 bytes left?
        JGE     loopback64
 
+       PCALIGN $16
        /* Fold result into a single register (X1) */
 remain64:
        MOVOA       r4r3<>+0(SB), X0
index 760dc0b469d83d355ef0cc020d5309870db8da07..6017b1acc9fe96f899396ad174c2c4e325e0910d 100644 (file)
@@ -26,29 +26,30 @@ var CacheLineSize uintptr = CacheLinePadSize
 // in addition to the cpuid feature bit being set.
 // The struct is padded to avoid false sharing.
 var X86 struct {
-       _            CacheLinePad
-       HasAES       bool
-       HasADX       bool
-       HasAVX       bool
-       HasAVX2      bool
-       HasAVX512F   bool
-       HasAVX512BW  bool
-       HasAVX512VL  bool
-       HasBMI1      bool
-       HasBMI2      bool
-       HasERMS      bool
-       HasFSRM      bool
-       HasFMA       bool
-       HasOSXSAVE   bool
-       HasPCLMULQDQ bool
-       HasPOPCNT    bool
-       HasRDTSCP    bool
-       HasSHA       bool
-       HasSSE3      bool
-       HasSSSE3     bool
-       HasSSE41     bool
-       HasSSE42     bool
-       _            CacheLinePad
+       _                   CacheLinePad
+       HasAES              bool
+       HasADX              bool
+       HasAVX              bool
+       HasAVX2             bool
+       HasAVX512F          bool
+       HasAVX512BW         bool
+       HasAVX512VL         bool
+       HasBMI1             bool
+       HasBMI2             bool
+       HasERMS             bool
+       HasFSRM             bool
+       HasFMA              bool
+       HasOSXSAVE          bool
+       HasPCLMULQDQ        bool
+       HasPOPCNT           bool
+       HasRDTSCP           bool
+       HasSHA              bool
+       HasSSE3             bool
+       HasSSSE3            bool
+       HasSSE41            bool
+       HasSSE42            bool
+       HasAVX512VPCLMULQDQ bool
+       _                   CacheLinePad
 }
 
 // The booleans in ARM contain the correspondingly named cpu feature bit.
index ee812076e96c490772155f5354cc8c574ef8d92c..69b9542ae2a1f504f7e99a0f438c9dd04bb25e86 100644 (file)
@@ -40,6 +40,10 @@ const (
        cpuid_SHA      = 1 << 29
        cpuid_AVX512BW = 1 << 30
        cpuid_AVX512VL = 1 << 31
+
+       // ecx bits
+       cpuid_AVX512VPCLMULQDQ = 1 << 10
+
        // edx bits
        cpuid_FSRM = 1 << 4
        // edx bits for CPUID 0x80000001
@@ -57,6 +61,7 @@ func doinit() {
                {Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ},
                {Name: "rdtscp", Feature: &X86.HasRDTSCP},
                {Name: "sha", Feature: &X86.HasSHA},
+               {Name: "vpclmulqdq", Feature: &X86.HasAVX512VPCLMULQDQ},
        }
        level := getGOAMD64level()
        if level < 2 {
@@ -139,7 +144,7 @@ func doinit() {
                return
        }
 
-       _, ebx7, _, edx7 := cpuid(7, 0)
+       _, ebx7, ecx7, edx7 := cpuid(7, 0)
        X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
        X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
        X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
@@ -151,6 +156,7 @@ func doinit() {
        if X86.HasAVX512F {
                X86.HasAVX512BW = isSet(ebx7, cpuid_AVX512BW)
                X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL)
+               X86.HasAVX512VPCLMULQDQ = isSet(ecx7, cpuid_AVX512VPCLMULQDQ)
        }
 
        X86.HasFSRM = isSet(edx7, cpuid_FSRM)