From: Klaus Post Date: Wed, 23 Jul 2025 11:23:52 +0000 (+0000) Subject: hash/crc32: add AVX512 IEEE CRC32 calculation X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=18dbe5b941e03a61cebbb441a9e4dfef43adf425;p=gostls13.git hash/crc32: add AVX512 IEEE CRC32 calculation Benchmark: goos: windows goarch: amd64 pkg: hash/crc32 cpu: AMD Ryzen 9 9950X 16-Core Processor benchmark old MB/s new MB/s speedup BenchmarkCRC32/poly=IEEE/size=15/align=0-32 1081.48 1089.42 1.01x BenchmarkCRC32/poly=IEEE/size=15/align=1-32 1085.87 1082.61 1.00x BenchmarkCRC32/poly=IEEE/size=40/align=0-32 2756.33 2752.37 1.00x BenchmarkCRC32/poly=IEEE/size=40/align=1-32 2758.27 2756.99 1.00x BenchmarkCRC32/poly=IEEE/size=512/align=0-32 18133.44 18076.52 1.00x BenchmarkCRC32/poly=IEEE/size=512/align=1-32 18151.05 18055.41 0.99x BenchmarkCRC32/poly=IEEE/size=1kB/align=0-32 19902.93 48581.07 2.44x BenchmarkCRC32/poly=IEEE/size=1kB/align=1-32 19966.99 48393.25 2.42x BenchmarkCRC32/poly=IEEE/size=4kB/align=0-32 21690.33 51679.25 2.38x BenchmarkCRC32/poly=IEEE/size=4kB/align=1-32 21655.30 51731.22 2.39x BenchmarkCRC32/poly=IEEE/size=32kB/align=0-32 22046.57 46406.90 2.10x BenchmarkCRC32/poly=IEEE/size=32kB/align=1-32 21986.22 46250.66 2.10x AVX512 is enabled for inputs of 1 KB (1024 bytes) or larger. This rather high threshold was chosen because AVX512 may be slower to ramp up than the regular SSE4 implementation for smaller inputs. That ramp-up cost is not reflected in the benchmarks, since consecutive calls mean the CPU is already "hot". 
The 'HasAVX512VPCLMULQDQ' name mirrors the one in golang.org/x/sys/cpu Change-Id: Id23685d8e3cc412b6d397a7d70056844bdb79271 Change-Id: Id23685d8e3cc412b6d397a7d70056844bdb79271 GitHub-Last-Rev: 6639f07b9febc7c96a7f3b402a2fd60f7be5e154 GitHub-Pull-Request: golang/go#74701 Reviewed-on: https://go-review.googlesource.com/c/go/+/689435 Reviewed-by: Keith Randall LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Knyszek Auto-Submit: Keith Randall Auto-Submit: Michael Knyszek Reviewed-by: Keith Randall --- diff --git a/src/hash/crc32/crc32_amd64.go b/src/hash/crc32/crc32_amd64.go index 6be129f5dd..105ce01a1e 100644 --- a/src/hash/crc32/crc32_amd64.go +++ b/src/hash/crc32/crc32_amd64.go @@ -13,6 +13,11 @@ import ( "unsafe" ) +// Offset into internal/cpu records for use in assembly. +const ( + offsetX86HasAVX512VPCLMULQDQL = unsafe.Offsetof(cpu.X86.HasAVX512VPCLMULQDQ) +) + // This file contains the code to call the SSE 4.2 version of the Castagnoli // and IEEE CRC. diff --git a/src/hash/crc32/crc32_amd64.s b/src/hash/crc32/crc32_amd64.s index 6af6c253a7..4c482dc4a7 100644 --- a/src/hash/crc32/crc32_amd64.s +++ b/src/hash/crc32/crc32_amd64.s @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. #include "textflag.h" +#include "go_asm.h" // castagnoliSSE42 updates the (non-inverted) crc with the given buffer. // @@ -136,15 +137,23 @@ loop: // Linux kernel, since they avoid the costly // PSHUFB 16 byte reversal proposed in the // original Intel paper. 
+// Splatted so it can be loaded with a single VMOVDQU64 DATA r2r1<>+0(SB)/8, $0x154442bd4 DATA r2r1<>+8(SB)/8, $0x1c6e41596 +DATA r2r1<>+16(SB)/8, $0x154442bd4 +DATA r2r1<>+24(SB)/8, $0x1c6e41596 +DATA r2r1<>+32(SB)/8, $0x154442bd4 +DATA r2r1<>+40(SB)/8, $0x1c6e41596 +DATA r2r1<>+48(SB)/8, $0x154442bd4 +DATA r2r1<>+56(SB)/8, $0x1c6e41596 + DATA r4r3<>+0(SB)/8, $0x1751997d0 DATA r4r3<>+8(SB)/8, $0x0ccaa009e DATA rupoly<>+0(SB)/8, $0x1db710641 DATA rupoly<>+8(SB)/8, $0x1f7011641 DATA r5<>+0(SB)/8, $0x163cd6124 -GLOBL r2r1<>(SB),RODATA,$16 +GLOBL r2r1<>(SB), RODATA, $64 GLOBL r4r3<>(SB),RODATA,$16 GLOBL rupoly<>(SB),RODATA,$16 GLOBL r5<>(SB),RODATA,$8 @@ -158,6 +167,43 @@ TEXT ·ieeeCLMUL(SB),NOSPLIT,$0 MOVQ p+8(FP), SI // data pointer MOVQ p_len+16(FP), CX // len(p) + // Check feature support and length to be >= 1024 bytes. + CMPB internal∕cpu·X86+const_offsetX86HasAVX512VPCLMULQDQL(SB), $1 + JNE useSSE42 + CMPQ CX, $1024 + JL useSSE42 + + // Use AVX512 + VPXORQ Z0, Z0, Z0 + VMOVQ AX, X0 + VMOVDQU64 (SI), Z1 + VPXORQ Z0, Z1, Z1 // Merge initial CRC value into Z1 + ADDQ $64, SI // buf+=64 + SUBQ $64, CX // len-=64 + + VMOVDQU64 r2r1<>+0(SB), Z0 + +loopback64Avx512: + VMOVDQU64 (SI), Z11 // Load next + VPCLMULQDQ $0x11, Z0, Z1, Z5 + VPCLMULQDQ $0, Z0, Z1, Z1 + VPTERNLOGD $0x96, Z11, Z5, Z1 // Combine results with xor into Z1 + + ADDQ $0x40, DI + ADDQ $64, SI // buf+=64 + SUBQ $64, CX // len-=64 + CMPQ CX, $64 // Less than 64 bytes left? + JGE loopback64Avx512 + + // Unfold result into XMM1-XMM4 to match SSE4 code. + VEXTRACTF32X4 $1, Z1, X2 // X2: Second 128-bit lane + VEXTRACTF32X4 $2, Z1, X3 // X3: Third 128-bit lane + VEXTRACTF32X4 $3, Z1, X4 // X4: Fourth 128-bit lane + VZEROUPPER + JMP remain64 + + PCALIGN $16 +useSSE42: MOVOU (SI), X1 MOVOU 16(SI), X2 MOVOU 32(SI), X3 @@ -207,6 +253,7 @@ loopback64: CMPQ CX, $64 // Less than 64 bytes left? 
JGE loopback64 + PCALIGN $16 /* Fold result into a single register (X1) */ remain64: MOVOA r4r3<>+0(SB), X0 diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go index 760dc0b469..6017b1acc9 100644 --- a/src/internal/cpu/cpu.go +++ b/src/internal/cpu/cpu.go @@ -26,29 +26,30 @@ var CacheLineSize uintptr = CacheLinePadSize // in addition to the cpuid feature bit being set. // The struct is padded to avoid false sharing. var X86 struct { - _ CacheLinePad - HasAES bool - HasADX bool - HasAVX bool - HasAVX2 bool - HasAVX512F bool - HasAVX512BW bool - HasAVX512VL bool - HasBMI1 bool - HasBMI2 bool - HasERMS bool - HasFSRM bool - HasFMA bool - HasOSXSAVE bool - HasPCLMULQDQ bool - HasPOPCNT bool - HasRDTSCP bool - HasSHA bool - HasSSE3 bool - HasSSSE3 bool - HasSSE41 bool - HasSSE42 bool - _ CacheLinePad + _ CacheLinePad + HasAES bool + HasADX bool + HasAVX bool + HasAVX2 bool + HasAVX512F bool + HasAVX512BW bool + HasAVX512VL bool + HasBMI1 bool + HasBMI2 bool + HasERMS bool + HasFSRM bool + HasFMA bool + HasOSXSAVE bool + HasPCLMULQDQ bool + HasPOPCNT bool + HasRDTSCP bool + HasSHA bool + HasSSE3 bool + HasSSSE3 bool + HasSSE41 bool + HasSSE42 bool + HasAVX512VPCLMULQDQ bool + _ CacheLinePad } // The booleans in ARM contain the correspondingly named cpu feature bit. 
diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go index ee812076e9..69b9542ae2 100644 --- a/src/internal/cpu/cpu_x86.go +++ b/src/internal/cpu/cpu_x86.go @@ -40,6 +40,10 @@ const ( cpuid_SHA = 1 << 29 cpuid_AVX512BW = 1 << 30 cpuid_AVX512VL = 1 << 31 + + // ecx bits + cpuid_AVX512VPCLMULQDQ = 1 << 10 + // edx bits cpuid_FSRM = 1 << 4 // edx bits for CPUID 0x80000001 @@ -57,6 +61,7 @@ func doinit() { {Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ}, {Name: "rdtscp", Feature: &X86.HasRDTSCP}, {Name: "sha", Feature: &X86.HasSHA}, + {Name: "vpclmulqdq", Feature: &X86.HasAVX512VPCLMULQDQ}, } level := getGOAMD64level() if level < 2 { @@ -139,7 +144,7 @@ func doinit() { return } - _, ebx7, _, edx7 := cpuid(7, 0) + _, ebx7, ecx7, edx7 := cpuid(7, 0) X86.HasBMI1 = isSet(ebx7, cpuid_BMI1) X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX X86.HasBMI2 = isSet(ebx7, cpuid_BMI2) @@ -151,6 +156,7 @@ func doinit() { if X86.HasAVX512F { X86.HasAVX512BW = isSet(ebx7, cpuid_AVX512BW) X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL) + X86.HasAVX512VPCLMULQDQ = isSet(ecx7, cpuid_AVX512VPCLMULQDQ) } X86.HasFSRM = isSet(edx7, cpuid_FSRM)