From 7beb988a3b935a9db56b9e0544506491c4d5d06a Mon Sep 17 00:00:00 2001
From: Meng Zhuo
Date: Thu, 8 Apr 2021 16:08:55 +0800
Subject: [PATCH] runtime: use wyhash for memhashFallback on 64-bit platforms
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

wyhash is a general hash function that:
1. Is about 8-70% faster than the internal maphash
2. Passes SMHasher, BigCrush and PractRand tests

name                  old time/op    new time/op    delta
Hash5                 28.9ns ± 0%    30.0ns ± 0%     +3.77%  (p=0.000 n=9+10)
Hash16                32.4ns ± 0%    30.2ns ± 0%     -6.74%  (p=0.000 n=10+8)
Hash64                52.4ns ± 0%    43.4ns ± 0%    -17.20%  (p=0.000 n=9+10)
Hash1024               415ns ± 0%     258ns ± 2%    -37.89%  (p=0.000 n=10+10)
Hash65536             24.9µs ± 0%    14.6µs ± 0%    -41.22%  (p=0.000 n=9+9)
HashStringSpeed       50.2ns ± 4%    47.8ns ± 4%     -4.88%  (p=0.000 n=10+10)
HashBytesSpeed        90.1ns ± 7%    78.3ns ± 4%    -13.06%  (p=0.000 n=10+10)
HashInt32Speed        33.3ns ± 6%    33.6ns ± 4%       ~     (p=0.071 n=10+10)
HashInt64Speed        32.7ns ± 3%    34.0ns ± 3%     +4.05%  (p=0.000 n=9+10)
HashStringArraySpeed   131ns ± 2%     117ns ± 5%    -10.32%  (p=0.000 n=9+10)
FastrandHashiter      72.2ns ± 1%    75.7ns ±10%     +4.87%  (p=0.019 n=8+10)

name                  old speed      new speed      delta
Hash5                  173MB/s ± 0%   167MB/s ± 0%    -3.63%  (p=0.000 n=9+10)
Hash16                 494MB/s ± 0%   530MB/s ± 0%    +7.23%  (p=0.000 n=10+8)
Hash64                1.22GB/s ± 0%  1.48GB/s ± 0%   +20.77%  (p=0.000 n=9+10)
Hash1024              2.47GB/s ± 0%  3.97GB/s ± 2%   +61.01%  (p=0.000 n=8+10)
Hash65536             2.64GB/s ± 0%  4.48GB/s ± 0%   +70.13%  (p=0.000 n=9+9)

Change-Id: I76af4e2bc1995a18149d11983ea8a149c132865e
Reviewed-on: https://go-review.googlesource.com/c/go/+/279612
Trust: Meng Zhuo
Run-TryBot: Meng Zhuo
TryBot-Result: Go Bot
Reviewed-by: Keith Randall
---
 src/cmd/compile/internal/ssagen/ssa.go    |   1 +
 src/cmd/compile/internal/test/inl_test.go |   4 +-
 src/runtime/hash64.go                     | 136 ++++++++++------------
 src/runtime/internal/math/math.go         |  21 ++++
 4 files changed, 84 insertions(+), 78 deletions(-)

diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go
index 97b970012d..de60cbf390 100644
--- a/src/cmd/compile/internal/ssagen/ssa.go
+++ b/src/cmd/compile/internal/ssagen/ssa.go
@@ -4484,6 +4484,7 @@ func InitTables() {
 		},
 		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64)
 	alias("math/bits", "Mul", "math/bits", "Mul64", sys.ArchAMD64, sys.ArchARM64, sys.ArchPPC64, sys.ArchS390X, sys.ArchMIPS64, sys.ArchMIPS64LE)
+	alias("runtime/internal/math", "Mul64", "math/bits", "Mul64", sys.ArchAMD64, sys.ArchARM64, sys.ArchPPC64, sys.ArchS390X, sys.ArchMIPS64, sys.ArchMIPS64LE)
 	addF("math/bits", "Add64",
 		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 			return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
diff --git a/src/cmd/compile/internal/test/inl_test.go b/src/cmd/compile/internal/test/inl_test.go
index fb9942a8da..6f100033cf 100644
--- a/src/cmd/compile/internal/test/inl_test.go
+++ b/src/cmd/compile/internal/test/inl_test.go
@@ -175,8 +175,8 @@ func TestIntendedInlining(t *testing.T) {
 		want["runtime/internal/sys"] = append(want["runtime/internal/sys"], "Bswap32")
 	}
 	if bits.UintSize == 64 {
-		// rotl_31 is only defined on 64-bit architectures
-		want["runtime"] = append(want["runtime"], "rotl_31")
+		// mix is only defined on 64-bit architectures
+		want["runtime"] = append(want["runtime"], "mix")
 	}
 
 	switch runtime.GOARCH {
diff --git a/src/runtime/hash64.go b/src/runtime/hash64.go
index 1bee666bd7..5f7d00bf7f 100644
--- a/src/runtime/hash64.go
+++ b/src/runtime/hash64.go
@@ -3,107 +3,91 @@
 // license that can be found in the LICENSE file.
 
 // Hashing algorithm inspired by
-// xxhash: https://code.google.com/p/xxhash/
-// cityhash: https://code.google.com/p/cityhash/
+// wyhash: https://github.com/wangyi-fudan/wyhash
 
 //go:build amd64 || arm64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || wasm
 // +build amd64 arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x wasm
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/math"
+	"unsafe"
+)
 
 const (
-	// Constants for multiplication: four random odd 64-bit numbers.
-	m1 = 16877499708836156737
-	m2 = 2820277070424839065
-	m3 = 9497967016996688599
-	m4 = 15839092249703872147
+	m1 = 0xa0761d6478bd642f
+	m2 = 0xe7037ed1a0b428db
+	m3 = 0x8ebc6af09c88c6e3
+	m4 = 0x589965cc75374cc3
+	m5 = 0x1d8e4e27c47d124f
 )
 
 func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
-	h := uint64(seed + s*hashkey[0])
-tail:
+	var a, b uintptr
+	seed ^= hashkey[0] ^ m1
 	switch {
 	case s == 0:
+		return seed
 	case s < 4:
-		h ^= uint64(*(*byte)(p))
-		h ^= uint64(*(*byte)(add(p, s>>1))) << 8
-		h ^= uint64(*(*byte)(add(p, s-1))) << 16
-		h = rotl_31(h*m1) * m2
-	case s <= 8:
-		h ^= uint64(readUnaligned32(p))
-		h ^= uint64(readUnaligned32(add(p, s-4))) << 32
-		h = rotl_31(h*m1) * m2
+		a = uintptr(*(*byte)(p))
+		a |= uintptr(*(*byte)(add(p, s>>1))) << 8
+		a |= uintptr(*(*byte)(add(p, s-1))) << 16
+	case s == 4:
+		a = r4(p)
+		b = a
+	case s < 8:
+		a = r4(p)
+		b = r4(add(p, s-4))
+	case s == 8:
+		a = r8(p)
+		b = a
 	case s <= 16:
-		h ^= readUnaligned64(p)
-		h = rotl_31(h*m1) * m2
-		h ^= readUnaligned64(add(p, s-8))
-		h = rotl_31(h*m1) * m2
-	case s <= 32:
-		h ^= readUnaligned64(p)
-		h = rotl_31(h*m1) * m2
-		h ^= readUnaligned64(add(p, 8))
-		h = rotl_31(h*m1) * m2
-		h ^= readUnaligned64(add(p, s-16))
-		h = rotl_31(h*m1) * m2
-		h ^= readUnaligned64(add(p, s-8))
-		h = rotl_31(h*m1) * m2
+		a = r8(p)
+		b = r8(add(p, s-8))
 	default:
-		v1 := h
-		v2 := uint64(seed * hashkey[1])
-		v3 := uint64(seed * hashkey[2])
-		v4 := uint64(seed * hashkey[3])
-		for s >= 32 {
-			v1 ^= readUnaligned64(p)
-			v1 = rotl_31(v1*m1) * m2
-			p = add(p, 8)
-			v2 ^= readUnaligned64(p)
-			v2 = rotl_31(v2*m2) * m3
-			p = add(p, 8)
-			v3 ^= readUnaligned64(p)
-			v3 = rotl_31(v3*m3) * m4
-			p = add(p, 8)
-			v4 ^= readUnaligned64(p)
-			v4 = rotl_31(v4*m4) * m1
-			p = add(p, 8)
-			s -= 32
+		l := s
+		if l > 48 {
+			seed1 := seed
+			seed2 := seed
+			for ; l > 48; l -= 48 {
+				seed = mix(r8(p)^m2, r8(add(p, 8))^seed)
+				seed1 = mix(r8(add(p, 16))^m3, r8(add(p, 24))^seed1)
+				seed2 = mix(r8(add(p, 32))^m4, r8(add(p, 40))^seed2)
+				p = add(p, 48)
+			}
+			seed ^= seed1 ^ seed2
+		}
+		for ; l > 16; l -= 16 {
+			seed = mix(r8(p)^m2, r8(add(p, 8))^seed)
+			p = add(p, 16)
 		}
-		h = v1 ^ v2 ^ v3 ^ v4
-		goto tail
+		a = r8(add(p, l-16))
+		b = r8(add(p, l-8))
 	}
-	h ^= h >> 29
-	h *= m3
-	h ^= h >> 32
-	return uintptr(h)
+	return mix(m5^s, mix(a^m2, b^seed))
 }
 
 func memhash32Fallback(p unsafe.Pointer, seed uintptr) uintptr {
-	h := uint64(seed + 4*hashkey[0])
-	v := uint64(readUnaligned32(p))
-	h ^= v
-	h ^= v << 32
-	h = rotl_31(h*m1) * m2
-	h ^= h >> 29
-	h *= m3
-	h ^= h >> 32
-	return uintptr(h)
+	a := r4(p)
+	return mix(m5^4, mix(a^m2, a^seed^hashkey[0]^m1))
 }
 
 func memhash64Fallback(p unsafe.Pointer, seed uintptr) uintptr {
-	h := uint64(seed + 8*hashkey[0])
-	h ^= uint64(readUnaligned32(p)) | uint64(readUnaligned32(add(p, 4)))<<32
-	h = rotl_31(h*m1) * m2
-	h ^= h >> 29
-	h *= m3
-	h ^= h >> 32
-	return uintptr(h)
+	a := r8(p)
+	return mix(m5^8, mix(a^m2, a^seed^hashkey[0]^m1))
+}
+
+func mix(a, b uintptr) uintptr {
+	hi, lo := math.Mul64(uint64(a), uint64(b))
+	return uintptr(hi ^ lo)
+}
+
+func r4(p unsafe.Pointer) uintptr {
+	return uintptr(readUnaligned32(p))
 }
 
-// Note: in order to get the compiler to issue rotl instructions, we
-// need to constant fold the shift amount by hand.
-// TODO: convince the compiler to issue rotl instructions after inlining.
-func rotl_31(x uint64) uint64 {
-	return (x << 31) | (x >> (64 - 31))
+func r8(p unsafe.Pointer) uintptr {
+	return uintptr(readUnaligned64(p))
 }
diff --git a/src/runtime/internal/math/math.go b/src/runtime/internal/math/math.go
index 5385f5dd86..b6bd12d3e8 100644
--- a/src/runtime/internal/math/math.go
+++ b/src/runtime/internal/math/math.go
@@ -17,3 +17,24 @@ func MulUintptr(a, b uintptr) (uintptr, bool) {
 	overflow := b > MaxUintptr/a
 	return a * b, overflow
 }
+
+// Mul64 returns the 128-bit product of x and y: (hi, lo) = x * y
+// with the product bits' upper half returned in hi and the lower
+// half returned in lo.
+// This is a copy from math/bits.Mul64
+// On supported platforms this is an intrinsic lowered by the compiler.
+func Mul64(x, y uint64) (hi, lo uint64) {
+	const mask32 = 1<<32 - 1
+	x0 := x & mask32
+	x1 := x >> 32
+	y0 := y & mask32
+	y1 := y >> 32
+	w0 := x0 * y0
+	t := x1*y0 + w0>>32
+	w1 := t & mask32
+	w2 := t >> 32
+	w1 += x0 * y1
+	hi = x1*y1 + w2 + w1>>32
+	lo = x * y
+	return
+}
-- 
2.50.0
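
For readers who want to experiment with the scheme outside the runtime, the sketch below is a minimal approximation of the new memhashFallback, not the runtime code itself: math/bits.Mul64 stands in for runtime/internal/math.Mul64, encoding/binary's little-endian loads stand in for the runtime's readUnaligned32/readUnaligned64, byte slices replace unsafe.Pointer arithmetic, and a plain caller-chosen seed replaces the runtime's random hashkey, so its outputs will not match the runtime's memhash.

// wyhash_sketch.go: a portable approximation of the new memhashFallback.
// Not the runtime implementation; see the substitutions noted above.
package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

const (
	m1 = 0xa0761d6478bd642f
	m2 = 0xe7037ed1a0b428db
	m3 = 0x8ebc6af09c88c6e3
	m4 = 0x589965cc75374cc3
	m5 = 0x1d8e4e27c47d124f
)

// mix folds a 64x64->128-bit product back to 64 bits, as the runtime's mix
// does via runtime/internal/math.Mul64 (lowered to an intrinsic by ssa.go).
func mix(a, b uint64) uint64 {
	hi, lo := bits.Mul64(a, b)
	return hi ^ lo
}

// r4 and r8 model the runtime's readUnaligned32/readUnaligned64 on a
// little-endian machine.
func r4(b []byte) uint64 { return uint64(binary.LittleEndian.Uint32(b)) }
func r8(b []byte) uint64 { return binary.LittleEndian.Uint64(b) }

func hash(p []byte, seed uint64) uint64 {
	s := uint64(len(p))
	var a, b uint64
	seed ^= m1 // the runtime also mixes in its random hashkey[0] here
	switch {
	case s == 0:
		return seed
	case s < 4:
		a = uint64(p[0])
		a |= uint64(p[s>>1]) << 8
		a |= uint64(p[s-1]) << 16
	case s == 4:
		a = r4(p)
		b = a
	case s < 8:
		a = r4(p)
		b = r4(p[s-4:])
	case s == 8:
		a = r8(p)
		b = a
	case s <= 16:
		a = r8(p)
		b = r8(p[s-8:])
	default:
		q, l := p, s
		if l > 48 {
			seed1, seed2 := seed, seed
			for ; l > 48; l -= 48 {
				seed = mix(r8(q)^m2, r8(q[8:])^seed)
				seed1 = mix(r8(q[16:])^m3, r8(q[24:])^seed1)
				seed2 = mix(r8(q[32:])^m4, r8(q[40:])^seed2)
				q = q[48:]
			}
			seed ^= seed1 ^ seed2
		}
		for ; l > 16; l -= 16 {
			seed = mix(r8(q)^m2, r8(q[8:])^seed)
			q = q[16:]
		}
		// The runtime reads add(p, l-16) and add(p, l-8), which for this
		// loop shape are the last 16 bytes of the original input.
		a = r8(p[s-16:])
		b = r8(p[s-8:])
	}
	return mix(m5^s, mix(a^m2, b^seed))
}

func main() {
	fmt.Printf("%#016x\n", hash([]byte("hello, wyhash"), 1234))
}

The single widening multiply per mix call is also why the patch aliases runtime/internal/math.Mul64 to the math/bits.Mul64 intrinsic in ssa.go and updates inl_test.go to assert that mix stays inlinable; without the intrinsic, mix would fall back to the portable Mul64 added to runtime/internal/math above.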