hash/maphash: revise API to be more idiomatic

author Russ Cox <rsc@golang.org>

Mon, 4 Nov 2019 18:29:29 +0000 (13:29 -0500)

committer Russ Cox <rsc@golang.org>

Mon, 4 Nov 2019 21:30:29 +0000 (21:30 +0000)
author Russ Cox <rsc@golang.org>
Mon, 4 Nov 2019 18:29:29 +0000 (13:29 -0500)
committer Russ Cox <rsc@golang.org>
Mon, 4 Nov 2019 21:30:29 +0000 (21:30 +0000)
diff --git a/src/hash/maphash/maphash.go b/src/hash/maphash/maphash.go

index 0cd4769c03762c68698ea84a1b4f8a3c0e6e2e75..9b6c0cfb99860300f09b0d0bad438ded899ea00e 100644 (file)
--- a/src/hash/maphash/maphash.go
+++ b/src/hash/maphash/maphash.go
@@ -2,65 +2,89 @@
  // Use of this source code is governed by a BSD-style
  // license that can be found in the LICENSE file.
  
-// Package hash/maphash provides hash functions on byte sequences. These
-// hash functions are intended to be used to implement hash tables or
+// Package maphash provides hash functions on byte sequences.
+// These hash functions are intended to be used to implement hash tables or
  // other data structures that need to map arbitrary strings or byte
-// sequences to a uniform distribution of integers. The hash functions
-// are collision-resistant but are not cryptographically secure (use
-// one of the hash functions in crypto/* if you need that).
+// sequences to a uniform distribution of integers.
  //
-// The produced hashes depend only on the sequence of bytes provided
-// to the Hash object, not on the way in which they are provided. For
-// example, the calls
-//     h.AddString("foo")
-//     h.AddBytes([]byte{'f','o','o'})
-//     h.AddByte('f'); h.AddByte('o'); h.AddByte('o')
-// will all have the same effect.
-//
-// Two Hash instances in the same process using the same seed
-// behave identically.
-//
-// Two Hash instances with the same seed in different processes are
-// not guaranteed to behave identically, even if the processes share
-// the same binary.
-//
-// Hashes are intended to be collision-resistant, even for situations
-// where an adversary controls the byte sequences being hashed.
-// All bits of the Hash result are close to uniformly and
-// independently distributed, so can be safely restricted to a range
-// using bit masking, shifting, or modular arithmetic.
+// The hash functions are collision-resistant but not cryptographically secure.
+// (See crypto/sha256 and crypto/sha512 for cryptographic use.)
  package maphash
  
-import (
-       "unsafe"
-)
+import "unsafe"
  
-// A Seed controls the behavior of a Hash.  Two Hash objects with the
-// same seed in the same process will behave identically.  Two Hash
-// objects with different seeds will very likely behave differently.
+// A Seed is a random value that selects the specific hash function
+// computed by a Hash. If two Hashes use the same Seeds, they
+// will compute the same hash values for any given input.
+// If two Hashes use different Seeds, they are very likely to compute
+// distinct hash values for any given input.
+//
+// A Seed must be initialized by calling MakeSeed.
+// The zero seed is uninitialized and not valid for use with Hash's SetSeed method.
+//
+// Each Seed value is local to a single process and cannot be serialized
+// or otherwise recreated in a different process.
  type Seed struct {
         s uint64
  }
  
-// A Hash object is used to compute the hash of a byte sequence.
+// A Hash computes a seeded hash of a byte sequence.
+//
+// The zero Hash is a valid Hash ready to use.
+// A zero Hash chooses a random seed for itself during
+// the first call to a Reset, Write, Seed, Sum64, or Seed method.
+// For control over the seed, use SetSeed.
+//
+// The computed hash values depend only on the initial seed and
+// the sequence of bytes provided to the Hash object, not on the way
+// in which the bytes are provided. For example, the three sequences
+//
+//     h.Write([]byte{'f','o','o'})
+//     h.WriteByte('f'); h.WriteByte('o'); h.WriteByte('o')
+//     h.WriteString("foo")
+//
+// all have the same effect.
+//
+// Hashes are intended to be collision-resistant, even for situations
+// where an adversary controls the byte sequences being hashed.
+//
+// A Hash is not safe for concurrent use by multiple goroutines, but a Seed is.
+// If multiple goroutines must compute the same seeded hash,
+// each can declare its own Hash and call SetSeed with a common Seed.
  type Hash struct {
-       seed  Seed     // initial seed used for this hash
-       state Seed     // current hash of all flushed bytes
-       buf   [64]byte // unflushed byte buffer
-       n     int      // number of unflushed bytes
+       _     [0]func() // not comparable
+       seed  Seed      // initial seed used for this hash
+       state Seed      // current hash of all flushed bytes
+       buf   [64]byte  // unflushed byte buffer
+       n     int       // number of unflushed bytes
+}
+
+// initSeed seeds the hash if necessary.
+// initSeed is called lazily before any operation that actually uses h.seed/h.state.
+// Note that this does not include Write/WriteByte/WriteString in the case
+// where they only add to h.buf. (If they write too much, they call h.flush,
+// which does call h.initSeed.)
+func (h *Hash) initSeed() {
+       if h.seed.s == 0 {
+               h.SetSeed(MakeSeed())
+       }
  }
  
-// AddByte adds b to the sequence of bytes hashed by h.
-func (h *Hash) AddByte(b byte) {
+// WriteByte adds b to the sequence of bytes hashed by h.
+// It never fails; the error result is for implementing io.ByteWriter.
+func (h *Hash) WriteByte(b byte) error {
         if h.n == len(h.buf) {
                 h.flush()
         }
         h.buf[h.n] = b
         h.n++
+       return nil
  }
  
-// AddBytes adds b to the sequence of bytes hashed by h.
-func (h *Hash) AddBytes(b []byte) {
+// Write adds b to the sequence of bytes hashed by h.
+// It always writes all of b and never fails; the count and error result are for implementing io.Writer.
+func (h *Hash) Write(b []byte) (int, error) {
+       size := len(b)
         for h.n+len(b) > len(h.buf) {
                 k := copy(h.buf[h.n:], b)
                 h.n = len(h.buf)
@@ -68,10 +92,13 @@ func (h *Hash) AddBytes(b []byte) {
                 h.flush()
         }
         h.n += copy(h.buf[h.n:], b)
+       return size, nil
  }
  
-// AddString adds the bytes of s to the sequence of bytes hashed by h.
-func (h *Hash) AddString(s string) {
+// WriteString adds the bytes of s to the sequence of bytes hashed by h.
+// It always writes all of s and never fails; the count and error result are for implementing io.StringWriter.
+func (h *Hash) WriteString(s string) (int, error) {
+       size := len(s)
         for h.n+len(s) > len(h.buf) {
                 k := copy(h.buf[h.n:], s)
                 h.n = len(h.buf)
@@ -79,19 +106,24 @@ func (h *Hash) AddString(s string) {
                 h.flush()
         }
         h.n += copy(h.buf[h.n:], s)
+       return size, nil
  }
  
-// Seed returns the seed value specified in the most recent call to
-// SetSeed, or the initial seed if SetSeed was never called.
+// Seed returns h's seed value.
  func (h *Hash) Seed() Seed {
+       h.initSeed()
         return h.seed
  }
  
-// SetSeed sets the seed used by h. Two Hash objects with the same
-// seed in the same process will behave identically.  Two Hash objects
-// with different seeds will very likely behave differently.  Any
-// bytes added to h previous to this call will be discarded.
+// SetSeed sets h to use seed, which must have been returned by MakeSeed
+// or by another Hash's Seed method.
+// Two Hash objects with the same seed behave identically.
+// Two Hash objects with different seeds will very likely behave differently.
+// Any bytes added to h before this call will be discarded.
  func (h *Hash) SetSeed(seed Seed) {
+       if seed.s == 0 {
+               panic("maphash: use of uninitialized Seed")
+       }
         h.seed = seed
         h.state = seed
         h.n = 0
@@ -100,6 +132,7 @@ func (h *Hash) SetSeed(seed Seed) {
  // Reset discards all bytes added to h.
  // (The seed remains the same.)
  func (h *Hash) Reset() {
+       h.initSeed()
         h.state = h.seed
         h.n = 0
  }
@@ -107,36 +140,38 @@ func (h *Hash) Reset() {
  // precondition: buffer is full.
  func (h *Hash) flush() {
         if h.n != len(h.buf) {
-               panic("flush of partially full buffer")
+               panic("maphash: flush of partially full buffer")
         }
+       h.initSeed()
         h.state.s = rthash(h.buf[:], h.state.s)
         h.n = 0
  }
  
-// Hash returns a value which depends on h's seed and the sequence of
-// bytes added to h (since the last call to Reset or SetSeed).
-func (h *Hash) Hash() uint64 {
+// Sum64 returns h's current 64-bit value, which depends on
+// h's seed and the sequence of bytes added to h since the
+// last call to Reset or SetSeed.
+//
+// All bits of the Sum64 result are close to uniformly and
+// independently distributed, so it can be safely reduced
+// by using bit masking, shifting, or modular arithmetic.
+func (h *Hash) Sum64() uint64 {
+       h.initSeed()
         return rthash(h.buf[:h.n], h.state.s)
  }
  
-// MakeSeed returns a Seed initialized using the bits in s.
-// Two seeds generated with the same s are guaranteed to be equal.
-// Two seeds generated with different s are very likely to be different.
-// TODO: disallow this? See Alan's comment in the issue.
-func MakeSeed(s uint64) Seed {
-       return Seed{s: s}
-}
-
-// New returns a new Hash object. Different hash objects allocated by
-// this function will very likely have different seeds.
-func New() *Hash {
-       s1 := uint64(runtime_fastrand())
-       s2 := uint64(runtime_fastrand())
-       seed := Seed{s: s1<<32 + s2}
-       return &Hash{
-               seed:  seed,
-               state: seed,
+// MakeSeed returns a new random seed.
+func MakeSeed() Seed {
+       var s1, s2 uint64
+       for {
+               s1 = uint64(runtime_fastrand())
+               s2 = uint64(runtime_fastrand())
+               // We use seed 0 to indicate an uninitialized seed/hash,
+               // so keep trying until we get a non-zero seed.
+               if s1|s2 != 0 {
+                       break
+               }
         }
+       return Seed{s: s1<<32 + s2}
  }
  
  //go:linkname runtime_fastrand runtime.fastrand
@@ -154,22 +189,17 @@ func rthash(b []byte, seed uint64) uint64 {
         }
         lo := runtime_memhash(unsafe.Pointer(&b[0]), uintptr(seed), uintptr(len(b)))
         hi := runtime_memhash(unsafe.Pointer(&b[0]), uintptr(seed>>32), uintptr(len(b)))
-       // TODO: mix lo/hi? Get 64 bits some other way?
         return uint64(hi)<<32 | uint64(lo)
  }
  
  //go:linkname runtime_memhash runtime.memhash
  func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr
  
-// Wrapper functions so that a hash/maphash.Hash implements
-// the hash.Hash and hash.Hash64 interfaces.
-
-func (h *Hash) Write(b []byte) (int, error) {
-       h.AddBytes(b)
-       return len(b), nil
-}
+// Sum appends the hash's current 64-bit value to b.
+// It exists for implementing hash.Hash.
+// For direct calls, it is more efficient to use Sum64.
  func (h *Hash) Sum(b []byte) []byte {
-       x := h.Hash()
+       x := h.Sum64()
         return append(b,
                 byte(x>>0),
                 byte(x>>8),
@@ -180,8 +210,9 @@ func (h *Hash) Sum(b []byte) []byte {
                 byte(x>>48),
                 byte(x>>56))
  }
-func (h *Hash) Sum64() uint64 {
-       return h.Hash()
-}
-func (h *Hash) Size() int      { return 8 }
+
+// Size returns h's hash value size, 8 bytes.
+func (h *Hash) Size() int { return 8 }
+
+// BlockSize returns h's block size.
  func (h *Hash) BlockSize() int { return len(h.buf) }
diff --git a/src/hash/maphash/maphash_test.go b/src/hash/maphash/maphash_test.go

index f9f631212bce6b9e2df14085e0ecfa124a974cf9..31d84a3b50a74fd0f4802a86d595be3279d0e29a 100644 (file)
--- a/src/hash/maphash/maphash_test.go
+++ b/src/hash/maphash/maphash_test.go
@@ -2,19 +2,18 @@
  // Use of this source code is governed by a BSD-style
  // license that can be found in the LICENSE file.
  
-package maphash_test
+package maphash
  
  import (
         "hash"
-       "hash/maphash"
         "testing"
  )
  
  func TestUnseededHash(t *testing.T) {
         m := map[uint64]struct{}{}
         for i := 0; i < 1000; i++ {
-               h := maphash.New()
-               m[h.Hash()] = struct{}{}
+               h := new(Hash)
+               m[h.Sum64()] = struct{}{}
         }
         if len(m) < 900 {
                 t.Errorf("empty hash not sufficiently random: got %d, want 1000", len(m))
@@ -22,12 +21,12 @@ func TestUnseededHash(t *testing.T) {
  }
  
  func TestSeededHash(t *testing.T) {
-       s := maphash.MakeSeed(1234)
+       s := MakeSeed()
         m := map[uint64]struct{}{}
         for i := 0; i < 1000; i++ {
-               h := maphash.New()
+               h := new(Hash)
                 h.SetSeed(s)
-               m[h.Hash()] = struct{}{}
+               m[h.Sum64()] = struct{}{}
         }
         if len(m) != 1 {
                 t.Errorf("seeded hash is random: got %d, want 1", len(m))
@@ -36,14 +35,17 @@ func TestSeededHash(t *testing.T) {
  
  func TestHashGrouping(t *testing.T) {
         b := []byte("foo")
-       h1 := maphash.New()
-       h2 := maphash.New()
+       h1 := new(Hash)
+       h2 := new(Hash)
         h2.SetSeed(h1.Seed())
-       h1.AddBytes(b)
+       h1.Write(b)
         for _, x := range b {
-               h2.AddByte(x)
+               err := h2.WriteByte(x)
+               if err != nil {
+                       t.Fatalf("WriteByte: %v", err)
+               }
         }
-       if h1.Hash() != h2.Hash() {
+       if h1.Sum64() != h2.Sum64() {
                 t.Errorf("hash of \"foo\" and \"f\",\"o\",\"o\" not identical")
         }
  }
@@ -51,13 +53,19 @@ func TestHashGrouping(t *testing.T) {
  func TestHashBytesVsString(t *testing.T) {
         s := "foo"
         b := []byte(s)
-       h1 := maphash.New()
-       h2 := maphash.New()
+       h1 := new(Hash)
+       h2 := new(Hash)
         h2.SetSeed(h1.Seed())
-       h1.AddString(s)
-       h2.AddBytes(b)
-       if h1.Hash() != h2.Hash() {
-               t.Errorf("hash of string and byts not identical")
+       n1, err1 := h1.WriteString(s)
+       if n1 != len(s) || err1 != nil {
+               t.Fatalf("WriteString(s) = %d, %v, want %d, nil", n1, err1, len(s))
+       }
+       n2, err2 := h2.Write(b)
+       if n2 != len(b) || err2 != nil {
+               t.Fatalf("Write(b) = %d, %v, want %d, nil", n2, err2, len(b))
+       }
+       if h1.Sum64() != h2.Sum64() {
+               t.Errorf("hash of string and bytes not identical")
         }
  }
  
@@ -66,9 +74,9 @@ func TestHashHighBytes(t *testing.T) {
         const N = 10
         m := map[uint64]struct{}{}
         for i := 0; i < N; i++ {
-               h := maphash.New()
-               h.AddString("foo")
-               m[h.Hash()>>32] = struct{}{}
+               h := new(Hash)
+               h.WriteString("foo")
+               m[h.Sum64()>>32] = struct{}{}
         }
         if len(m) < N/2 {
                 t.Errorf("from %d seeds, wanted at least %d different hashes; got %d", N, N/2, len(m))
@@ -76,5 +84,5 @@ func TestHashHighBytes(t *testing.T) {
  }
  
  // Make sure a Hash implements the hash.Hash and hash.Hash64 interfaces.
-var _ hash.Hash = &maphash.Hash{}
-var _ hash.Hash64 = &maphash.Hash{}
+var _ hash.Hash = &Hash{}
+var _ hash.Hash64 = &Hash{}
diff --git a/src/hash/maphash/smhasher_test.go b/src/hash/maphash/smhasher_test.go

index 4ac3d589769c3f8d774feaf289c7a733fdbb6a0e..6e6f2983a21f4295798baed4c2d9cf9419f91a98 100644 (file)
--- a/src/hash/maphash/smhasher_test.go
+++ b/src/hash/maphash/smhasher_test.go
@@ -2,11 +2,10 @@
  // Use of this source code is governed by a BSD-style
  // license that can be found in the LICENSE file.
  
-package maphash_test
+package maphash
  
  import (
         "fmt"
-       "hash/maphash"
         "math"
         "math/rand"
         "runtime"
@@ -19,6 +18,8 @@ import (
  // https://code.google.com/p/smhasher/
  // This code is a port of some of the Smhasher tests to Go.
  
+var fixedSeed = MakeSeed()
+
  // Sanity checks.
  // hash should not depend on values outside key.
  // hash should not depend on alignment.
@@ -36,7 +37,7 @@ func TestSmhasherSanity(t *testing.T) {
                                 randBytes(r, b[:])
                                 randBytes(r, c[:])
                                 copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n])
-                               if bytesHash(b[PAD:PAD+n], 0) != bytesHash(c[PAD+i:PAD+i+n], 0) {
+                               if bytesHash(b[PAD:PAD+n]) != bytesHash(c[PAD+i:PAD+i+n]) {
                                         t.Errorf("hash depends on bytes outside key")
                                 }
                         }
@@ -44,17 +45,17 @@ func TestSmhasherSanity(t *testing.T) {
         }
  }
  
-func bytesHash(b []byte, seed uint64) uint64 {
-       h := maphash.New()
-       h.SetSeed(maphash.MakeSeed(seed))
-       h.AddBytes(b)
-       return h.Hash()
+func bytesHash(b []byte) uint64 {
+       var h Hash
+       h.SetSeed(fixedSeed)
+       h.Write(b)
+       return h.Sum64()
  }
-func stringHash(s string, seed uint64) uint64 {
-       h := maphash.New()
-       h.SetSeed(maphash.MakeSeed(seed))
-       h.AddString(s)
-       return h.Hash()
+func stringHash(s string) uint64 {
+       var h Hash
+       h.SetSeed(fixedSeed)
+       h.WriteString(s)
+       return h.Sum64()
  }
  
  const hashSize = 64
@@ -77,13 +78,16 @@ func (s *hashSet) add(h uint64) {
         s.n++
  }
  func (s *hashSet) addS(x string) {
-       s.add(stringHash(x, 0))
+       s.add(stringHash(x))
  }
  func (s *hashSet) addB(x []byte) {
-       s.add(bytesHash(x, 0))
+       s.add(bytesHash(x))
  }
-func (s *hashSet) addS_seed(x string, seed uint64) {
-       s.add(stringHash(x, seed))
+func (s *hashSet) addS_seed(x string, seed Seed) {
+       var h Hash
+       h.SetSeed(seed)
+       h.WriteString(x)
+       s.add(h.Sum64())
  }
  func (s *hashSet) check(t *testing.T) {
         const SLOP = 10.0
@@ -312,7 +316,7 @@ func (k *bytesKey) flipBit(i int) {
         k.b[i>>3] ^= byte(1 << uint(i&7))
  }
  func (k *bytesKey) hash() uint64 {
-       return bytesHash(k.b, 0)
+       return bytesHash(k.b)
  }
  func (k *bytesKey) name() string {
         return fmt.Sprintf("bytes%d", len(k.b))
@@ -458,8 +462,8 @@ func TestSmhasherSeed(t *testing.T) {
         const N = 100000
         s := "hello"
         for i := 0; i < N; i++ {
-               h.addS_seed(s, uint64(i))
-               h.addS_seed(s, uint64(i)<<32) // make sure high bits are used
+               h.addS_seed(s, Seed{s: uint64(i + 1)})
+               h.addS_seed(s, Seed{s: uint64(i+1) << 32}) // make sure high bits are used
         }
         h.check(t)
  }
author	Russ Cox <rsc@golang.org>
	Mon, 4 Nov 2019 18:29:29 +0000 (13:29 -0500)
committer	Russ Cox <rsc@golang.org>
	Mon, 4 Nov 2019 21:30:29 +0000 (21:30 +0000)
src/hash/maphash/maphash.go		patch \| blob \| history
src/hash/maphash/maphash_test.go		patch \| blob \| history
src/hash/maphash/smhasher_test.go		patch \| blob \| history