--- /dev/null
+// run
+
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "iter"
+ "math/rand"
+ "os"
+ "strings"
+ "unicode"
+)
+
+// WordReader is the struct that implements io.Reader
+type WordReader struct {
+ scanner *bufio.Scanner
+}
+
+// NewWordReader creates a new WordReader from an io.Reader
+func NewWordReader(r io.Reader) *WordReader {
+ scanner := bufio.NewScanner(r)
+ scanner.Split(bufio.ScanWords)
+ return &WordReader{
+ scanner: scanner,
+ }
+}
+
+// Read reads data from the input stream and returns a single lowercase word at a time
+func (wr *WordReader) Read(p []byte) (n int, err error) {
+ if !wr.scanner.Scan() {
+ if err := wr.scanner.Err(); err != nil {
+ return 0, err
+ }
+ return 0, io.EOF
+ }
+ word := wr.scanner.Text()
+ cleanedWord := removeNonAlphabetic(word)
+ if len(cleanedWord) == 0 {
+ return wr.Read(p)
+ }
+ n = copy(p, []byte(cleanedWord))
+ return n, nil
+}
+
+// All returns an iterator allowing the caller to iterate over the WordReader using for/range.
+func (wr *WordReader) All() iter.Seq[string] {
+ word := make([]byte, 1024)
+ return func(yield func(string) bool) {
+ var err error
+ var n int
+ for n, err = wr.Read(word); err == nil; n, err = wr.Read(word) {
+ if !yield(string(word[:n])) {
+ return
+ }
+ }
+ if err != io.EOF {
+ fmt.Fprintf(os.Stderr, "error reading word: %v\n", err)
+ }
+ }
+}
+
+// removeNonAlphabetic removes non-alphabetic characters from a word using strings.Map
+func removeNonAlphabetic(word string) string {
+ return strings.Map(func(r rune) rune {
+ if unicode.IsLetter(r) {
+ return unicode.ToLower(r)
+ }
+ return -1
+ }, word)
+}
+
+// ProbabilisticSkipper determines if an item should be retained with probability 1/(1<<n)
+type ProbabilisticSkipper struct {
+ n int
+ counter uint64
+ bitmask uint64
+}
+
+// NewProbabilisticSkipper initializes the ProbabilisticSkipper
+func NewProbabilisticSkipper(n int) *ProbabilisticSkipper {
+ pr := &ProbabilisticSkipper{n: n}
+ pr.refreshCounter()
+ return pr
+}
+
+// check panics if pr.n is not the expected value
+func (pr *ProbabilisticSkipper) check(n int) {
+ if pr.n != n {
+ panic(fmt.Sprintf("check: pr.n != n %d != %d", pr.n, n))
+ }
+}
+
+// refreshCounter refreshes the counter with a new random value
+func (pr *ProbabilisticSkipper) refreshCounter() {
+ if pr.n == 0 {
+ pr.bitmask = ^uint64(0) // All bits set to 1
+ } else {
+ pr.bitmask = rand.Uint64()
+ for i := 0; i < pr.n-1; i++ {
+ pr.bitmask &= rand.Uint64()
+ }
+ }
+ pr.counter = 64
+}
+
+// ShouldSkip returns true with probability 1/(1<<n)
+func (pr *ProbabilisticSkipper) ShouldSkip() bool {
+ remove := pr.bitmask&1 == 0
+ pr.bitmask >>= 1
+ pr.counter--
+ if pr.counter == 0 {
+ pr.refreshCounter()
+ }
+ return remove
+}
+
+// EstimateUniqueWordsIter estimates the number of unique words using a probabilistic counting method
+func EstimateUniqueWordsIter(reader io.Reader, memorySize int) int {
+ wordReader := NewWordReader(reader)
+ words := make(map[string]struct{}, memorySize)
+
+ rounds := 0
+ roundRemover := NewProbabilisticSkipper(1)
+ wordSkipper := NewProbabilisticSkipper(rounds)
+ wordSkipper.check(rounds)
+
+ for word := range wordReader.All() {
+ wordSkipper.check(rounds)
+ if wordSkipper.ShouldSkip() {
+ delete(words, word)
+ } else {
+ words[word] = struct{}{}
+
+ if len(words) >= memorySize {
+ rounds++
+
+ wordSkipper = NewProbabilisticSkipper(rounds)
+ for w := range words {
+ if roundRemover.ShouldSkip() {
+ delete(words, w)
+ }
+ }
+ }
+ }
+ wordSkipper.check(rounds)
+ }
+
+ if len(words) == 0 {
+ return 0
+ }
+
+ invProbability := 1 << rounds
+ estimatedUniqueWords := len(words) * invProbability
+ return estimatedUniqueWords
+}
+
+func main() {
+ input := "Hello, world! This is a test. Hello, world, hello!"
+ expectedUniqueWords := 6 // "hello", "world", "this", "is", "a", "test" (but "hello" and "world" are repeated)
+ memorySize := 6
+
+ reader := strings.NewReader(input)
+ estimatedUniqueWords := EstimateUniqueWordsIter(reader, memorySize)
+ if estimatedUniqueWords != expectedUniqueWords {
+ // ...
+ }
+}
--- /dev/null
+// run
+
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+func main() {
+ err := run()
+ if err != nil {
+ panic(err)
+ }
+}
+
+func run() error {
+ methods := "AB"
+
+ type node struct {
+ tag string
+ choices []string
+ }
+ all := []node{
+ {"000", permutations(methods)},
+ }
+
+ next := 1
+ for len(all) > 0 {
+ cur := all[0]
+ k := copy(all, all[1:])
+ all = all[:k]
+
+ if len(cur.choices) == 1 {
+ continue
+ }
+
+ var bestM map[byte][]string
+ bMax := len(cur.choices) + 1
+ bMin := -1
+ for sel := range selections(methods) {
+ m := make(map[byte][]string)
+ for _, order := range cur.choices {
+ x := findFirstMatch(order, sel)
+ m[x] = append(m[x], order)
+ }
+
+ min := len(cur.choices) + 1
+ max := -1
+ for _, v := range m {
+ if len(v) < min {
+ min = len(v)
+ }
+ if len(v) > max {
+ max = len(v)
+ }
+ }
+ if max < bMax || (max == bMax && min > bMin) {
+ bestM = m
+ bMin = min
+ bMax = max
+ }
+ }
+
+ if bMax == len(cur.choices) {
+ continue
+ }
+
+ cc := Keys(bestM)
+ for c := range cc {
+ choices := bestM[c]
+ next++
+
+ switch c {
+ case 'A':
+ case 'B':
+ default:
+ panic("unexpected selector type " + string(c))
+ }
+ all = append(all, node{"", choices})
+ }
+ }
+ return nil
+}
+
+func permutations(s string) []string {
+ if len(s) <= 1 {
+ return []string{s}
+ }
+
+ var result []string
+ for i, char := range s {
+ rest := s[:i] + s[i+1:]
+ for _, perm := range permutations(rest) {
+ result = append(result, string(char)+perm)
+ }
+ }
+ return result
+}
+
+type Seq[V any] func(yield func(V) bool)
+
+func selections(s string) Seq[string] {
+ return func(yield func(string) bool) {
+ for bits := 1; bits < 1<<len(s); bits++ {
+ var choice string
+ for j, char := range s {
+ if bits&(1<<j) != 0 {
+ choice += string(char)
+ }
+ }
+ if !yield(choice) {
+ break
+ }
+ }
+ }
+}
+
+func findFirstMatch(order, sel string) byte {
+ for _, c := range order {
+ return byte(c)
+ }
+ return 0
+}
+
+func Keys[Map ~map[K]V, K comparable, V any](m Map) Seq[K] {
+ return func(yield func(K) bool) {
+ for k := range m {
+ if !yield(k) {
+ return
+ }
+ }
+ }
+}