// run // Copyright 2024 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package main import ( "bufio" "fmt" "io" "iter" "math/rand" "os" "strings" "unicode" ) // WordReader is the struct that implements io.Reader type WordReader struct { scanner *bufio.Scanner } // NewWordReader creates a new WordReader from an io.Reader func NewWordReader(r io.Reader) *WordReader { scanner := bufio.NewScanner(r) scanner.Split(bufio.ScanWords) return &WordReader{ scanner: scanner, } } // Read reads data from the input stream and returns a single lowercase word at a time func (wr *WordReader) Read(p []byte) (n int, err error) { if !wr.scanner.Scan() { if err := wr.scanner.Err(); err != nil { return 0, err } return 0, io.EOF } word := wr.scanner.Text() cleanedWord := removeNonAlphabetic(word) if len(cleanedWord) == 0 { return wr.Read(p) } n = copy(p, []byte(cleanedWord)) return n, nil } // All returns an iterator allowing the caller to iterate over the WordReader using for/range. func (wr *WordReader) All() iter.Seq[string] { word := make([]byte, 1024) return func(yield func(string) bool) { var err error var n int for n, err = wr.Read(word); err == nil; n, err = wr.Read(word) { if !yield(string(word[:n])) { return } } if err != io.EOF { fmt.Fprintf(os.Stderr, "error reading word: %v\n", err) } } } // removeNonAlphabetic removes non-alphabetic characters from a word using strings.Map func removeNonAlphabetic(word string) string { return strings.Map(func(r rune) rune { if unicode.IsLetter(r) { return unicode.ToLower(r) } return -1 }, word) } // ProbabilisticSkipper determines if an item should be retained with probability 1/(1<>= 1 pr.counter-- if pr.counter == 0 { pr.refreshCounter() } return remove } // EstimateUniqueWordsIter estimates the number of unique words using a probabilistic counting method func EstimateUniqueWordsIter(reader io.Reader, memorySize int) int { wordReader := NewWordReader(reader) words := make(map[string]struct{}, memorySize) rounds := 0 roundRemover := NewProbabilisticSkipper(1) wordSkipper := NewProbabilisticSkipper(rounds) wordSkipper.check(rounds) for word := range wordReader.All() { wordSkipper.check(rounds) if wordSkipper.ShouldSkip() { delete(words, word) } else { words[word] = struct{}{} if len(words) >= memorySize { rounds++ wordSkipper = NewProbabilisticSkipper(rounds) for w := range words { if roundRemover.ShouldSkip() { delete(words, w) } } } } wordSkipper.check(rounds) } if len(words) == 0 { return 0 } invProbability := 1 << rounds estimatedUniqueWords := len(words) * invProbability return estimatedUniqueWords } func main() { input := "Hello, world! This is a test. Hello, world, hello!" expectedUniqueWords := 6 // "hello", "world", "this", "is", "a", "test" (but "hello" and "world" are repeated) memorySize := 6 reader := strings.NewReader(input) estimatedUniqueWords := EstimateUniqueWordsIter(reader, memorySize) if estimatedUniqueWords != expectedUniqueWords { // ... } }