Source file src/hash/maphash/maphash.go

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package maphash provides hash functions on byte sequences.
     6  // These hash functions are intended to be used to implement hash tables or
     7  // other data structures that need to map arbitrary strings or byte
     8  // sequences to a uniform distribution on unsigned 64-bit integers.
     9  // Each different instance of a hash table or data structure should use its own [Seed].
    10  //
    11  // The hash functions are not cryptographically secure.
    12  // (See crypto/sha256 and crypto/sha512 for cryptographic use.)
    13  package maphash
    14  
    15  // A Seed is a random value that selects the specific hash function
    16  // computed by a [Hash]. If two Hashes use the same Seeds, they
    17  // will compute the same hash values for any given input.
    18  // If two Hashes use different Seeds, they are very likely to compute
    19  // distinct hash values for any given input.
    20  //
    21  // A Seed must be initialized by calling [MakeSeed].
    22  // The zero seed is uninitialized and not valid for use with [Hash]'s SetSeed method.
    23  //
    24  // Each Seed value is local to a single process and cannot be serialized
    25  // or otherwise recreated in a different process.
    26  type Seed struct {
    27  	s uint64
    28  }
    29  
    30  // Bytes returns the hash of b with the given seed.
    31  //
    32  // Bytes is equivalent to, but more convenient and efficient than:
    33  //
    34  //	var h Hash
    35  //	h.SetSeed(seed)
    36  //	h.Write(b)
    37  //	return h.Sum64()
    38  func Bytes(seed Seed, b []byte) uint64 {
    39  	state := seed.s
    40  	if state == 0 {
    41  		panic("maphash: use of uninitialized Seed")
    42  	}
    43  
    44  	if len(b) > bufSize {
    45  		b = b[:len(b):len(b)] // merge len and cap calculations when reslicing
    46  		for len(b) > bufSize {
    47  			state = rthash(b[:bufSize], state)
    48  			b = b[bufSize:]
    49  		}
    50  	}
    51  	return rthash(b, state)
    52  }
    53  
    54  // String returns the hash of s with the given seed.
    55  //
    56  // String is equivalent to, but more convenient and efficient than:
    57  //
    58  //	var h Hash
    59  //	h.SetSeed(seed)
    60  //	h.WriteString(s)
    61  //	return h.Sum64()
    62  func String(seed Seed, s string) uint64 {
    63  	state := seed.s
    64  	if state == 0 {
    65  		panic("maphash: use of uninitialized Seed")
    66  	}
    67  	for len(s) > bufSize {
    68  		state = rthashString(s[:bufSize], state)
    69  		s = s[bufSize:]
    70  	}
    71  	return rthashString(s, state)
    72  }
    73  
    74  // A Hash computes a seeded hash of a byte sequence.
    75  //
    76  // The zero Hash is a valid Hash ready to use.
    77  // A zero Hash chooses a random seed for itself during
    78  // the first call to a Reset, Write, Seed, or Sum64 method.
    79  // For control over the seed, use SetSeed.
    80  //
    81  // The computed hash values depend only on the initial seed and
    82  // the sequence of bytes provided to the Hash object, not on the way
    83  // in which the bytes are provided. For example, the three sequences
    84  //
    85  //	h.Write([]byte{'f','o','o'})
    86  //	h.WriteByte('f'); h.WriteByte('o'); h.WriteByte('o')
    87  //	h.WriteString("foo")
    88  //
    89  // all have the same effect.
    90  //
    91  // Hashes are intended to be collision-resistant, even for situations
    92  // where an adversary controls the byte sequences being hashed.
    93  //
    94  // A Hash is not safe for concurrent use by multiple goroutines, but a Seed is.
    95  // If multiple goroutines must compute the same seeded hash,
    96  // each can declare its own Hash and call SetSeed with a common Seed.
    97  type Hash struct {
    98  	_     [0]func()     // not comparable
    99  	seed  Seed          // initial seed used for this hash
   100  	state Seed          // current hash of all flushed bytes
   101  	buf   [bufSize]byte // unflushed byte buffer
   102  	n     int           // number of unflushed bytes
   103  }
   104  
   105  // bufSize is the size of the Hash write buffer.
   106  // The buffer ensures that writes depend only on the sequence of bytes,
   107  // not the sequence of WriteByte/Write/WriteString calls,
   108  // by always calling rthash with a full buffer (except for the tail).
   109  const bufSize = 128
   110  
   111  // initSeed seeds the hash if necessary.
   112  // initSeed is called lazily before any operation that actually uses h.seed/h.state.
   113  // Note that this does not include Write/WriteByte/WriteString in the case
   114  // where they only add to h.buf. (If they write too much, they call h.flush,
   115  // which does call h.initSeed.)
   116  func (h *Hash) initSeed() {
   117  	if h.seed.s == 0 {
   118  		seed := MakeSeed()
   119  		h.seed = seed
   120  		h.state = seed
   121  	}
   122  }
   123  
   124  // WriteByte adds b to the sequence of bytes hashed by h.
   125  // It never fails; the error result is for implementing [io.ByteWriter].
   126  func (h *Hash) WriteByte(b byte) error {
   127  	if h.n == len(h.buf) {
   128  		h.flush()
   129  	}
   130  	h.buf[h.n] = b
   131  	h.n++
   132  	return nil
   133  }
   134  
   135  // Write adds b to the sequence of bytes hashed by h.
   136  // It always writes all of b and never fails; the count and error result are for implementing [io.Writer].
   137  func (h *Hash) Write(b []byte) (int, error) {
   138  	size := len(b)
   139  	// Deal with bytes left over in h.buf.
   140  	// h.n <= bufSize is always true.
   141  	// Checking it is ~free and it lets the compiler eliminate a bounds check.
   142  	if h.n > 0 && h.n <= bufSize {
   143  		k := copy(h.buf[h.n:], b)
   144  		h.n += k
   145  		if h.n < bufSize {
   146  			// Copied the entirety of b to h.buf.
   147  			return size, nil
   148  		}
   149  		b = b[k:]
   150  		h.flush()
   151  		// No need to set h.n = 0 here; it happens just before exit.
   152  	}
   153  	// Process as many full buffers as possible, without copying, and calling initSeed only once.
   154  	if len(b) > bufSize {
   155  		h.initSeed()
   156  		for len(b) > bufSize {
   157  			h.state.s = rthash(b[:bufSize], h.state.s)
   158  			b = b[bufSize:]
   159  		}
   160  	}
   161  	// Copy the tail.
   162  	copy(h.buf[:], b)
   163  	h.n = len(b)
   164  	return size, nil
   165  }
   166  
   167  // WriteString adds the bytes of s to the sequence of bytes hashed by h.
   168  // It always writes all of s and never fails; the count and error result are for implementing [io.StringWriter].
   169  func (h *Hash) WriteString(s string) (int, error) {
   170  	// WriteString mirrors Write. See Write for comments.
   171  	size := len(s)
   172  	if h.n > 0 && h.n <= bufSize {
   173  		k := copy(h.buf[h.n:], s)
   174  		h.n += k
   175  		if h.n < bufSize {
   176  			return size, nil
   177  		}
   178  		s = s[k:]
   179  		h.flush()
   180  	}
   181  	if len(s) > bufSize {
   182  		h.initSeed()
   183  		for len(s) > bufSize {
   184  			h.state.s = rthashString(s[:bufSize], h.state.s)
   185  			s = s[bufSize:]
   186  		}
   187  	}
   188  	copy(h.buf[:], s)
   189  	h.n = len(s)
   190  	return size, nil
   191  }
   192  
   193  // Seed returns h's seed value.
   194  func (h *Hash) Seed() Seed {
   195  	h.initSeed()
   196  	return h.seed
   197  }
   198  
   199  // SetSeed sets h to use seed, which must have been returned by [MakeSeed]
   200  // or by another [Hash.Seed] method.
   201  // Two [Hash] objects with the same seed behave identically.
   202  // Two [Hash] objects with different seeds will very likely behave differently.
   203  // Any bytes added to h before this call will be discarded.
   204  func (h *Hash) SetSeed(seed Seed) {
   205  	if seed.s == 0 {
   206  		panic("maphash: use of uninitialized Seed")
   207  	}
   208  	h.seed = seed
   209  	h.state = seed
   210  	h.n = 0
   211  }
   212  
   213  // Reset discards all bytes added to h.
   214  // (The seed remains the same.)
   215  func (h *Hash) Reset() {
   216  	h.initSeed()
   217  	h.state = h.seed
   218  	h.n = 0
   219  }
   220  
   221  // precondition: buffer is full.
   222  func (h *Hash) flush() {
   223  	if h.n != len(h.buf) {
   224  		panic("maphash: flush of partially full buffer")
   225  	}
   226  	h.initSeed()
   227  	h.state.s = rthash(h.buf[:h.n], h.state.s)
   228  	h.n = 0
   229  }
   230  
   231  // Sum64 returns h's current 64-bit value, which depends on
   232  // h's seed and the sequence of bytes added to h since the
   233  // last call to [Hash.Reset] or [Hash.SetSeed].
   234  //
   235  // All bits of the Sum64 result are close to uniformly and
   236  // independently distributed, so it can be safely reduced
   237  // by using bit masking, shifting, or modular arithmetic.
   238  func (h *Hash) Sum64() uint64 {
   239  	h.initSeed()
   240  	return rthash(h.buf[:h.n], h.state.s)
   241  }
   242  
   243  // MakeSeed returns a new random seed.
   244  func MakeSeed() Seed {
   245  	var s uint64
   246  	for {
   247  		s = randUint64()
   248  		// We use seed 0 to indicate an uninitialized seed/hash,
   249  		// so keep trying until we get a non-zero seed.
   250  		if s != 0 {
   251  			break
   252  		}
   253  	}
   254  	return Seed{s: s}
   255  }
   256  
   257  // Sum appends the hash's current 64-bit value to b.
   258  // It exists for implementing [hash.Hash].
   259  // For direct calls, it is more efficient to use [Hash.Sum64].
   260  func (h *Hash) Sum(b []byte) []byte {
   261  	x := h.Sum64()
   262  	return append(b,
   263  		byte(x>>0),
   264  		byte(x>>8),
   265  		byte(x>>16),
   266  		byte(x>>24),
   267  		byte(x>>32),
   268  		byte(x>>40),
   269  		byte(x>>48),
   270  		byte(x>>56))
   271  }
   272  
   273  // Size returns h's hash value size, 8 bytes.
   274  func (h *Hash) Size() int { return 8 }
   275  
   276  // BlockSize returns h's block size.
   277  func (h *Hash) BlockSize() int { return len(h.buf) }
   278  

View as plain text