Source file src/hash/maphash/smhasher_test.go

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !race
     6  
     7  package maphash
     8  
     9  import (
    10  	"fmt"
    11  	"internal/testenv"
    12  	"math"
    13  	"math/rand"
    14  	"runtime"
    15  	"slices"
    16  	"strings"
    17  	"testing"
    18  	"unsafe"
    19  )
    20  
    21  // Smhasher is a torture test for hash functions.
    22  // https://code.google.com/p/smhasher/
    23  // This code is a port of some of the Smhasher tests to Go.
    24  
    25  // Note: due to the long running time of these tests, they are
    26  // currently disabled in -race mode.
    27  
    28  var fixedSeed = MakeSeed()
    29  
    30  // Sanity checks.
    31  // hash should not depend on values outside key.
    32  // hash should not depend on alignment.
    33  func TestSmhasherSanity(t *testing.T) {
    34  	t.Parallel()
    35  	r := rand.New(rand.NewSource(1234))
    36  	const REP = 10
    37  	const KEYMAX = 128
    38  	const PAD = 16
    39  	const OFFMAX = 16
    40  	for k := 0; k < REP; k++ {
    41  		for n := 0; n < KEYMAX; n++ {
    42  			for i := 0; i < OFFMAX; i++ {
    43  				var b [KEYMAX + OFFMAX + 2*PAD]byte
    44  				var c [KEYMAX + OFFMAX + 2*PAD]byte
    45  				randBytes(r, b[:])
    46  				randBytes(r, c[:])
    47  				copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n])
    48  				if bytesHash(b[PAD:PAD+n]) != bytesHash(c[PAD+i:PAD+i+n]) {
    49  					t.Errorf("hash depends on bytes outside key")
    50  				}
    51  			}
    52  		}
    53  	}
    54  }
    55  
    56  func bytesHash(b []byte) uint64 {
    57  	var h Hash
    58  	h.SetSeed(fixedSeed)
    59  	h.Write(b)
    60  	return h.Sum64()
    61  }
    62  func stringHash(s string) uint64 {
    63  	var h Hash
    64  	h.SetSeed(fixedSeed)
    65  	h.WriteString(s)
    66  	return h.Sum64()
    67  }
    68  
    69  const hashSize = 64
    70  
    71  func randBytes(r *rand.Rand, b []byte) {
    72  	r.Read(b) // can't fail
    73  }
    74  
    75  // A hashSet measures the frequency of hash collisions.
    76  type hashSet struct {
    77  	list []uint64 // list of hashes added
    78  }
    79  
    80  func newHashSet() *hashSet {
    81  	return &hashSet{list: make([]uint64, 0, 1024)}
    82  }
    83  func (s *hashSet) add(h uint64) {
    84  	s.list = append(s.list, h)
    85  }
    86  func (s *hashSet) addS(x string) {
    87  	s.add(stringHash(x))
    88  }
    89  func (s *hashSet) addB(x []byte) {
    90  	s.add(bytesHash(x))
    91  }
    92  func (s *hashSet) addS_seed(x string, seed Seed) {
    93  	var h Hash
    94  	h.SetSeed(seed)
    95  	h.WriteString(x)
    96  	s.add(h.Sum64())
    97  }
    98  func (s *hashSet) check(t *testing.T) {
    99  	t.Helper()
   100  	list := s.list
   101  	slices.Sort(list)
   102  
   103  	collisions := 0
   104  	for i := 1; i < len(list); i++ {
   105  		if list[i] == list[i-1] {
   106  			collisions++
   107  		}
   108  	}
   109  	n := len(list)
   110  
   111  	const SLOP = 10.0
   112  	pairs := int64(n) * int64(n-1) / 2
   113  	expected := float64(pairs) / math.Pow(2.0, float64(hashSize))
   114  	stddev := math.Sqrt(expected)
   115  	if float64(collisions) > expected+SLOP*(3*stddev+1) {
   116  		t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev)
   117  	}
   118  	// Reset for reuse
   119  	s.list = s.list[:0]
   120  }
   121  
   122  // a string plus adding zeros must make distinct hashes
   123  func TestSmhasherAppendedZeros(t *testing.T) {
   124  	t.Parallel()
   125  	s := "hello" + strings.Repeat("\x00", 256)
   126  	h := newHashSet()
   127  	for i := 0; i <= len(s); i++ {
   128  		h.addS(s[:i])
   129  	}
   130  	h.check(t)
   131  }
   132  
   133  // All 0-3 byte strings have distinct hashes.
   134  func TestSmhasherSmallKeys(t *testing.T) {
   135  	testenv.ParallelOn64Bit(t)
   136  	h := newHashSet()
   137  	var b [3]byte
   138  	for i := 0; i < 256; i++ {
   139  		b[0] = byte(i)
   140  		h.addB(b[:1])
   141  		for j := 0; j < 256; j++ {
   142  			b[1] = byte(j)
   143  			h.addB(b[:2])
   144  			if !testing.Short() {
   145  				for k := 0; k < 256; k++ {
   146  					b[2] = byte(k)
   147  					h.addB(b[:3])
   148  				}
   149  			}
   150  		}
   151  	}
   152  	h.check(t)
   153  }
   154  
   155  // Different length strings of all zeros have distinct hashes.
   156  func TestSmhasherZeros(t *testing.T) {
   157  	t.Parallel()
   158  	N := 256 * 1024
   159  	if testing.Short() {
   160  		N = 1024
   161  	}
   162  	h := newHashSet()
   163  	b := make([]byte, N)
   164  	for i := 0; i <= N; i++ {
   165  		h.addB(b[:i])
   166  	}
   167  	h.check(t)
   168  }
   169  
   170  // Strings with up to two nonzero bytes all have distinct hashes.
   171  func TestSmhasherTwoNonzero(t *testing.T) {
   172  	if runtime.GOARCH == "wasm" {
   173  		t.Skip("Too slow on wasm")
   174  	}
   175  	if testing.Short() {
   176  		t.Skip("Skipping in short mode")
   177  	}
   178  	testenv.ParallelOn64Bit(t)
   179  	h := newHashSet()
   180  	for n := 2; n <= 16; n++ {
   181  		twoNonZero(h, n)
   182  	}
   183  	h.check(t)
   184  }
   185  func twoNonZero(h *hashSet, n int) {
   186  	b := make([]byte, n)
   187  
   188  	// all zero
   189  	h.addB(b)
   190  
   191  	// one non-zero byte
   192  	for i := 0; i < n; i++ {
   193  		for x := 1; x < 256; x++ {
   194  			b[i] = byte(x)
   195  			h.addB(b)
   196  			b[i] = 0
   197  		}
   198  	}
   199  
   200  	// two non-zero bytes
   201  	for i := 0; i < n; i++ {
   202  		for x := 1; x < 256; x++ {
   203  			b[i] = byte(x)
   204  			for j := i + 1; j < n; j++ {
   205  				for y := 1; y < 256; y++ {
   206  					b[j] = byte(y)
   207  					h.addB(b)
   208  					b[j] = 0
   209  				}
   210  			}
   211  			b[i] = 0
   212  		}
   213  	}
   214  }
   215  
   216  // Test strings with repeats, like "abcdabcdabcdabcd..."
   217  func TestSmhasherCyclic(t *testing.T) {
   218  	if testing.Short() {
   219  		t.Skip("Skipping in short mode")
   220  	}
   221  	t.Parallel()
   222  	r := rand.New(rand.NewSource(1234))
   223  	const REPEAT = 8
   224  	const N = 1000000
   225  	h := newHashSet()
   226  	for n := 4; n <= 12; n++ {
   227  		b := make([]byte, REPEAT*n)
   228  		for i := 0; i < N; i++ {
   229  			b[0] = byte(i * 79 % 97)
   230  			b[1] = byte(i * 43 % 137)
   231  			b[2] = byte(i * 151 % 197)
   232  			b[3] = byte(i * 199 % 251)
   233  			randBytes(r, b[4:n])
   234  			for j := n; j < n*REPEAT; j++ {
   235  				b[j] = b[j-n]
   236  			}
   237  			h.addB(b)
   238  		}
   239  		h.check(t)
   240  	}
   241  }
   242  
   243  // Test strings with only a few bits set
   244  func TestSmhasherSparse(t *testing.T) {
   245  	if runtime.GOARCH == "wasm" {
   246  		t.Skip("Too slow on wasm")
   247  	}
   248  	if testing.Short() {
   249  		t.Skip("Skipping in short mode")
   250  	}
   251  	t.Parallel()
   252  	h := newHashSet()
   253  	sparse(t, h, 32, 6)
   254  	sparse(t, h, 40, 6)
   255  	sparse(t, h, 48, 5)
   256  	sparse(t, h, 56, 5)
   257  	sparse(t, h, 64, 5)
   258  	sparse(t, h, 96, 4)
   259  	sparse(t, h, 256, 3)
   260  	sparse(t, h, 2048, 2)
   261  }
   262  func sparse(t *testing.T, h *hashSet, n int, k int) {
   263  	t.Helper()
   264  	b := make([]byte, n/8)
   265  	setbits(h, b, 0, k)
   266  	h.check(t)
   267  }
   268  
   269  // set up to k bits at index i and greater
   270  func setbits(h *hashSet, b []byte, i int, k int) {
   271  	h.addB(b)
   272  	if k == 0 {
   273  		return
   274  	}
   275  	for j := i; j < len(b)*8; j++ {
   276  		b[j/8] |= byte(1 << uint(j&7))
   277  		setbits(h, b, j+1, k-1)
   278  		b[j/8] &= byte(^(1 << uint(j&7)))
   279  	}
   280  }
   281  
   282  // Test all possible combinations of n blocks from the set s.
   283  // "permutation" is a bad name here, but it is what Smhasher uses.
   284  func TestSmhasherPermutation(t *testing.T) {
   285  	if runtime.GOARCH == "wasm" {
   286  		t.Skip("Too slow on wasm")
   287  	}
   288  	if testing.Short() {
   289  		t.Skip("Skipping in short mode")
   290  	}
   291  	testenv.ParallelOn64Bit(t)
   292  	h := newHashSet()
   293  	permutation(t, h, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8)
   294  	permutation(t, h, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8)
   295  	permutation(t, h, []uint32{0, 1}, 20)
   296  	permutation(t, h, []uint32{0, 1 << 31}, 20)
   297  	permutation(t, h, []uint32{0, 1, 2, 3, 4, 5, 6, 7, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 6)
   298  }
   299  func permutation(t *testing.T, h *hashSet, s []uint32, n int) {
   300  	t.Helper()
   301  	b := make([]byte, n*4)
   302  	genPerm(h, b, s, 0)
   303  	h.check(t)
   304  }
   305  func genPerm(h *hashSet, b []byte, s []uint32, n int) {
   306  	h.addB(b[:n])
   307  	if n == len(b) {
   308  		return
   309  	}
   310  	for _, v := range s {
   311  		b[n] = byte(v)
   312  		b[n+1] = byte(v >> 8)
   313  		b[n+2] = byte(v >> 16)
   314  		b[n+3] = byte(v >> 24)
   315  		genPerm(h, b, s, n+4)
   316  	}
   317  }
   318  
   319  type key interface {
   320  	clear()              // set bits all to 0
   321  	random(r *rand.Rand) // set key to something random
   322  	bits() int           // how many bits key has
   323  	flipBit(i int)       // flip bit i of the key
   324  	hash() uint64        // hash the key
   325  	name() string        // for error reporting
   326  }
   327  
   328  type bytesKey struct {
   329  	b []byte
   330  }
   331  
   332  func (k *bytesKey) clear() {
   333  	clear(k.b)
   334  }
   335  func (k *bytesKey) random(r *rand.Rand) {
   336  	randBytes(r, k.b)
   337  }
   338  func (k *bytesKey) bits() int {
   339  	return len(k.b) * 8
   340  }
   341  func (k *bytesKey) flipBit(i int) {
   342  	k.b[i>>3] ^= byte(1 << uint(i&7))
   343  }
   344  func (k *bytesKey) hash() uint64 {
   345  	return bytesHash(k.b)
   346  }
   347  func (k *bytesKey) name() string {
   348  	return fmt.Sprintf("bytes%d", len(k.b))
   349  }
   350  
   351  // Flipping a single bit of a key should flip each output bit with 50% probability.
   352  func TestSmhasherAvalanche(t *testing.T) {
   353  	if runtime.GOARCH == "wasm" {
   354  		t.Skip("Too slow on wasm")
   355  	}
   356  	if testing.Short() {
   357  		t.Skip("Skipping in short mode")
   358  	}
   359  	t.Parallel()
   360  	avalancheTest1(t, &bytesKey{make([]byte, 2)})
   361  	avalancheTest1(t, &bytesKey{make([]byte, 4)})
   362  	avalancheTest1(t, &bytesKey{make([]byte, 8)})
   363  	avalancheTest1(t, &bytesKey{make([]byte, 16)})
   364  	avalancheTest1(t, &bytesKey{make([]byte, 32)})
   365  	avalancheTest1(t, &bytesKey{make([]byte, 200)})
   366  }
   367  func avalancheTest1(t *testing.T, k key) {
   368  	t.Helper()
   369  	const REP = 100000
   370  	r := rand.New(rand.NewSource(1234))
   371  	n := k.bits()
   372  
   373  	// grid[i][j] is a count of whether flipping
   374  	// input bit i affects output bit j.
   375  	grid := make([][hashSize]int, n)
   376  
   377  	for z := 0; z < REP; z++ {
   378  		// pick a random key, hash it
   379  		k.random(r)
   380  		h := k.hash()
   381  
   382  		// flip each bit, hash & compare the results
   383  		for i := 0; i < n; i++ {
   384  			k.flipBit(i)
   385  			d := h ^ k.hash()
   386  			k.flipBit(i)
   387  
   388  			// record the effects of that bit flip
   389  			g := &grid[i]
   390  			for j := 0; j < hashSize; j++ {
   391  				g[j] += int(d & 1)
   392  				d >>= 1
   393  			}
   394  		}
   395  	}
   396  
   397  	// Each entry in the grid should be about REP/2.
   398  	// More precisely, we did N = k.bits() * hashSize experiments where
   399  	// each is the sum of REP coin flips. We want to find bounds on the
   400  	// sum of coin flips such that a truly random experiment would have
   401  	// all sums inside those bounds with 99% probability.
   402  	N := n * hashSize
   403  	var c float64
   404  	// find c such that Prob(mean-c*stddev < x < mean+c*stddev)^N > .9999
   405  	for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 {
   406  	}
   407  	c *= 11.0 // allowed slack: 40% to 60% - we don't need to be perfectly random
   408  	mean := .5 * REP
   409  	stddev := .5 * math.Sqrt(REP)
   410  	low := int(mean - c*stddev)
   411  	high := int(mean + c*stddev)
   412  	for i := 0; i < n; i++ {
   413  		for j := 0; j < hashSize; j++ {
   414  			x := grid[i][j]
   415  			if x < low || x > high {
   416  				t.Errorf("bad bias for %s bit %d -> bit %d: %d/%d\n", k.name(), i, j, x, REP)
   417  			}
   418  		}
   419  	}
   420  }
   421  
   422  // All bit rotations of a set of distinct keys
   423  func TestSmhasherWindowed(t *testing.T) {
   424  	t.Parallel()
   425  	windowed(t, &bytesKey{make([]byte, 128)})
   426  }
   427  func windowed(t *testing.T, k key) {
   428  	if runtime.GOARCH == "wasm" {
   429  		t.Skip("Too slow on wasm")
   430  	}
   431  	if testing.Short() {
   432  		t.Skip("Skipping in short mode")
   433  	}
   434  	const BITS = 16
   435  
   436  	h := newHashSet()
   437  	for r := 0; r < k.bits(); r++ {
   438  		for i := 0; i < 1<<BITS; i++ {
   439  			k.clear()
   440  			for j := 0; j < BITS; j++ {
   441  				if i>>uint(j)&1 != 0 {
   442  					k.flipBit((j + r) % k.bits())
   443  				}
   444  			}
   445  			h.add(k.hash())
   446  		}
   447  		h.check(t)
   448  	}
   449  }
   450  
   451  // All keys of the form prefix + [A-Za-z0-9]*N + suffix.
   452  func TestSmhasherText(t *testing.T) {
   453  	if testing.Short() {
   454  		t.Skip("Skipping in short mode")
   455  	}
   456  	t.Parallel()
   457  	h := newHashSet()
   458  	text(t, h, "Foo", "Bar")
   459  	text(t, h, "FooBar", "")
   460  	text(t, h, "", "FooBar")
   461  }
   462  func text(t *testing.T, h *hashSet, prefix, suffix string) {
   463  	t.Helper()
   464  	const N = 4
   465  	const S = "ABCDEFGHIJKLMNOPQRSTabcdefghijklmnopqrst0123456789"
   466  	const L = len(S)
   467  	b := make([]byte, len(prefix)+N+len(suffix))
   468  	copy(b, prefix)
   469  	copy(b[len(prefix)+N:], suffix)
   470  	c := b[len(prefix):]
   471  	for i := 0; i < L; i++ {
   472  		c[0] = S[i]
   473  		for j := 0; j < L; j++ {
   474  			c[1] = S[j]
   475  			for k := 0; k < L; k++ {
   476  				c[2] = S[k]
   477  				for x := 0; x < L; x++ {
   478  					c[3] = S[x]
   479  					h.addB(b)
   480  				}
   481  			}
   482  		}
   483  	}
   484  	h.check(t)
   485  }
   486  
   487  // Make sure different seed values generate different hashes.
   488  func TestSmhasherSeed(t *testing.T) {
   489  	if unsafe.Sizeof(uintptr(0)) == 4 {
   490  		t.Skip("32-bit platforms don't have ideal seed-input distributions (see issue 33988)")
   491  	}
   492  	t.Parallel()
   493  	h := newHashSet()
   494  	const N = 100000
   495  	s := "hello"
   496  	for i := 0; i < N; i++ {
   497  		h.addS_seed(s, Seed{s: uint64(i + 1)})
   498  		h.addS_seed(s, Seed{s: uint64(i+1) << 32}) // make sure high bits are used
   499  	}
   500  	h.check(t)
   501  }
   502  

View as plain text