Source file src/hash/maphash/smhasher_test.go

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !race
     6  
     7  package maphash
     8  
     9  import (
    10  	"fmt"
    11  	"internal/testenv"
    12  	"math"
    13  	"math/rand"
    14  	"runtime"
    15  	"slices"
    16  	"strings"
    17  	"testing"
    18  	"unsafe"
    19  )
    20  
    21  // Smhasher is a torture test for hash functions.
    22  // https://code.google.com/p/smhasher/
    23  // This code is a port of some of the Smhasher tests to Go.
    24  
    25  // Note: due to the long running time of these tests, they are
    26  // currently disabled in -race mode.
    27  
    28  var fixedSeed = MakeSeed()
    29  
    30  // Sanity checks.
    31  // hash should not depend on values outside key.
    32  // hash should not depend on alignment.
    33  func TestSmhasherSanity(t *testing.T) {
    34  	t.Parallel()
    35  	r := rand.New(rand.NewSource(1234))
    36  	const REP = 10
    37  	const KEYMAX = 128
    38  	const PAD = 16
    39  	const OFFMAX = 16
    40  	for k := 0; k < REP; k++ {
    41  		for n := 0; n < KEYMAX; n++ {
    42  			for i := 0; i < OFFMAX; i++ {
    43  				var b [KEYMAX + OFFMAX + 2*PAD]byte
    44  				var c [KEYMAX + OFFMAX + 2*PAD]byte
    45  				randBytes(r, b[:])
    46  				randBytes(r, c[:])
    47  				copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n])
    48  				if bytesHash(b[PAD:PAD+n]) != bytesHash(c[PAD+i:PAD+i+n]) {
    49  					t.Errorf("hash depends on bytes outside key")
    50  				}
    51  			}
    52  		}
    53  	}
    54  }
    55  
    56  func bytesHash(b []byte) uint64 {
    57  	var h Hash
    58  	h.SetSeed(fixedSeed)
    59  	h.Write(b)
    60  	return h.Sum64()
    61  }
    62  func stringHash(s string) uint64 {
    63  	var h Hash
    64  	h.SetSeed(fixedSeed)
    65  	h.WriteString(s)
    66  	return h.Sum64()
    67  }
    68  
    69  const hashSize = 64
    70  
    71  func randBytes(r *rand.Rand, b []byte) {
    72  	r.Read(b) // can't fail
    73  }
    74  
    75  // A hashSet measures the frequency of hash collisions.
    76  type hashSet struct {
    77  	list []uint64 // list of hashes added
    78  }
    79  
    80  func newHashSet() *hashSet {
    81  	return &hashSet{list: make([]uint64, 0, 1024)}
    82  }
    83  func (s *hashSet) add(h uint64) {
    84  	s.list = append(s.list, h)
    85  }
    86  func (s *hashSet) addS(x string) {
    87  	s.add(stringHash(x))
    88  }
    89  func (s *hashSet) addB(x []byte) {
    90  	s.add(bytesHash(x))
    91  }
    92  func (s *hashSet) addS_seed(x string, seed Seed) {
    93  	var h Hash
    94  	h.SetSeed(seed)
    95  	h.WriteString(x)
    96  	s.add(h.Sum64())
    97  }
    98  func (s *hashSet) check(t *testing.T) {
    99  	list := s.list
   100  	slices.Sort(list)
   101  
   102  	collisions := 0
   103  	for i := 1; i < len(list); i++ {
   104  		if list[i] == list[i-1] {
   105  			collisions++
   106  		}
   107  	}
   108  	n := len(list)
   109  
   110  	const SLOP = 10.0
   111  	pairs := int64(n) * int64(n-1) / 2
   112  	expected := float64(pairs) / math.Pow(2.0, float64(hashSize))
   113  	stddev := math.Sqrt(expected)
   114  	if float64(collisions) > expected+SLOP*(3*stddev+1) {
   115  		t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev)
   116  	}
   117  	// Reset for reuse
   118  	s.list = s.list[:0]
   119  }
   120  
   121  // a string plus adding zeros must make distinct hashes
   122  func TestSmhasherAppendedZeros(t *testing.T) {
   123  	t.Parallel()
   124  	s := "hello" + strings.Repeat("\x00", 256)
   125  	h := newHashSet()
   126  	for i := 0; i <= len(s); i++ {
   127  		h.addS(s[:i])
   128  	}
   129  	h.check(t)
   130  }
   131  
   132  // All 0-3 byte strings have distinct hashes.
   133  func TestSmhasherSmallKeys(t *testing.T) {
   134  	testenv.ParallelOn64Bit(t)
   135  	h := newHashSet()
   136  	var b [3]byte
   137  	for i := 0; i < 256; i++ {
   138  		b[0] = byte(i)
   139  		h.addB(b[:1])
   140  		for j := 0; j < 256; j++ {
   141  			b[1] = byte(j)
   142  			h.addB(b[:2])
   143  			if !testing.Short() {
   144  				for k := 0; k < 256; k++ {
   145  					b[2] = byte(k)
   146  					h.addB(b[:3])
   147  				}
   148  			}
   149  		}
   150  	}
   151  	h.check(t)
   152  }
   153  
   154  // Different length strings of all zeros have distinct hashes.
   155  func TestSmhasherZeros(t *testing.T) {
   156  	t.Parallel()
   157  	N := 256 * 1024
   158  	if testing.Short() {
   159  		N = 1024
   160  	}
   161  	h := newHashSet()
   162  	b := make([]byte, N)
   163  	for i := 0; i <= N; i++ {
   164  		h.addB(b[:i])
   165  	}
   166  	h.check(t)
   167  }
   168  
   169  // Strings with up to two nonzero bytes all have distinct hashes.
   170  func TestSmhasherTwoNonzero(t *testing.T) {
   171  	if runtime.GOARCH == "wasm" {
   172  		t.Skip("Too slow on wasm")
   173  	}
   174  	if testing.Short() {
   175  		t.Skip("Skipping in short mode")
   176  	}
   177  	testenv.ParallelOn64Bit(t)
   178  	h := newHashSet()
   179  	for n := 2; n <= 16; n++ {
   180  		twoNonZero(h, n)
   181  	}
   182  	h.check(t)
   183  }
   184  func twoNonZero(h *hashSet, n int) {
   185  	b := make([]byte, n)
   186  
   187  	// all zero
   188  	h.addB(b)
   189  
   190  	// one non-zero byte
   191  	for i := 0; i < n; i++ {
   192  		for x := 1; x < 256; x++ {
   193  			b[i] = byte(x)
   194  			h.addB(b)
   195  			b[i] = 0
   196  		}
   197  	}
   198  
   199  	// two non-zero bytes
   200  	for i := 0; i < n; i++ {
   201  		for x := 1; x < 256; x++ {
   202  			b[i] = byte(x)
   203  			for j := i + 1; j < n; j++ {
   204  				for y := 1; y < 256; y++ {
   205  					b[j] = byte(y)
   206  					h.addB(b)
   207  					b[j] = 0
   208  				}
   209  			}
   210  			b[i] = 0
   211  		}
   212  	}
   213  }
   214  
   215  // Test strings with repeats, like "abcdabcdabcdabcd..."
   216  func TestSmhasherCyclic(t *testing.T) {
   217  	if testing.Short() {
   218  		t.Skip("Skipping in short mode")
   219  	}
   220  	t.Parallel()
   221  	r := rand.New(rand.NewSource(1234))
   222  	const REPEAT = 8
   223  	const N = 1000000
   224  	h := newHashSet()
   225  	for n := 4; n <= 12; n++ {
   226  		b := make([]byte, REPEAT*n)
   227  		for i := 0; i < N; i++ {
   228  			b[0] = byte(i * 79 % 97)
   229  			b[1] = byte(i * 43 % 137)
   230  			b[2] = byte(i * 151 % 197)
   231  			b[3] = byte(i * 199 % 251)
   232  			randBytes(r, b[4:n])
   233  			for j := n; j < n*REPEAT; j++ {
   234  				b[j] = b[j-n]
   235  			}
   236  			h.addB(b)
   237  		}
   238  		h.check(t)
   239  	}
   240  }
   241  
   242  // Test strings with only a few bits set
   243  func TestSmhasherSparse(t *testing.T) {
   244  	if runtime.GOARCH == "wasm" {
   245  		t.Skip("Too slow on wasm")
   246  	}
   247  	if testing.Short() {
   248  		t.Skip("Skipping in short mode")
   249  	}
   250  	t.Parallel()
   251  	h := newHashSet()
   252  	sparse(t, h, 32, 6)
   253  	sparse(t, h, 40, 6)
   254  	sparse(t, h, 48, 5)
   255  	sparse(t, h, 56, 5)
   256  	sparse(t, h, 64, 5)
   257  	sparse(t, h, 96, 4)
   258  	sparse(t, h, 256, 3)
   259  	sparse(t, h, 2048, 2)
   260  }
   261  func sparse(t *testing.T, h *hashSet, n int, k int) {
   262  	b := make([]byte, n/8)
   263  	setbits(h, b, 0, k)
   264  	h.check(t)
   265  }
   266  
   267  // set up to k bits at index i and greater
   268  func setbits(h *hashSet, b []byte, i int, k int) {
   269  	h.addB(b)
   270  	if k == 0 {
   271  		return
   272  	}
   273  	for j := i; j < len(b)*8; j++ {
   274  		b[j/8] |= byte(1 << uint(j&7))
   275  		setbits(h, b, j+1, k-1)
   276  		b[j/8] &= byte(^(1 << uint(j&7)))
   277  	}
   278  }
   279  
   280  // Test all possible combinations of n blocks from the set s.
   281  // "permutation" is a bad name here, but it is what Smhasher uses.
   282  func TestSmhasherPermutation(t *testing.T) {
   283  	if runtime.GOARCH == "wasm" {
   284  		t.Skip("Too slow on wasm")
   285  	}
   286  	if testing.Short() {
   287  		t.Skip("Skipping in short mode")
   288  	}
   289  	testenv.ParallelOn64Bit(t)
   290  	h := newHashSet()
   291  	permutation(t, h, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8)
   292  	permutation(t, h, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8)
   293  	permutation(t, h, []uint32{0, 1}, 20)
   294  	permutation(t, h, []uint32{0, 1 << 31}, 20)
   295  	permutation(t, h, []uint32{0, 1, 2, 3, 4, 5, 6, 7, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 6)
   296  }
   297  func permutation(t *testing.T, h *hashSet, s []uint32, n int) {
   298  	b := make([]byte, n*4)
   299  	genPerm(h, b, s, 0)
   300  	h.check(t)
   301  }
   302  func genPerm(h *hashSet, b []byte, s []uint32, n int) {
   303  	h.addB(b[:n])
   304  	if n == len(b) {
   305  		return
   306  	}
   307  	for _, v := range s {
   308  		b[n] = byte(v)
   309  		b[n+1] = byte(v >> 8)
   310  		b[n+2] = byte(v >> 16)
   311  		b[n+3] = byte(v >> 24)
   312  		genPerm(h, b, s, n+4)
   313  	}
   314  }
   315  
   316  type key interface {
   317  	clear()              // set bits all to 0
   318  	random(r *rand.Rand) // set key to something random
   319  	bits() int           // how many bits key has
   320  	flipBit(i int)       // flip bit i of the key
   321  	hash() uint64        // hash the key
   322  	name() string        // for error reporting
   323  }
   324  
   325  type bytesKey struct {
   326  	b []byte
   327  }
   328  
   329  func (k *bytesKey) clear() {
   330  	clear(k.b)
   331  }
   332  func (k *bytesKey) random(r *rand.Rand) {
   333  	randBytes(r, k.b)
   334  }
   335  func (k *bytesKey) bits() int {
   336  	return len(k.b) * 8
   337  }
   338  func (k *bytesKey) flipBit(i int) {
   339  	k.b[i>>3] ^= byte(1 << uint(i&7))
   340  }
   341  func (k *bytesKey) hash() uint64 {
   342  	return bytesHash(k.b)
   343  }
   344  func (k *bytesKey) name() string {
   345  	return fmt.Sprintf("bytes%d", len(k.b))
   346  }
   347  
   348  // Flipping a single bit of a key should flip each output bit with 50% probability.
   349  func TestSmhasherAvalanche(t *testing.T) {
   350  	if runtime.GOARCH == "wasm" {
   351  		t.Skip("Too slow on wasm")
   352  	}
   353  	if testing.Short() {
   354  		t.Skip("Skipping in short mode")
   355  	}
   356  	t.Parallel()
   357  	avalancheTest1(t, &bytesKey{make([]byte, 2)})
   358  	avalancheTest1(t, &bytesKey{make([]byte, 4)})
   359  	avalancheTest1(t, &bytesKey{make([]byte, 8)})
   360  	avalancheTest1(t, &bytesKey{make([]byte, 16)})
   361  	avalancheTest1(t, &bytesKey{make([]byte, 32)})
   362  	avalancheTest1(t, &bytesKey{make([]byte, 200)})
   363  }
   364  func avalancheTest1(t *testing.T, k key) {
   365  	const REP = 100000
   366  	r := rand.New(rand.NewSource(1234))
   367  	n := k.bits()
   368  
   369  	// grid[i][j] is a count of whether flipping
   370  	// input bit i affects output bit j.
   371  	grid := make([][hashSize]int, n)
   372  
   373  	for z := 0; z < REP; z++ {
   374  		// pick a random key, hash it
   375  		k.random(r)
   376  		h := k.hash()
   377  
   378  		// flip each bit, hash & compare the results
   379  		for i := 0; i < n; i++ {
   380  			k.flipBit(i)
   381  			d := h ^ k.hash()
   382  			k.flipBit(i)
   383  
   384  			// record the effects of that bit flip
   385  			g := &grid[i]
   386  			for j := 0; j < hashSize; j++ {
   387  				g[j] += int(d & 1)
   388  				d >>= 1
   389  			}
   390  		}
   391  	}
   392  
   393  	// Each entry in the grid should be about REP/2.
   394  	// More precisely, we did N = k.bits() * hashSize experiments where
   395  	// each is the sum of REP coin flips. We want to find bounds on the
   396  	// sum of coin flips such that a truly random experiment would have
   397  	// all sums inside those bounds with 99% probability.
   398  	N := n * hashSize
   399  	var c float64
   400  	// find c such that Prob(mean-c*stddev < x < mean+c*stddev)^N > .9999
   401  	for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 {
   402  	}
   403  	c *= 11.0 // allowed slack: 40% to 60% - we don't need to be perfectly random
   404  	mean := .5 * REP
   405  	stddev := .5 * math.Sqrt(REP)
   406  	low := int(mean - c*stddev)
   407  	high := int(mean + c*stddev)
   408  	for i := 0; i < n; i++ {
   409  		for j := 0; j < hashSize; j++ {
   410  			x := grid[i][j]
   411  			if x < low || x > high {
   412  				t.Errorf("bad bias for %s bit %d -> bit %d: %d/%d\n", k.name(), i, j, x, REP)
   413  			}
   414  		}
   415  	}
   416  }
   417  
   418  // All bit rotations of a set of distinct keys
   419  func TestSmhasherWindowed(t *testing.T) {
   420  	t.Parallel()
   421  	windowed(t, &bytesKey{make([]byte, 128)})
   422  }
   423  func windowed(t *testing.T, k key) {
   424  	if runtime.GOARCH == "wasm" {
   425  		t.Skip("Too slow on wasm")
   426  	}
   427  	if testing.Short() {
   428  		t.Skip("Skipping in short mode")
   429  	}
   430  	const BITS = 16
   431  
   432  	h := newHashSet()
   433  	for r := 0; r < k.bits(); r++ {
   434  		for i := 0; i < 1<<BITS; i++ {
   435  			k.clear()
   436  			for j := 0; j < BITS; j++ {
   437  				if i>>uint(j)&1 != 0 {
   438  					k.flipBit((j + r) % k.bits())
   439  				}
   440  			}
   441  			h.add(k.hash())
   442  		}
   443  		h.check(t)
   444  	}
   445  }
   446  
   447  // All keys of the form prefix + [A-Za-z0-9]*N + suffix.
   448  func TestSmhasherText(t *testing.T) {
   449  	if testing.Short() {
   450  		t.Skip("Skipping in short mode")
   451  	}
   452  	t.Parallel()
   453  	h := newHashSet()
   454  	text(t, h, "Foo", "Bar")
   455  	text(t, h, "FooBar", "")
   456  	text(t, h, "", "FooBar")
   457  }
   458  func text(t *testing.T, h *hashSet, prefix, suffix string) {
   459  	const N = 4
   460  	const S = "ABCDEFGHIJKLMNOPQRSTabcdefghijklmnopqrst0123456789"
   461  	const L = len(S)
   462  	b := make([]byte, len(prefix)+N+len(suffix))
   463  	copy(b, prefix)
   464  	copy(b[len(prefix)+N:], suffix)
   465  	c := b[len(prefix):]
   466  	for i := 0; i < L; i++ {
   467  		c[0] = S[i]
   468  		for j := 0; j < L; j++ {
   469  			c[1] = S[j]
   470  			for k := 0; k < L; k++ {
   471  				c[2] = S[k]
   472  				for x := 0; x < L; x++ {
   473  					c[3] = S[x]
   474  					h.addB(b)
   475  				}
   476  			}
   477  		}
   478  	}
   479  	h.check(t)
   480  }
   481  
   482  // Make sure different seed values generate different hashes.
   483  func TestSmhasherSeed(t *testing.T) {
   484  	if unsafe.Sizeof(uintptr(0)) == 4 {
   485  		t.Skip("32-bit platforms don't have ideal seed-input distributions (see issue 33988)")
   486  	}
   487  	t.Parallel()
   488  	h := newHashSet()
   489  	const N = 100000
   490  	s := "hello"
   491  	for i := 0; i < N; i++ {
   492  		h.addS_seed(s, Seed{s: uint64(i + 1)})
   493  		h.addS_seed(s, Seed{s: uint64(i+1) << 32}) // make sure high bits are used
   494  	}
   495  	h.check(t)
   496  }
   497  

View as plain text