Source file src/simd/ip_test.go

     1  // Copyright 2026 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build goexperiment.simd
     6  
     7  package simd_test
     8  
     9  import (
    10  	"fmt"
    11  	"math/rand/v2"
    12  	"simd"
    13  	"testing"
    14  )
    15  
    16  func fill(x, y []float32) {
    17  	for i := range x {
    18  		x[i] = 2*rand.Float32() - 1
    19  		y[i] = 2*rand.Float32() - 1
    20  	}
    21  }
    22  
    23  func checkErrors(b *testing.B, errors int) {
    24  	b.Helper()
    25  	if errors > 0 {
    26  		b.Logf("errors = %d", errors)
    27  	}
    28  }
    29  
    30  // BenchmarkIPFMA is simd vector inner product computing using FMA.
    31  func BenchmarkIPFMA(b *testing.B) {
    32  	x := make([]float32, ipBenchLen)
    33  	y := make([]float32, ipBenchLen)
    34  
    35  	fill(x, y)
    36  
    37  	ip0, _, _ := ipFMA(x, y)
    38  
    39  	var errors int
    40  	for b.Loop() {
    41  		z, _, _ := ipFMA(x, y)
    42  		if z != ip0 {
    43  			errors++
    44  		}
    45  	}
    46  	checkErrors(b, errors)
    47  }
    48  
    49  func ipFMA(x, y []float32) (float32, int, bool) {
    50  	var a simd.Float32s
    51  	sumWidth := a.Len() * 32
    52  	emulated := simd.Emulated()
    53  	var i int
    54  	for i = 0; i < len(x)-a.Len()+1; i += a.Len() {
    55  		u := simd.LoadFloat32s(x[i : i+a.Len()])
    56  		v := simd.LoadFloat32s(y[i : i+a.Len()])
    57  		a = u.MulAdd(v, a)
    58  	}
    59  	if i < len(x) {
    60  		a = first(simd.LoadFloat32sPart(x[i:])).MulAdd(
    61  			first(simd.LoadFloat32sPart(y[i:])), a)
    62  	}
    63  
    64  	return sum(a), sumWidth, emulated
    65  }
    66  
    67  func TestIP(t *testing.T) {
    68  
    69  	var a, b [50]float32
    70  	for i := 0; i < 50; i++ {
    71  		a[i] = float32(i)
    72  		b[i] = float32(i)
    73  	}
    74  	x, sumWidth, emulated := ip(a[:50], b[:50])
    75  
    76  	if x != 40425 {
    77  		t.Errorf("Expected 40425, got %f", x)
    78  	}
    79  
    80  	fmt.Printf("ip: sum was computed in width %d, emulated = %v\n", sumWidth, emulated)
    81  }
    82  
    83  func TestIPGoTo(t *testing.T) {
    84  
    85  	var a, b [50]float32
    86  	for i := 0; i < 50; i++ {
    87  		a[i] = float32(i)
    88  		b[i] = float32(i)
    89  	}
    90  	x, sumWidth, emulated := ipGoTo(a[:50], b[:50])
    91  
    92  	if x != 40425 {
    93  		t.Errorf("Expected 40425, got %f", x)
    94  	}
    95  
    96  	fmt.Printf("ipgoto: sum was computed in width %d, emulated = %v\n", sumWidth, emulated)
    97  }
    98  
    99  func first[T, U any](t T, u U) T {
   100  	return t
   101  }
   102  
   103  const ipBenchLen = 300000
   104  
   105  // BenchmarkIP is simd vector inner product, vanilla transcription.
   106  func BenchmarkIP(b *testing.B) {
   107  	x := make([]float32, ipBenchLen)
   108  	y := make([]float32, ipBenchLen)
   109  
   110  	fill(x, y)
   111  
   112  	ip0, _, _ := ip(x, y)
   113  
   114  	var errors int
   115  	for b.Loop() {
   116  		z, _, _ := ip(x, y)
   117  		if z != ip0 {
   118  			errors++
   119  		}
   120  	}
   121  	checkErrors(b, errors)
   122  }
   123  
   124  // BenchmarkIPUnroll is simd vector inner product, unrolled 4x vector ops.
   125  func BenchmarkIPUnroll(b *testing.B) {
   126  	x := make([]float32, ipBenchLen)
   127  	y := make([]float32, ipBenchLen)
   128  
   129  	fill(x, y)
   130  
   131  	ip0, _, _ := ipU(x, y)
   132  
   133  	var errors int
   134  	for b.Loop() {
   135  		z, _, _ := ipU(x, y)
   136  		if z != ip0 {
   137  			errors++
   138  		}
   139  	}
   140  	checkErrors(b, errors)
   141  }
   142  
   143  // BenchmarkIPUnrollMore is simd vector inner product, unrolled 5x vector ops
   144  func BenchmarkIPUnrollMore(b *testing.B) {
   145  	x := make([]float32, ipBenchLen)
   146  	y := make([]float32, ipBenchLen)
   147  
   148  	fill(x, y)
   149  
   150  	ip0, _, _ := ipUmore(x, y)
   151  
   152  	var errors int
   153  	for b.Loop() {
   154  		z, _, _ := ipUmore(x, y)
   155  		if z != ip0 {
   156  			errors++
   157  		}
   158  	}
   159  	checkErrors(b, errors)
   160  }
   161  
   162  // ipNosimd computes inner product with serial
   163  // addition order of the terms (to make the)
   164  // check comparison turn out right.
   165  func ipNosimd(x, y []float32) float32 {
   166  	var z float32
   167  	for i, a := range x {
   168  		z += a * y[i]
   169  	}
   170  	return z
   171  }
   172  
   173  // BenchmarkIPnosimd1 is serial, just a vanilla inner product.
   174  func BenchmarkIPnosimd0(b *testing.B) {
   175  	x := make([]float32, ipBenchLen)
   176  	y := make([]float32, ipBenchLen)
   177  
   178  	fill(x, y)
   179  
   180  	ip0 := ipNosimd(x, y)
   181  
   182  	var errors int
   183  	for b.Loop() {
   184  		var z float32
   185  		for i, a := range x {
   186  			z += a * y[i]
   187  		}
   188  		if z != ip0 {
   189  			errors++
   190  		}
   191  	}
   192  	checkErrors(b, errors)
   193  }
   194  
   195  // BenchmarkIPnosimd1 is serial, but with a no-op subslice that
   196  // makes it clear that x and y have the same length.
   197  func BenchmarkIPnosimd1(b *testing.B) {
   198  	x := make([]float32, ipBenchLen)
   199  	y := make([]float32, ipBenchLen)
   200  
   201  	fill(x, y)
   202  
   203  	ip0 := ipNosimd(x, y)
   204  
   205  	var errors int
   206  	for b.Loop() {
   207  		var z float32
   208  		yy := y[:(len(x))]
   209  		for i, a := range x {
   210  			z += a * yy[i]
   211  		}
   212  		if z != ip0 {
   213  			errors++
   214  		}
   215  	}
   216  	checkErrors(b, errors)
   217  }
   218  
   219  // BenchmarkIPnosimdA is serial, rewritten to use arrays instead of slices,
   220  // so no bounds checking, gosh darn it to heck.
   221  func BenchmarkIPnosimdA(b *testing.B) {
   222  	var x, y [ipBenchLen]float32
   223  
   224  	fill(x[:], y[:])
   225  
   226  	ip0 := ipNosimd(x[:], y[:])
   227  
   228  	var errors int
   229  	for b.Loop() {
   230  		var z float32
   231  		for i, a := range x {
   232  			z += a * y[i]
   233  		}
   234  		if z != ip0 {
   235  			errors++
   236  		}
   237  	}
   238  	checkErrors(b, errors)
   239  }
   240  
   241  var x, y [ipBenchLen]float32
   242  var ip0 float32
   243  
   244  func initIp0() {
   245  	fill(x[:], y[:])
   246  	ip0 = ipNosimd(x[:], y[:])
   247  }
   248  
   249  // BenchmarkIPnosimdAnotBloop is serial, rewritten to use arrays instead of slices,
   250  // and using a classic iterated loop to see if b.Loop affects subscript inference,
   251  // so no bounds checking, gosh darn it to heck, this time, for sure.
   252  func BenchmarkIPnosimdAnotBloop(b *testing.B) {
   253  	if ip0 == 0 {
   254  		initIp0()
   255  	}
   256  
   257  	var errors int
   258  	for range b.N {
   259  		var z float32
   260  		for i, a := range x {
   261  			z += a * y[i]
   262  		}
   263  		if z != ip0 {
   264  			errors++
   265  		}
   266  	}
   267  	checkErrors(b, errors)
   268  }
   269  
   270  func ip(x, y []float32) (float32, int, bool) {
   271  	var a simd.Float32s
   272  	sumWidth := a.Len() * 32
   273  	emulated := simd.Emulated()
   274  	var i int
   275  	for i = 0; i < len(x)-a.Len()+1; i += a.Len() {
   276  		u := simd.LoadFloat32s(x[i : i+a.Len()])
   277  		v := simd.LoadFloat32s(y[i : i+a.Len()])
   278  		a = a.Add(u.Mul(v))
   279  	}
   280  	if i < len(x) {
   281  		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
   282  			Mul(first(simd.LoadFloat32sPart(y[i:]))))
   283  	}
   284  
   285  	return sum(a), sumWidth, emulated
   286  }
   287  
   288  func ipU(x, y []float32) (float32, int, bool) {
   289  	const U = 4
   290  	var a, a0, a1, a2, a3 simd.Float32s
   291  	sumWidth := a.Len() * 32
   292  	emulated := simd.Emulated()
   293  	var i int
   294  	for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
   295  		i0 := i
   296  		i1 := i + a.Len()
   297  		i2 := i + 2*a.Len()
   298  		i3 := i + 3*a.Len()
   299  
   300  		u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
   301  		v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
   302  		a0 = a0.Add(u.Mul(v))
   303  
   304  		u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
   305  		v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
   306  		a1 = a1.Add(u.Mul(v))
   307  
   308  		u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
   309  		v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
   310  		a2 = a2.Add(u.Mul(v))
   311  
   312  		u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
   313  		v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
   314  		a3 = a3.Add(u.Mul(v))
   315  	}
   316  	a = a0.Add(a1).Add(a2.Add(a3))
   317  	for ; i < len(x)-a.Len()+1; i += a.Len() {
   318  		u := simd.LoadFloat32s(x[i : i+a.Len()])
   319  		v := simd.LoadFloat32s(y[i : i+a.Len()])
   320  		a = a.Add(u.Mul(v))
   321  	}
   322  	if i < len(x) {
   323  		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
   324  			Mul(first(simd.LoadFloat32sPart(y[i:]))))
   325  	}
   326  
   327  	return sum(a), sumWidth, emulated
   328  }
   329  
   330  func ipUmore(x, y []float32) (float32, int, bool) {
   331  	const U = 5
   332  	var a, a0, a1, a2, a3, a4 simd.Float32s
   333  	sumWidth := a.Len() * 32
   334  	emulated := simd.Emulated()
   335  	var i int
   336  	for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
   337  		i0 := i
   338  		i1 := i + a.Len()
   339  		i2 := i + 2*a.Len()
   340  		i3 := i + 3*a.Len()
   341  		i4 := i + 4*a.Len()
   342  
   343  		u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
   344  		v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
   345  		a0 = a0.Add(u.Mul(v))
   346  
   347  		u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
   348  		v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
   349  		a1 = a1.Add(u.Mul(v))
   350  
   351  		u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
   352  		v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
   353  		a2 = a2.Add(u.Mul(v))
   354  
   355  		u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
   356  		v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
   357  		a3 = a3.Add(u.Mul(v))
   358  
   359  		u = simd.LoadFloat32s(x[i4 : i4+a.Len()])
   360  		v = simd.LoadFloat32s(y[i4 : i4+a.Len()])
   361  		a4 = a4.Add(u.Mul(v))
   362  	}
   363  	a = a0.Add(a1).Add(a2.Add(a3)).Add(a4)
   364  
   365  	for ; i < len(x)-a.Len()+1; i += a.Len() {
   366  		u := simd.LoadFloat32s(x[i : i+a.Len()])
   367  		v := simd.LoadFloat32s(y[i : i+a.Len()])
   368  		a = a.Add(u.Mul(v))
   369  	}
   370  	if i < len(x) {
   371  		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
   372  			Mul(first(simd.LoadFloat32sPart(y[i:]))))
   373  	}
   374  
   375  	return sum(a), sumWidth, emulated
   376  }
   377  
   378  func ipGoTo(x, y []float32) (float32, int, bool) {
   379  	var a simd.Float32s
   380  	sumWidth := a.Len() * 32
   381  	emulated := simd.Emulated()
   382  	var i int
   383  	var u, v simd.Float32s
   384  loop:
   385  	if !(i < len(x)-a.Len()+1) {
   386  		goto done
   387  	}
   388  	u = simd.LoadFloat32s(x[i : i+a.Len()])
   389  	v = simd.LoadFloat32s(y[i : i+a.Len()])
   390  	a = a.Add(u.Mul(v))
   391  	i += a.Len()
   392  	goto loop
   393  done:
   394  	if i < len(x) {
   395  		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
   396  			Mul(first(simd.LoadFloat32sPart(y[i:]))))
   397  	}
   398  
   399  	return sum(a), sumWidth, emulated
   400  }
   401  
   402  func boringSum(x simd.Float32s) float32 {
   403  	s := make([]float32, x.Len())
   404  	x.Store(s)
   405  	var r float32
   406  	for _, e := range s {
   407  		r += e
   408  	}
   409  	return r
   410  }
   411  

View as plain text