// Copyright 2026 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build goexperiment.simd

package simd_test

import (
	"fmt"
	"math/rand/v2"
	"simd"
	"testing"
)

// fill overwrites every element of x, and the element of y at the same
// index, with 2*rand.Float32()-1. y is indexed with x's indices, so y
// must be at least as long as x.
func fill(x, y []float32) {
	for i := range x {
		x[i] = 2*rand.Float32() - 1
		y[i] = 2*rand.Float32() - 1
	}
}

// checkErrors logs the mismatch count accumulated by a benchmark when it
// is nonzero. It only logs (b.Logf); it does not mark the benchmark as
// failed.
func checkErrors(b *testing.B, errors int) {
	b.Helper()
	if errors > 0 {
		b.Logf("errors = %d", errors)
	}
}

// BenchmarkIPFMA benchmarks ipFMA, the simd vector inner product that
// accumulates with MulAdd (FMA).
func BenchmarkIPFMA(b *testing.B) {
	x := make([]float32, ipBenchLen)
	y := make([]float32, ipBenchLen)
	fill(x, y)
	// Reference value, computed once by the same function under test.
	ip0, _, _ := ipFMA(x, y)
	var errors int
	for b.Loop() {
		z, _, _ := ipFMA(x, y)
		// Count iterations whose result differs from the reference.
		if z != ip0 {
			errors++
		}
	}
	checkErrors(b, errors)
}

// ipFMA returns the inner product of x and y, accumulated in a
// simd.Float32s vector with MulAdd, together with a.Len()*32 (reported by
// callers as the width the sum was computed in) and simd.Emulated().
//
// y is sliced with the same bounds as x, so y must be at least as long
// as x.
//
// NOTE(review): sum is not defined in this file; presumably it adds up
// the lanes of a. Confirm against its definition.
func ipFMA(x, y []float32) (float32, int, bool) {
	var a simd.Float32s
	sumWidth := a.Len() * 32
	emulated := simd.Emulated()
	var i int
	// Main loop: runs while at least a.Len() elements remain, i.e.
	// while i+a.Len() <= len(x).
	for i = 0; i < len(x)-a.Len()+1; i += a.Len() {
		u := simd.LoadFloat32s(x[i : i+a.Len()])
		v := simd.LoadFloat32s(y[i : i+a.Len()])
		// Presumably a = u*v + a per lane; confirm against simd docs.
		a = u.MulAdd(v, a)
	}
	// Tail: fewer than a.Len() elements remain. first keeps only the
	// first result of LoadFloat32sPart.
	if i < len(x) {
		a = first(simd.LoadFloat32sPart(x[i:])).MulAdd(
			first(simd.LoadFloat32sPart(y[i:])), a)
	}
	return sum(a), sumWidth, emulated
}

// TestIP checks ip on two 50-element vectors holding 0..49, whose inner
// product is the sum of i*i for i in 0..49, namely 40425. It also prints
// the reported width and whether the simd implementation is emulated.
func TestIP(t *testing.T) {
	var a, b [50]float32
	for i := 0; i < 50; i++ {
		a[i] = float32(i)
		b[i] = float32(i)
	}
	x, sumWidth, emulated := ip(a[:50], b[:50])
	if x != 40425 {
		t.Errorf("Expected 40425, got %f", x)
	}
	fmt.Printf("ip: sum was computed in width %d, emulated = %v\n", sumWidth, emulated)
}

// TestIPGoTo is the same check as TestIP, applied to ipGoTo.
func TestIPGoTo(t *testing.T) {
	var a, b [50]float32
	for i := 0; i < 50; i++ {
		a[i] = float32(i)
		b[i] = float32(i)
	}
	x, sumWidth, emulated := ipGoTo(a[:50], b[:50])
	if x != 40425 {
		t.Errorf("Expected 40425, got %f", x)
	}
	fmt.Printf("ipgoto: sum was computed in width %d, emulated = %v\n", sumWidth, emulated)
}

// first returns its first argument and discards the second. It is used
// to keep only the first result of a two-result call inside an
// expression.
func first[T, U any](t T, u U) T {
	return t
}

// ipBenchLen is the length of the vectors used by the benchmarks.
const ipBenchLen = 300000

// BenchmarkIP is simd vector inner product, vanilla transcription.
func BenchmarkIP(b *testing.B) {
	x := make([]float32, ipBenchLen)
	y := make([]float32, ipBenchLen)
	fill(x, y)
	// Reference value, computed once by the same function under test.
	ip0, _, _ := ip(x, y)
	var errors int
	for b.Loop() {
		z, _, _ := ip(x, y)
		// Count iterations whose result differs from the reference.
		if z != ip0 {
			errors++
		}
	}
	checkErrors(b, errors)
}

// BenchmarkIPUnroll is simd vector inner product, unrolled 4x vector ops.
func BenchmarkIPUnroll(b *testing.B) {
	x := make([]float32, ipBenchLen)
	y := make([]float32, ipBenchLen)
	fill(x, y)
	// Reference value comes from ipU itself, so the comparison below
	// uses the same summation order as the code being timed.
	ip0, _, _ := ipU(x, y)
	var errors int
	for b.Loop() {
		z, _, _ := ipU(x, y)
		if z != ip0 {
			errors++
		}
	}
	checkErrors(b, errors)
}

// BenchmarkIPUnrollMore is simd vector inner product, unrolled 5x vector ops.
func BenchmarkIPUnrollMore(b *testing.B) {
	x := make([]float32, ipBenchLen)
	y := make([]float32, ipBenchLen)
	fill(x, y)
	// Reference value comes from ipUmore itself.
	ip0, _, _ := ipUmore(x, y)
	var errors int
	for b.Loop() {
		z, _, _ := ipUmore(x, y)
		if z != ip0 {
			errors++
		}
	}
	checkErrors(b, errors)
}

// ipNosimd computes the inner product of x and y with serial addition
// order of the terms, so that the check comparison in the nosimd
// benchmarks (which add the terms in the same order) turns out right.
// y is indexed with x's indices, so y must be at least as long as x.
func ipNosimd(x, y []float32) float32 {
	var z float32
	for i, a := range x {
		z += a * y[i]
	}
	return z
}

// BenchmarkIPnosimd0 is serial, just a vanilla inner product.
func BenchmarkIPnosimd0(b *testing.B) {
	x := make([]float32, ipBenchLen)
	y := make([]float32, ipBenchLen)
	fill(x, y)
	ip0 := ipNosimd(x, y)
	var errors int
	for b.Loop() {
		// The loop is written out here rather than calling ipNosimd.
		var z float32
		for i, a := range x {
			z += a * y[i]
		}
		if z != ip0 {
			errors++
		}
	}
	checkErrors(b, errors)
}

// BenchmarkIPnosimd1 is serial, but with a no-op subslice that
// makes it clear that x and y have the same length.
func BenchmarkIPnosimd1(b *testing.B) {
	x := make([]float32, ipBenchLen)
	y := make([]float32, ipBenchLen)
	fill(x, y)
	ip0 := ipNosimd(x, y)
	var errors int
	for b.Loop() {
		var z float32
		// yy has exactly len(x) elements; this is the "no-op subslice".
		yy := y[:(len(x))]
		for i, a := range x {
			z += a * yy[i]
		}
		if z != ip0 {
			errors++
		}
	}
	checkErrors(b, errors)
}

// BenchmarkIPnosimdA is serial, rewritten to use arrays instead of slices,
// so no bounds checking, gosh darn it to heck.
func BenchmarkIPnosimdA(b *testing.B) {
	// These local arrays shadow the package-level x and y declared below.
	var x, y [ipBenchLen]float32
	fill(x[:], y[:])
	ip0 := ipNosimd(x[:], y[:])
	var errors int
	for b.Loop() {
		var z float32
		for i, a := range x {
			z += a * y[i]
		}
		if z != ip0 {
			errors++
		}
	}
	checkErrors(b, errors)
}

// x and y are package-level input arrays used by
// BenchmarkIPnosimdAnotBloop; they are filled by initIp0.
var x, y [ipBenchLen]float32

// ip0 is the reference inner product of x and y, set by initIp0.
// Its zero value is used as the "not yet initialized" marker.
var ip0 float32

// initIp0 fills the package-level x and y and records their serial
// inner product in ip0.
func initIp0() {
	fill(x[:], y[:])
	ip0 = ipNosimd(x[:], y[:])
}

// BenchmarkIPnosimdAnotBloop is serial, rewritten to use arrays instead of slices,
// and using a classic iterated loop to see if b.Loop affects subscript inference,
// so no bounds checking, gosh darn it to heck, this time, for sure.
func BenchmarkIPnosimdAnotBloop(b *testing.B) {
	// Initialize the shared inputs on first use; ip0 == 0 means
	// initIp0 has not run (or produced exactly zero).
	if ip0 == 0 {
		initIp0()
	}
	var errors int
	for range b.N {
		var z float32
		for i, a := range x {
			z += a * y[i]
		}
		if z != ip0 {
			errors++
		}
	}
	checkErrors(b, errors)
}

// ip returns the inner product of x and y, accumulated in a single
// simd.Float32s vector with separate Mul and Add steps, together with
// a.Len()*32 (reported by callers as the width the sum was computed in)
// and simd.Emulated().
//
// y is sliced with the same bounds as x, so y must be at least as long
// as x.
//
// NOTE(review): sum is not defined in this file; presumably it adds up
// the lanes of a. Confirm against its definition.
func ip(x, y []float32) (float32, int, bool) {
	var a simd.Float32s
	sumWidth := a.Len() * 32
	emulated := simd.Emulated()
	var i int
	// Main loop: runs while at least a.Len() elements remain, i.e.
	// while i+a.Len() <= len(x).
	for i = 0; i < len(x)-a.Len()+1; i += a.Len() {
		u := simd.LoadFloat32s(x[i : i+a.Len()])
		v := simd.LoadFloat32s(y[i : i+a.Len()])
		a = a.Add(u.Mul(v))
	}
	// Tail: fewer than a.Len() elements remain. first keeps only the
	// first result of LoadFloat32sPart.
	if i < len(x) {
		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
			Mul(first(simd.LoadFloat32sPart(y[i:]))))
	}
	return sum(a), sumWidth, emulated
}

// ipU is ip with the main loop unrolled U = 4 times, using four
// separate accumulators (a0..a3) that are combined after the unrolled
// loop. Whatever full vectors remain are then handled one at a time,
// followed by the partial tail. The return values have the same meaning
// as for ip.
func ipU(x, y []float32) (float32, int, bool) {
	const U = 4
	var a, a0, a1, a2, a3 simd.Float32s
	sumWidth := a.Len() * 32
	emulated := simd.Emulated()
	var i int
	// Unrolled loop: runs while at least U*a.Len() elements remain.
	for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
		// Start offsets of the U consecutive vectors in this step.
		i0 := i
		i1 := i + a.Len()
		i2 := i + 2*a.Len()
		i3 := i + 3*a.Len()
		u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
		v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
		a0 = a0.Add(u.Mul(v))
		u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
		v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
		a1 = a1.Add(u.Mul(v))
		u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
		v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
		a2 = a2.Add(u.Mul(v))
		u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
		v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
		a3 = a3.Add(u.Mul(v))
	}
	// Combine the partial accumulators pairwise.
	a = a0.Add(a1).Add(a2.Add(a3))
	// Remaining full vectors, one per iteration, continuing from i.
	for ; i < len(x)-a.Len()+1; i += a.Len() {
		u := simd.LoadFloat32s(x[i : i+a.Len()])
		v := simd.LoadFloat32s(y[i : i+a.Len()])
		a = a.Add(u.Mul(v))
	}
	// Partial tail, as in ip.
	if i < len(x) {
		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
			Mul(first(simd.LoadFloat32sPart(y[i:]))))
	}
	return sum(a), sumWidth, emulated
}

// ipUmore is ip with the main loop unrolled U = 5 times, using five
// separate accumulators (a0..a4) that are combined after the unrolled
// loop. Whatever full vectors remain are then handled one at a time,
// followed by the partial tail. The return values have the same meaning
// as for ip.
func ipUmore(x, y []float32) (float32, int, bool) {
	const U = 5
	var a, a0, a1, a2, a3, a4 simd.Float32s
	sumWidth := a.Len() * 32
	emulated := simd.Emulated()
	var i int
	// Unrolled loop: runs while at least U*a.Len() elements remain.
	for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
		// Start offsets of the U consecutive vectors in this step.
		i0 := i
		i1 := i + a.Len()
		i2 := i + 2*a.Len()
		i3 := i + 3*a.Len()
		i4 := i + 4*a.Len()
		u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
		v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
		a0 = a0.Add(u.Mul(v))
		u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
		v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
		a1 = a1.Add(u.Mul(v))
		u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
		v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
		a2 = a2.Add(u.Mul(v))
		u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
		v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
		a3 = a3.Add(u.Mul(v))
		u = simd.LoadFloat32s(x[i4 : i4+a.Len()])
		v = simd.LoadFloat32s(y[i4 : i4+a.Len()])
		a4 = a4.Add(u.Mul(v))
	}
	// Combine the partial accumulators.
	a = a0.Add(a1).Add(a2.Add(a3)).Add(a4)
	// Remaining full vectors, one per iteration, continuing from i.
	for ; i < len(x)-a.Len()+1; i += a.Len() {
		u := simd.LoadFloat32s(x[i : i+a.Len()])
		v := simd.LoadFloat32s(y[i : i+a.Len()])
		a = a.Add(u.Mul(v))
	}
	// Partial tail, as in ip.
	if i < len(x) {
		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
			Mul(first(simd.LoadFloat32sPart(y[i:]))))
	}
	return sum(a), sumWidth, emulated
}

// ipGoTo performs the same computation as ip, with the main loop
// written using labels and goto instead of a for statement. The return
// values have the same meaning as for ip.
func ipGoTo(x, y []float32) (float32, int, bool) {
	var a simd.Float32s
	sumWidth := a.Len() * 32
	emulated := simd.Emulated()
	var i int
	// u and v are declared ahead of the labels because a goto may not
	// jump over a variable declaration in its own block.
	var u, v simd.Float32s
loop:
	// Loop condition of ip, negated: leave once no full vector remains.
	if !(i < len(x)-a.Len()+1) {
		goto done
	}
	u = simd.LoadFloat32s(x[i : i+a.Len()])
	v = simd.LoadFloat32s(y[i : i+a.Len()])
	a = a.Add(u.Mul(v))
	i += a.Len()
	goto loop
done:
	// Partial tail, as in ip.
	if i < len(x) {
		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
			Mul(first(simd.LoadFloat32sPart(y[i:]))))
	}
	return sum(a), sumWidth, emulated
}

// boringSum stores x into a freshly allocated slice of x.Len() elements
// and returns the sum of those elements, added in index order.
func boringSum(x simd.Float32s) float32 {
	s := make([]float32, x.Len())
	x.Store(s)
	var r float32
	for _, e := range s {
		r += e
	}
	return r
}