// Copyright 2026 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build goexperiment.simd

package simd_test

import (
	"fmt"
	"math/rand/v2"
	"simd"
	"testing"
)

func fill(x, y []float32) {
	for i := range x {
		x[i] = 2*rand.Float32() - 1
		y[i] = 2*rand.Float32() - 1
	}
}

// checkErrors logs the mismatch count collected by a benchmark when it is
// positive. It only logs; it does not mark the benchmark as failed.
func checkErrors(b *testing.B, errors int) {
	b.Helper()
	if errors <= 0 {
		return
	}
	b.Logf("errors = %d", errors)
}

// BenchmarkIPFMA measures ipFMA, the simd vector inner product that
// accumulates with MulAdd (FMA). Every timed result is compared with a
// reference value computed once before the loop; mismatches are counted
// and handed to checkErrors.
func BenchmarkIPFMA(b *testing.B) {
	x, y := make([]float32, ipBenchLen), make([]float32, ipBenchLen)
	fill(x, y)

	want, _, _ := ipFMA(x, y)

	mismatches := 0
	for b.Loop() {
		if got, _, _ := ipFMA(x, y); got != want {
			mismatches++
		}
	}
	checkErrors(b, mismatches)
}

// ipFMA computes the inner product of x and y with simd.Float32s vectors,
// accumulating through MulAdd instead of a separate Mul and Add.
//
// It returns sum(a) for the final accumulator a, the value a.Len() * 32
// (presumably the vector width in bits: lanes times 32-bit elements), and
// whatever simd.Emulated() reports.
//
// y is sliced with the same bounds as x, so it must be at least as long as x.
func ipFMA(x, y []float32) (float32, int, bool) {
	var a simd.Float32s
	sumWidth := a.Len() * 32
	emulated := simd.Emulated()
	var i int
	// Main loop: runs while at least a.Len() elements remain in x, consuming
	// a.Len() elements of x and y per iteration.
	for i = 0; i < len(x)-a.Len()+1; i += a.Len() {
		u := simd.LoadFloat32s(x[i : i+a.Len()])
		v := simd.LoadFloat32s(y[i : i+a.Len()])
		// NOTE(review): presumably u*v + a as a fused operation — confirm
		// against the simd package documentation.
		a = u.MulAdd(v, a)
	}
	// Tail: fewer than a.Len() elements are left. LoadFloat32sPart returns two
	// values; first keeps the vector and drops the other.
	if i < len(x) {
		a = first(simd.LoadFloat32sPart(x[i:])).MulAdd(
			first(simd.LoadFloat32sPart(y[i:])), a)
	}

	// sum is defined elsewhere in this package.
	return sum(a), sumWidth, emulated
}

// TestIP checks ip on two identical 50-element vectors holding 0..49. The
// expected inner product is the sum of squares 0² + 1² + … + 49² = 40425.
// It also prints the width and emulation flag that ip reported.
func TestIP(t *testing.T) {
	const n = 50

	var a, b [n]float32
	for i := range a {
		a[i] = float32(i)
		b[i] = float32(i)
	}

	got, sumWidth, emulated := ip(a[:], b[:])
	if got != 40425 {
		t.Errorf("Expected 40425, got %f", got)
	}

	fmt.Printf("ip: sum was computed in width %d, emulated = %v\n", sumWidth, emulated)
}

// TestIPGoTo checks ipGoTo on two identical 50-element vectors holding
// 0..49. The expected inner product is the sum of squares
// 0² + 1² + … + 49² = 40425. It also prints the width and emulation flag
// that ipGoTo reported.
func TestIPGoTo(t *testing.T) {
	const n = 50

	var a, b [n]float32
	for i := range a {
		a[i] = float32(i)
		b[i] = float32(i)
	}

	got, sumWidth, emulated := ipGoTo(a[:], b[:])
	if got != 40425 {
		t.Errorf("Expected 40425, got %f", got)
	}

	fmt.Printf("ipgoto: sum was computed in width %d, emulated = %v\n", sumWidth, emulated)
}

// first returns its first argument and discards the second. It lets a call
// that yields two values be used directly where only the first is wanted,
// e.g. first(f()).
func first[T, U any](t T, u U) T {
	_ = u // deliberately ignored
	return t
}

// ipBenchLen is the number of float32 elements in each input vector used by
// the inner-product benchmarks in this file.
const ipBenchLen = 300000

// BenchmarkIP measures ip, the vanilla simd transcription of a vector inner
// product. Every timed result is compared with a reference value computed
// once before the loop; mismatches are counted and handed to checkErrors.
func BenchmarkIP(b *testing.B) {
	x, y := make([]float32, ipBenchLen), make([]float32, ipBenchLen)
	fill(x, y)

	want, _, _ := ip(x, y)

	mismatches := 0
	for b.Loop() {
		if got, _, _ := ip(x, y); got != want {
			mismatches++
		}
	}
	checkErrors(b, mismatches)
}

// BenchmarkIPUnroll measures ipU, the simd vector inner product whose main
// loop is unrolled over 4 vector operations. Every timed result is compared
// with a reference value computed once before the loop; mismatches are
// counted and handed to checkErrors.
func BenchmarkIPUnroll(b *testing.B) {
	x, y := make([]float32, ipBenchLen), make([]float32, ipBenchLen)
	fill(x, y)

	want, _, _ := ipU(x, y)

	mismatches := 0
	for b.Loop() {
		if got, _, _ := ipU(x, y); got != want {
			mismatches++
		}
	}
	checkErrors(b, mismatches)
}

// BenchmarkIPUnrollMore measures ipUmore, the simd vector inner product
// whose main loop is unrolled over 5 vector operations. Every timed result
// is compared with a reference value computed once before the loop;
// mismatches are counted and handed to checkErrors.
func BenchmarkIPUnrollMore(b *testing.B) {
	x, y := make([]float32, ipBenchLen), make([]float32, ipBenchLen)
	fill(x, y)

	want, _, _ := ipUmore(x, y)

	mismatches := 0
	for b.Loop() {
		if got, _, _ := ipUmore(x, y); got != want {
			mismatches++
		}
	}
	checkErrors(b, mismatches)
}

// ipNosimd computes the inner product of x and y without simd, adding the
// terms serially in index order so that the check comparisons in the
// serial benchmarks turn out right. y must be at least as long as x.
func ipNosimd(x, y []float32) float32 {
	var acc float32
	for i := 0; i < len(x); i++ {
		acc += x[i] * y[i]
	}
	return acc
}

// BenchmarkIPnosimd0 is serial, just a vanilla inner product written inline
// in the timed loop. Each result is compared with ipNosimd computed once on
// the same inputs; mismatches are counted and handed to checkErrors.
func BenchmarkIPnosimd0(b *testing.B) {
	x := make([]float32, ipBenchLen)
	y := make([]float32, ipBenchLen)

	fill(x, y)

	// Reference value, computed once outside the timed loop.
	ip0 := ipNosimd(x, y)

	var errors int
	for b.Loop() {
		var z float32
		// The loop under measurement: y is indexed directly, with no hint
		// relating its length to len(x).
		for i, a := range x {
			z += a * y[i]
		}
		if z != ip0 {
			errors++
		}
	}
	checkErrors(b, errors)
}

// BenchmarkIPnosimd1 is serial, but with a no-op subslice that
// makes it clear that x and y have the same length.
// Each result is compared with ipNosimd computed once on the same inputs;
// mismatches are counted and handed to checkErrors.
func BenchmarkIPnosimd1(b *testing.B) {
	x := make([]float32, ipBenchLen)
	y := make([]float32, ipBenchLen)

	fill(x, y)

	// Reference value, computed once outside the timed loop.
	ip0 := ipNosimd(x, y)

	var errors int
	for b.Loop() {
		var z float32
		// Reslice y to len(x). The contents are unchanged, but the length of
		// yy is now visibly tied to the range of the loop index below.
		yy := y[:(len(x))]
		for i, a := range x {
			z += a * yy[i]
		}
		if z != ip0 {
			errors++
		}
	}
	checkErrors(b, errors)
}

// BenchmarkIPnosimdA is serial, rewritten to use arrays instead of slices,
// so no bounds checking, gosh darn it to heck.
// Both operands are arrays of the same constant length ipBenchLen. Each
// result is compared with ipNosimd computed once on the same data;
// mismatches are counted and handed to checkErrors.
func BenchmarkIPnosimdA(b *testing.B) {
	var x, y [ipBenchLen]float32

	fill(x[:], y[:])

	// Reference value, computed once outside the timed loop.
	ip0 := ipNosimd(x[:], y[:])

	var errors int
	for b.Loop() {
		var z float32
		// The loop under measurement: ranges over the array x and indexes
		// the array y, both of length ipBenchLen.
		for i, a := range x {
			z += a * y[i]
		}
		if z != ip0 {
			errors++
		}
	}
	checkErrors(b, errors)
}

// Package-level inputs and reference result used by
// BenchmarkIPnosimdAnotBloop; they are populated by initIp0.
var (
	x, y [ipBenchLen]float32
	ip0  float32
)

func initIp0() {
	fill(x[:], y[:])
	ip0 = ipNosimd(x[:], y[:])
}

// BenchmarkIPnosimdAnotBloop is serial, rewritten to use arrays instead of slices,
// and using a classic iterated loop to see if b.Loop affects subscript inference,
// so no bounds checking, gosh darn it to heck, this time, for sure.
// It operates on the package-level arrays x and y and compares each result
// with the package-level reference ip0; mismatches are counted and handed
// to checkErrors.
func BenchmarkIPnosimdAnotBloop(b *testing.B) {
	// ip0 == 0 doubles as the "not yet initialized" marker, so the shared
	// inputs and reference value are set up on first use.
	if ip0 == 0 {
		initIp0()
	}

	var errors int
	// Deliberately a b.N loop rather than b.Loop; see the comment above.
	for range b.N {
		var z float32
		for i, a := range x {
			z += a * y[i]
		}
		if z != ip0 {
			errors++
		}
	}
	checkErrors(b, errors)
}

// ip computes the inner product of x and y with simd.Float32s vectors,
// using a separate Mul and Add for each step.
//
// It returns sum(a) for the final accumulator a, the value a.Len() * 32
// (presumably the vector width in bits: lanes times 32-bit elements), and
// whatever simd.Emulated() reports.
//
// y is sliced with the same bounds as x, so it must be at least as long as x.
func ip(x, y []float32) (float32, int, bool) {
	var a simd.Float32s
	sumWidth := a.Len() * 32
	emulated := simd.Emulated()
	var i int
	// Main loop: runs while at least a.Len() elements remain in x, consuming
	// a.Len() elements of x and y per iteration.
	for i = 0; i < len(x)-a.Len()+1; i += a.Len() {
		u := simd.LoadFloat32s(x[i : i+a.Len()])
		v := simd.LoadFloat32s(y[i : i+a.Len()])
		a = a.Add(u.Mul(v))
	}
	// Tail: fewer than a.Len() elements are left. LoadFloat32sPart returns two
	// values; first keeps the vector and drops the other.
	if i < len(x) {
		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
			Mul(first(simd.LoadFloat32sPart(y[i:]))))
	}

	// sum is defined elsewhere in this package.
	return sum(a), sumWidth, emulated
}

// ipU computes the inner product of x and y with simd.Float32s vectors,
// with the main loop unrolled U = 4 times over four separate accumulators.
//
// It returns sum(a) for the final accumulator a, the value a.Len() * 32
// (presumably the vector width in bits: lanes times 32-bit elements), and
// whatever simd.Emulated() reports.
//
// y is sliced with the same bounds as x, so it must be at least as long as x.
func ipU(x, y []float32) (float32, int, bool) {
	const U = 4
	var a, a0, a1, a2, a3 simd.Float32s
	sumWidth := a.Len() * 32
	emulated := simd.Emulated()
	var i int
	// Unrolled loop: runs while at least U*a.Len() elements remain in x. Each
	// iteration handles U consecutive vectors, one per accumulator a0..a3.
	for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
		// Start offsets of the four vectors handled in this iteration.
		i0 := i
		i1 := i + a.Len()
		i2 := i + 2*a.Len()
		i3 := i + 3*a.Len()

		u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
		v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
		a0 = a0.Add(u.Mul(v))

		u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
		v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
		a1 = a1.Add(u.Mul(v))

		u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
		v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
		a2 = a2.Add(u.Mul(v))

		u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
		v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
		a3 = a3.Add(u.Mul(v))
	}
	// Combine the four partial accumulators pairwise into a.
	a = a0.Add(a1).Add(a2.Add(a3))
	// Remainder loop: whole vectors left over after the unrolled loop, one
	// vector per iteration.
	for ; i < len(x)-a.Len()+1; i += a.Len() {
		u := simd.LoadFloat32s(x[i : i+a.Len()])
		v := simd.LoadFloat32s(y[i : i+a.Len()])
		a = a.Add(u.Mul(v))
	}
	// Tail: fewer than a.Len() elements are left. LoadFloat32sPart returns two
	// values; first keeps the vector and drops the other.
	if i < len(x) {
		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
			Mul(first(simd.LoadFloat32sPart(y[i:]))))
	}

	// sum is defined elsewhere in this package.
	return sum(a), sumWidth, emulated
}

// ipUmore computes the inner product of x and y with simd.Float32s vectors,
// with the main loop unrolled U = 5 times over five separate accumulators.
//
// It returns sum(a) for the final accumulator a, the value a.Len() * 32
// (presumably the vector width in bits: lanes times 32-bit elements), and
// whatever simd.Emulated() reports.
//
// y is sliced with the same bounds as x, so it must be at least as long as x.
func ipUmore(x, y []float32) (float32, int, bool) {
	const U = 5
	var a, a0, a1, a2, a3, a4 simd.Float32s
	sumWidth := a.Len() * 32
	emulated := simd.Emulated()
	var i int
	// Unrolled loop: runs while at least U*a.Len() elements remain in x. Each
	// iteration handles U consecutive vectors, one per accumulator a0..a4.
	for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
		// Start offsets of the five vectors handled in this iteration.
		i0 := i
		i1 := i + a.Len()
		i2 := i + 2*a.Len()
		i3 := i + 3*a.Len()
		i4 := i + 4*a.Len()

		u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
		v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
		a0 = a0.Add(u.Mul(v))

		u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
		v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
		a1 = a1.Add(u.Mul(v))

		u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
		v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
		a2 = a2.Add(u.Mul(v))

		u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
		v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
		a3 = a3.Add(u.Mul(v))

		u = simd.LoadFloat32s(x[i4 : i4+a.Len()])
		v = simd.LoadFloat32s(y[i4 : i4+a.Len()])
		a4 = a4.Add(u.Mul(v))
	}
	// Combine the five partial accumulators into a.
	a = a0.Add(a1).Add(a2.Add(a3)).Add(a4)

	// Remainder loop: whole vectors left over after the unrolled loop, one
	// vector per iteration.
	for ; i < len(x)-a.Len()+1; i += a.Len() {
		u := simd.LoadFloat32s(x[i : i+a.Len()])
		v := simd.LoadFloat32s(y[i : i+a.Len()])
		a = a.Add(u.Mul(v))
	}
	// Tail: fewer than a.Len() elements are left. LoadFloat32sPart returns two
	// values; first keeps the vector and drops the other.
	if i < len(x) {
		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
			Mul(first(simd.LoadFloat32sPart(y[i:]))))
	}

	// sum is defined elsewhere in this package.
	return sum(a), sumWidth, emulated
}

// ipGoTo computes the inner product of x and y with simd.Float32s vectors.
// It performs the same steps as ip, but the main loop is written with
// labels and goto instead of a for statement.
//
// It returns sum(a) for the final accumulator a, the value a.Len() * 32
// (presumably the vector width in bits: lanes times 32-bit elements), and
// whatever simd.Emulated() reports.
//
// y is sliced with the same bounds as x, so it must be at least as long as x.
func ipGoTo(x, y []float32) (float32, int, bool) {
	var a simd.Float32s
	sumWidth := a.Len() * 32
	emulated := simd.Emulated()
	var i int
	// u and v are declared before the labels so that the gotos below do not
	// jump over variable declarations.
	var u, v simd.Float32s
loop:
	// Loop condition: exit once fewer than a.Len() elements remain in x.
	if !(i < len(x)-a.Len()+1) {
		goto done
	}
	// Loop body: one full vector from each of x and y.
	u = simd.LoadFloat32s(x[i : i+a.Len()])
	v = simd.LoadFloat32s(y[i : i+a.Len()])
	a = a.Add(u.Mul(v))
	i += a.Len()
	goto loop
done:
	// Tail: fewer than a.Len() elements are left. LoadFloat32sPart returns two
	// values; first keeps the vector and drops the other.
	if i < len(x) {
		a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
			Mul(first(simd.LoadFloat32sPart(y[i:]))))
	}

	// sum is defined elsewhere in this package.
	return sum(a), sumWidth, emulated
}

func boringSum(x simd.Float32s) float32 {
	s := make([]float32, x.Len())
	x.Store(s)
	var r float32
	for _, e := range s {
		r += e
	}
	return r
}