Source file
src/simd/ip_test.go
1
2
3
4
5
6
7 package simd_test
8
9 import (
10 "fmt"
11 "math/rand/v2"
12 "simd"
13 "testing"
14 )
15
16 func fill(x, y []float32) {
17 for i := range x {
18 x[i] = 2*rand.Float32() - 1
19 y[i] = 2*rand.Float32() - 1
20 }
21 }
22
// checkErrors logs the mismatch count collected by a benchmark when it is
// positive. It only logs; it does not mark the benchmark as failed.
// It is registered as a test helper so the log line is attributed to the
// calling benchmark.
func checkErrors(b *testing.B, errors int) {
	b.Helper()
	if errors <= 0 {
		return
	}
	b.Logf("errors = %d", errors)
}
29
30
31 func BenchmarkIPFMA(b *testing.B) {
32 x := make([]float32, ipBenchLen)
33 y := make([]float32, ipBenchLen)
34
35 fill(x, y)
36
37 ip0, _, _ := ipFMA(x, y)
38
39 var errors int
40 for b.Loop() {
41 z, _, _ := ipFMA(x, y)
42 if z != ip0 {
43 errors++
44 }
45 }
46 checkErrors(b, errors)
47 }
48
49 func ipFMA(x, y []float32) (float32, int, bool) {
50 var a simd.Float32s
51 sumWidth := a.Len() * 32
52 emulated := simd.Emulated()
53 var i int
54 for i = 0; i < len(x)-a.Len()+1; i += a.Len() {
55 u := simd.LoadFloat32s(x[i : i+a.Len()])
56 v := simd.LoadFloat32s(y[i : i+a.Len()])
57 a = u.MulAdd(v, a)
58 }
59 if i < len(x) {
60 a = first(simd.LoadFloat32sPart(x[i:])).MulAdd(
61 first(simd.LoadFloat32sPart(y[i:])), a)
62 }
63
64 return sum(a), sumWidth, emulated
65 }
66
67 func TestIP(t *testing.T) {
68
69 var a, b [50]float32
70 for i := 0; i < 50; i++ {
71 a[i] = float32(i)
72 b[i] = float32(i)
73 }
74 x, sumWidth, emulated := ip(a[:50], b[:50])
75
76 if x != 40425 {
77 t.Errorf("Expected 40425, got %f", x)
78 }
79
80 fmt.Printf("ip: sum was computed in width %d, emulated = %v\n", sumWidth, emulated)
81 }
82
83 func TestIPGoTo(t *testing.T) {
84
85 var a, b [50]float32
86 for i := 0; i < 50; i++ {
87 a[i] = float32(i)
88 b[i] = float32(i)
89 }
90 x, sumWidth, emulated := ipGoTo(a[:50], b[:50])
91
92 if x != 40425 {
93 t.Errorf("Expected 40425, got %f", x)
94 }
95
96 fmt.Printf("ipgoto: sum was computed in width %d, emulated = %v\n", sumWidth, emulated)
97 }
98
// first returns its first argument and discards the second. It lets a
// two-result call be used directly as a single-value expression, e.g.
// first(f()) where f returns (T, U).
func first[T, U any](t T, _ U) T {
	return t
}
102
// ipBenchLen is the number of float32 elements in each of the two vectors
// used by the inner-product benchmarks in this file.
const ipBenchLen = 300_000
104
105
106 func BenchmarkIP(b *testing.B) {
107 x := make([]float32, ipBenchLen)
108 y := make([]float32, ipBenchLen)
109
110 fill(x, y)
111
112 ip0, _, _ := ip(x, y)
113
114 var errors int
115 for b.Loop() {
116 z, _, _ := ip(x, y)
117 if z != ip0 {
118 errors++
119 }
120 }
121 checkErrors(b, errors)
122 }
123
124
125 func BenchmarkIPUnroll(b *testing.B) {
126 x := make([]float32, ipBenchLen)
127 y := make([]float32, ipBenchLen)
128
129 fill(x, y)
130
131 ip0, _, _ := ipU(x, y)
132
133 var errors int
134 for b.Loop() {
135 z, _, _ := ipU(x, y)
136 if z != ip0 {
137 errors++
138 }
139 }
140 checkErrors(b, errors)
141 }
142
143
144 func BenchmarkIPUnrollMore(b *testing.B) {
145 x := make([]float32, ipBenchLen)
146 y := make([]float32, ipBenchLen)
147
148 fill(x, y)
149
150 ip0, _, _ := ipUmore(x, y)
151
152 var errors int
153 for b.Loop() {
154 z, _, _ := ipUmore(x, y)
155 if z != ip0 {
156 errors++
157 }
158 }
159 checkErrors(b, errors)
160 }
161
162
163
164
// ipNosimd is the scalar inner product: it accumulates x[i]*y[i] into a
// float32 for every index of x, in increasing index order, and returns the
// total. y is indexed by x's indices, so it must be at least as long as x.
func ipNosimd(x, y []float32) float32 {
	total := float32(0)
	for i := 0; i < len(x); i++ {
		total += x[i] * y[i]
	}
	return total
}
172
173
174 func BenchmarkIPnosimd0(b *testing.B) {
175 x := make([]float32, ipBenchLen)
176 y := make([]float32, ipBenchLen)
177
178 fill(x, y)
179
180 ip0 := ipNosimd(x, y)
181
182 var errors int
183 for b.Loop() {
184 var z float32
185 for i, a := range x {
186 z += a * y[i]
187 }
188 if z != ip0 {
189 errors++
190 }
191 }
192 checkErrors(b, errors)
193 }
194
195
196
197 func BenchmarkIPnosimd1(b *testing.B) {
198 x := make([]float32, ipBenchLen)
199 y := make([]float32, ipBenchLen)
200
201 fill(x, y)
202
203 ip0 := ipNosimd(x, y)
204
205 var errors int
206 for b.Loop() {
207 var z float32
208 yy := y[:(len(x))]
209 for i, a := range x {
210 z += a * yy[i]
211 }
212 if z != ip0 {
213 errors++
214 }
215 }
216 checkErrors(b, errors)
217 }
218
219
220
221 func BenchmarkIPnosimdA(b *testing.B) {
222 var x, y [ipBenchLen]float32
223
224 fill(x[:], y[:])
225
226 ip0 := ipNosimd(x[:], y[:])
227
228 var errors int
229 for b.Loop() {
230 var z float32
231 for i, a := range x {
232 z += a * y[i]
233 }
234 if z != ip0 {
235 errors++
236 }
237 }
238 checkErrors(b, errors)
239 }
240
241 var x, y [ipBenchLen]float32
242 var ip0 float32
243
244 func initIp0() {
245 fill(x[:], y[:])
246 ip0 = ipNosimd(x[:], y[:])
247 }
248
249
250
251
252 func BenchmarkIPnosimdAnotBloop(b *testing.B) {
253 if ip0 == 0 {
254 initIp0()
255 }
256
257 var errors int
258 for range b.N {
259 var z float32
260 for i, a := range x {
261 z += a * y[i]
262 }
263 if z != ip0 {
264 errors++
265 }
266 }
267 checkErrors(b, errors)
268 }
269
270 func ip(x, y []float32) (float32, int, bool) {
271 var a simd.Float32s
272 sumWidth := a.Len() * 32
273 emulated := simd.Emulated()
274 var i int
275 for i = 0; i < len(x)-a.Len()+1; i += a.Len() {
276 u := simd.LoadFloat32s(x[i : i+a.Len()])
277 v := simd.LoadFloat32s(y[i : i+a.Len()])
278 a = a.Add(u.Mul(v))
279 }
280 if i < len(x) {
281 a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
282 Mul(first(simd.LoadFloat32sPart(y[i:]))))
283 }
284
285 return sum(a), sumWidth, emulated
286 }
287
288 func ipU(x, y []float32) (float32, int, bool) {
289 const U = 4
290 var a, a0, a1, a2, a3 simd.Float32s
291 sumWidth := a.Len() * 32
292 emulated := simd.Emulated()
293 var i int
294 for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
295 i0 := i
296 i1 := i + a.Len()
297 i2 := i + 2*a.Len()
298 i3 := i + 3*a.Len()
299
300 u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
301 v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
302 a0 = a0.Add(u.Mul(v))
303
304 u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
305 v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
306 a1 = a1.Add(u.Mul(v))
307
308 u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
309 v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
310 a2 = a2.Add(u.Mul(v))
311
312 u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
313 v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
314 a3 = a3.Add(u.Mul(v))
315 }
316 a = a0.Add(a1).Add(a2.Add(a3))
317 for ; i < len(x)-a.Len()+1; i += a.Len() {
318 u := simd.LoadFloat32s(x[i : i+a.Len()])
319 v := simd.LoadFloat32s(y[i : i+a.Len()])
320 a = a.Add(u.Mul(v))
321 }
322 if i < len(x) {
323 a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
324 Mul(first(simd.LoadFloat32sPart(y[i:]))))
325 }
326
327 return sum(a), sumWidth, emulated
328 }
329
330 func ipUmore(x, y []float32) (float32, int, bool) {
331 const U = 5
332 var a, a0, a1, a2, a3, a4 simd.Float32s
333 sumWidth := a.Len() * 32
334 emulated := simd.Emulated()
335 var i int
336 for i = 0; i < len(x)-U*a.Len()+1; i += U * a.Len() {
337 i0 := i
338 i1 := i + a.Len()
339 i2 := i + 2*a.Len()
340 i3 := i + 3*a.Len()
341 i4 := i + 4*a.Len()
342
343 u := simd.LoadFloat32s(x[i0 : i0+a.Len()])
344 v := simd.LoadFloat32s(y[i0 : i0+a.Len()])
345 a0 = a0.Add(u.Mul(v))
346
347 u = simd.LoadFloat32s(x[i1 : i1+a.Len()])
348 v = simd.LoadFloat32s(y[i1 : i1+a.Len()])
349 a1 = a1.Add(u.Mul(v))
350
351 u = simd.LoadFloat32s(x[i2 : i2+a.Len()])
352 v = simd.LoadFloat32s(y[i2 : i2+a.Len()])
353 a2 = a2.Add(u.Mul(v))
354
355 u = simd.LoadFloat32s(x[i3 : i3+a.Len()])
356 v = simd.LoadFloat32s(y[i3 : i3+a.Len()])
357 a3 = a3.Add(u.Mul(v))
358
359 u = simd.LoadFloat32s(x[i4 : i4+a.Len()])
360 v = simd.LoadFloat32s(y[i4 : i4+a.Len()])
361 a4 = a4.Add(u.Mul(v))
362 }
363 a = a0.Add(a1).Add(a2.Add(a3)).Add(a4)
364
365 for ; i < len(x)-a.Len()+1; i += a.Len() {
366 u := simd.LoadFloat32s(x[i : i+a.Len()])
367 v := simd.LoadFloat32s(y[i : i+a.Len()])
368 a = a.Add(u.Mul(v))
369 }
370 if i < len(x) {
371 a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
372 Mul(first(simd.LoadFloat32sPart(y[i:]))))
373 }
374
375 return sum(a), sumWidth, emulated
376 }
377
378 func ipGoTo(x, y []float32) (float32, int, bool) {
379 var a simd.Float32s
380 sumWidth := a.Len() * 32
381 emulated := simd.Emulated()
382 var i int
383 var u, v simd.Float32s
384 loop:
385 if !(i < len(x)-a.Len()+1) {
386 goto done
387 }
388 u = simd.LoadFloat32s(x[i : i+a.Len()])
389 v = simd.LoadFloat32s(y[i : i+a.Len()])
390 a = a.Add(u.Mul(v))
391 i += a.Len()
392 goto loop
393 done:
394 if i < len(x) {
395 a = a.Add(first(simd.LoadFloat32sPart(x[i:])).
396 Mul(first(simd.LoadFloat32sPart(y[i:]))))
397 }
398
399 return sum(a), sumWidth, emulated
400 }
401
402 func boringSum(x simd.Float32s) float32 {
403 s := make([]float32, x.Len())
404 x.Store(s)
405 var r float32
406 for _, e := range s {
407 r += e
408 }
409 return r
410 }
411
View as plain text