1
2
3
4
5
6
7 package simd_test
8
9 import (
10 "fmt"
11 "os"
12 "simd/archsimd"
13 "slices"
14 "testing"
15 "unsafe"
16 )
17
18 func TestMain(m *testing.M) {
19 if !archsimd.X86.AVX() {
20 fmt.Fprintln(os.Stderr, "Skipping tests: AVX is not available")
21 os.Exit(0)
22 }
23 os.Exit(m.Run())
24 }
25
26 func TestPermute(t *testing.T) {
27 if !archsimd.X86.AVX512() {
28 t.Skip("Test requires X86.AVX512, not available on this hardware")
29 return
30 }
31 x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
32 indices := []uint64{7, 6, 5, 4, 3, 2, 1, 0}
33 want := []int64{8, 7, 6, 5, 4, 3, 2, 1}
34 got := make([]int64, 8)
35 archsimd.LoadInt64x8(x).Permute(archsimd.LoadUint64x8(indices)).Store(got)
36 checkSlices(t, got, want)
37 }
38
39 func TestPermuteOrZero(t *testing.T) {
40 x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
41 indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11}
42 want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12}
43 got := make([]uint8, len(x))
44 archsimd.LoadUint8x16(x).PermuteOrZero(archsimd.LoadInt8x16(indices)).Store(got)
45 checkSlices(t, got, want)
46 }
47
48 func TestConcatPermute(t *testing.T) {
49 if !archsimd.X86.AVX512() {
50 t.Skip("Test requires X86.AVX512, not available on this hardware")
51 return
52 }
53 x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
54 y := []int64{-1, -2, -3, -4, -5, -6, -7, -8}
55 indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
56 want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
57 got := make([]int64, 8)
58 archsimd.LoadInt64x8(x).ConcatPermute(archsimd.LoadInt64x8(y), archsimd.LoadUint64x8(indices)).Store(got)
59 checkSlices(t, got, want)
60 }
61
62 func TestCompress(t *testing.T) {
63 if !archsimd.X86.AVX512() {
64 t.Skip("Test requires X86.AVX512, not available on this hardware")
65 return
66 }
67 v1234 := archsimd.LoadInt32x4([]int32{1, 2, 3, 4})
68 v2400 := v1234.Compress(archsimd.Mask32x4FromBits(0b1010))
69 got := make([]int32, 4)
70 v2400.Store(got)
71 want := []int32{2, 4, 0, 0}
72 if !slices.Equal(got, want) {
73 t.Errorf("want and got differ, want=%v, got=%v", want, got)
74 }
75 }
76
77 func TestExpand(t *testing.T) {
78 if !archsimd.X86.AVX512() {
79 t.Skip("Test requires X86.AVX512, not available on this hardware")
80 return
81 }
82 v3400 := archsimd.LoadInt32x4([]int32{3, 4, 0, 0})
83 v2400 := v3400.Expand(archsimd.Mask32x4FromBits(0b1010))
84 got := make([]int32, 4)
85 v2400.Store(got)
86 want := []int32{0, 3, 0, 4}
87 if !slices.Equal(got, want) {
88 t.Errorf("want and got differ, want=%v, got=%v", want, got)
89 }
90 }
91
92 func TestSlicesInt8(t *testing.T) {
93 if !archsimd.X86.AVX2() {
94 t.Skip("Test requires X86.AVX2, not available on this hardware")
95 return
96 }
97 a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
98 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
99 v := archsimd.LoadInt8x32(a)
100 b := make([]int8, 32, 32)
101 v.Store(b)
102 checkSlices(t, a, b)
103 }
104
105 func TestSlicesInt8TooShortLoad(t *testing.T) {
106 if !archsimd.X86.AVX2() {
107 t.Skip("Test requires X86.AVX2, not available on this hardware")
108 return
109 }
110 defer func() {
111 if r := recover(); r != nil {
112 t.Logf("Saw EXPECTED panic %v", r)
113 } else {
114 t.Errorf("Did not see expected panic")
115 }
116 }()
117 a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
118 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
119 v := archsimd.LoadInt8x32(a)
120 b := make([]int8, 32, 32)
121 v.Store(b)
122 checkSlices(t, a, b)
123 }
124
125 func TestSlicesInt8TooShortStore(t *testing.T) {
126 if !archsimd.X86.AVX2() {
127 t.Skip("Test requires X86.AVX2, not available on this hardware")
128 return
129 }
130 defer func() {
131 if r := recover(); r != nil {
132 t.Logf("Saw EXPECTED panic %v", r)
133 } else {
134 t.Errorf("Did not see expected panic")
135 }
136 }()
137 a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
138 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
139 v := archsimd.LoadInt8x32(a)
140 b := make([]int8, 31)
141 v.Store(b)
142 checkSlices(t, a, b)
143 }
144
145 func TestSlicesFloat64(t *testing.T) {
146 a := []float64{1, 2, 3, 4, 5, 6, 7, 8}
147 v := archsimd.LoadFloat64x4(a)
148 b := make([]float64, 4, 4)
149 v.Store(b)
150 for i := range b {
151 if a[i] != b[i] {
152 t.Errorf("a and b differ at index %d, a=%f, b=%f", i, a[i], b[i])
153 }
154 }
155 }
156
157
158 func TestMergeLocals(t *testing.T) {
159 if !archsimd.X86.AVX2() {
160 t.Skip("Test requires X86.AVX2, not available on this hardware")
161 return
162 }
163 testMergeLocalswrapper(t, archsimd.Int64x4.Add)
164 }
165
166
// forceSpill is an intentionally empty function that callers invoke between
// creating vector values and using them. It is marked noinline so the call
// survives compilation; an inlined empty body would vanish and the call
// would no longer act as a barrier for live values.
//
//go:noinline
func forceSpill() {}
168
169 func testMergeLocalswrapper(t *testing.T, op func(archsimd.Int64x4, archsimd.Int64x4) archsimd.Int64x4) {
170 t.Helper()
171 s0 := []int64{0, 1, 2, 3}
172 s1 := []int64{-1, 0, -1, 0}
173 want := []int64{-1, 1, 1, 3}
174 v := archsimd.LoadInt64x4(s0)
175 m := archsimd.LoadInt64x4(s1)
176 forceSpill()
177 got := make([]int64, 4)
178 gotv := op(v, m)
179 gotv.Store(got)
180 for i := range len(want) {
181 if !(got[i] == want[i]) {
182 t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i])
183 }
184 }
185 }
186
187 func TestBitMaskFromBits(t *testing.T) {
188 if !archsimd.X86.AVX512() {
189 t.Skip("Test requires X86.AVX512, not available on this hardware")
190 return
191 }
192 results := [2]int64{}
193 want := [2]int64{0, 6}
194 m := archsimd.Mask64x2FromBits(0b10)
195 archsimd.LoadInt64x2([]int64{1, 2}).Add(archsimd.LoadInt64x2([]int64{3, 4})).Masked(m).StoreArray(&results)
196 for i := range 2 {
197 if results[i] != want[i] {
198 t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
199 }
200 }
201 }
202
203 var maskForTestBitMaskFromBitsLoad = uint8(0b10)
204
205 func TestBitMaskFromBitsLoad(t *testing.T) {
206 if !archsimd.X86.AVX512() {
207 t.Skip("Test requires X86.AVX512, not available on this hardware")
208 return
209 }
210 results := [2]int64{}
211 want := [2]int64{0, 6}
212 m := archsimd.Mask64x2FromBits(maskForTestBitMaskFromBitsLoad)
213 archsimd.LoadInt64x2([]int64{1, 2}).Add(archsimd.LoadInt64x2([]int64{3, 4})).Masked(m).StoreArray(&results)
214 for i := range 2 {
215 if results[i] != want[i] {
216 t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
217 }
218 }
219 }
220
221 func TestBitMaskToBits(t *testing.T) {
222 int8s := []int8{
223 0, 1, 1, 0, 0, 1, 0, 1,
224 1, 0, 1, 1, 0, 0, 1, 0,
225 1, 0, 0, 1, 1, 0, 1, 0,
226 0, 1, 1, 0, 0, 1, 0, 1,
227 1, 0, 0, 1, 0, 1, 1, 0,
228 0, 1, 0, 1, 1, 0, 0, 1,
229 1, 0, 1, 0, 0, 1, 1, 0,
230 0, 1, 1, 0, 1, 0, 0, 1,
231 }
232 int16s := make([]int16, 32)
233 for i := range int16s {
234 int16s[i] = int16(int8s[i])
235 }
236 int32s := make([]int32, 16)
237 for i := range int32s {
238 int32s[i] = int32(int8s[i])
239 }
240 int64s := make([]int64, 8)
241 for i := range int64s {
242 int64s[i] = int64(int8s[i])
243 }
244 want64 := uint64(0)
245 for i := range int8s {
246 want64 |= uint64(int8s[i]) << i
247 }
248 want32 := uint32(want64)
249 want16 := uint16(want64)
250 want8 := uint8(want64)
251 want4 := want8 & 0b1111
252 want2 := want4 & 0b11
253
254 if v := archsimd.LoadInt8x16(int8s[:16]).ToMask().ToBits(); v != want16 {
255 t.Errorf("want %b, got %b", want16, v)
256 }
257 if v := archsimd.LoadInt32x4(int32s[:4]).ToMask().ToBits(); v != want4 {
258 t.Errorf("want %b, got %b", want4, v)
259 }
260 if v := archsimd.LoadInt32x8(int32s[:8]).ToMask().ToBits(); v != want8 {
261 t.Errorf("want %b, got %b", want8, v)
262 }
263 if v := archsimd.LoadInt64x2(int64s[:2]).ToMask().ToBits(); v != want2 {
264 t.Errorf("want %b, got %b", want2, v)
265 }
266 if v := archsimd.LoadInt64x4(int64s[:4]).ToMask().ToBits(); v != want4 {
267 t.Errorf("want %b, got %b", want4, v)
268 }
269
270 if archsimd.X86.AVX2() {
271 if v := archsimd.LoadInt8x32(int8s[:32]).ToMask().ToBits(); v != want32 {
272 t.Errorf("want %b, got %b", want32, v)
273 }
274 }
275
276 if archsimd.X86.AVX512() {
277 if v := archsimd.LoadInt8x64(int8s).ToMask().ToBits(); v != want64 {
278 t.Errorf("want %b, got %b", want64, v)
279 }
280 if v := archsimd.LoadInt16x8(int16s[:8]).ToMask().ToBits(); v != want8 {
281 t.Errorf("want %b, got %b", want8, v)
282 }
283 if v := archsimd.LoadInt16x16(int16s[:16]).ToMask().ToBits(); v != want16 {
284 t.Errorf("want %b, got %b", want16, v)
285 }
286 if v := archsimd.LoadInt16x32(int16s).ToMask().ToBits(); v != want32 {
287 t.Errorf("want %b, got %b", want32, v)
288 }
289 if v := archsimd.LoadInt32x16(int32s).ToMask().ToBits(); v != want16 {
290 t.Errorf("want %b, got %b", want16, v)
291 }
292 if v := archsimd.LoadInt64x8(int64s).ToMask().ToBits(); v != want8 {
293 t.Errorf("want %b, got %b", want8, v)
294 }
295 }
296 }
297
298 var maskForTestBitMaskFromBitsStore uint8
299
300 func TestBitMaskToBitsStore(t *testing.T) {
301 if !archsimd.X86.AVX512() {
302 t.Skip("Test requires X86.AVX512, not available on this hardware")
303 return
304 }
305 maskForTestBitMaskFromBitsStore = archsimd.LoadInt16x8([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits()
306 if maskForTestBitMaskFromBitsStore != 0b101 {
307 t.Errorf("Want 0b101, got %b", maskForTestBitMaskFromBitsStore)
308 }
309 }
310
311 func TestMergeFloat(t *testing.T) {
312 if !archsimd.X86.AVX2() {
313 t.Skip("Test requires X86.AVX2, not available on this hardware")
314 return
315 }
316 k := make([]int64, 4, 4)
317 s := make([]float64, 4, 4)
318
319 a := archsimd.LoadFloat64x4([]float64{1, 2, 3, 4})
320 b := archsimd.LoadFloat64x4([]float64{4, 2, 3, 1})
321 g := a.Greater(b)
322 g.ToInt64x4().Store(k)
323 c := a.Merge(b, g)
324
325 c.Store(s)
326
327 checkSlices[int64](t, k, []int64{0, 0, 0, -1})
328 checkSlices[float64](t, s, []float64{4, 2, 3, 4})
329 }
330
331 func TestIfElseFloat(t *testing.T) {
332 if !archsimd.X86.AVX2() {
333 t.Skip("Test requires X86.AVX2, not available on this hardware")
334 return
335 }
336 k := make([]int64, 4, 4)
337 s := make([]float64, 4, 4)
338
339 a := archsimd.LoadFloat64x4([]float64{1, 2, 3, 4})
340 b := archsimd.LoadFloat64x4([]float64{4, 2, 3, 1})
341 g := a.Greater(b)
342 g.ToInt64x4().Store(k)
343 c := a.IfElse(g, b)
344
345 c.Store(s)
346
347 checkSlices[int64](t, k, []int64{0, 0, 0, -1})
348 checkSlices[float64](t, s, []float64{4, 2, 3, 4})
349 }
350
351 func TestMergeFloat512(t *testing.T) {
352 if !archsimd.X86.AVX512() {
353 t.Skip("Test requires X86.AVX512, not available on this hardware")
354 return
355 }
356
357 k := make([]int64, 8, 8)
358 s := make([]float64, 8, 8)
359
360 a := archsimd.LoadFloat64x8([]float64{1, 2, 3, 4, 5, 6, 7, 8})
361 b := archsimd.LoadFloat64x8([]float64{8, 7, 6, 5, 4, 2, 3, 1})
362 g := a.Greater(b)
363 g.ToInt64x8().Store(k)
364 c := a.Merge(b, g)
365 d := a.Masked(g)
366
367 checkSlices[int64](t, k, []int64{0, 0, 0, 0, -1, -1, -1, -1})
368
369 c.Store(s)
370 checkSlices[float64](t, s, []float64{8, 7, 6, 5, 5, 6, 7, 8})
371
372 d.Store(s)
373 checkSlices[float64](t, s, []float64{0, 0, 0, 0, 5, 6, 7, 8})
374 }
375
376 func TestIfElseFloat512(t *testing.T) {
377 if !archsimd.X86.AVX512() {
378 t.Skip("Test requires X86.AVX512, not available on this hardware")
379 return
380 }
381
382 k := make([]int64, 8, 8)
383 s := make([]float64, 8, 8)
384
385 a := archsimd.LoadFloat64x8([]float64{1, 2, 3, 4, 5, 6, 7, 8})
386 b := archsimd.LoadFloat64x8([]float64{8, 7, 6, 5, 4, 2, 3, 1})
387 g := a.Greater(b)
388 g.ToInt64x8().Store(k)
389 c := a.IfElse(g, b)
390 d := a.Masked(g)
391
392 checkSlices[int64](t, k, []int64{0, 0, 0, 0, -1, -1, -1, -1})
393
394 c.Store(s)
395 checkSlices[float64](t, s, []float64{8, 7, 6, 5, 5, 6, 7, 8})
396
397 d.Store(s)
398 checkSlices[float64](t, s, []float64{0, 0, 0, 0, 5, 6, 7, 8})
399 }
400
401 var ro uint64 = 2
402 var roBig uint64 = 1024 + 2
403
404 func TestRotateAllVariable(t *testing.T) {
405 got := make([]int32, 4)
406 archsimd.LoadInt32x4([]int32{0b11, 0b11, 0b11, 0b11}).RotateAllLeft(ro).Store(got)
407 for _, v := range got {
408 if v != 0b1100 {
409 t.Errorf("Want 0b1100, got %b", v)
410 }
411 }
412 archsimd.LoadInt32x4([]int32{0b11, 0b11, 0b11, 0b11}).RotateAllLeft(roBig).Store(got)
413 for _, v := range got {
414 if v != 0b1100 {
415 t.Errorf("Want 0b1100, got %b", v)
416 }
417 }
418 }
419
420 func TestRotateAllConst(t *testing.T) {
421 got := make([]int32, 4)
422 archsimd.LoadInt32x4([]int32{0b11, 0b11, 0b11, 0b11}).RotateAllLeft(2).Store(got)
423 for _, v := range got {
424 if v != 0b1100 {
425 t.Errorf("Want 0b1100, got %b", v)
426 }
427 }
428 }
429
430 func TestBroadcastFloat32x8(t *testing.T) {
431 s := make([]float32, 8, 8)
432 archsimd.BroadcastFloat32x8(123456789).Store(s)
433 checkSlices(t, s, []float32{123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789})
434 }
435
436 func TestBroadcastInt8x32(t *testing.T) {
437 if !archsimd.X86.AVX2() {
438 t.Skip("Test requires X86.AVX2, not available on this hardware")
439 return
440 }
441 s := make([]int8, 32, 32)
442 archsimd.BroadcastInt8x32(-123).Store(s)
443 checkSlices(t, s, []int8{-123, -123, -123, -123, -123, -123, -123, -123,
444 -123, -123, -123, -123, -123, -123, -123, -123,
445 -123, -123, -123, -123, -123, -123, -123, -123,
446 -123, -123, -123, -123, -123, -123, -123, -123,
447 })
448 }
449
450 func TestMaskOpt512(t *testing.T) {
451 if !archsimd.X86.AVX512() {
452 t.Skip("Test requires X86.AVX512, not available on this hardware")
453 return
454 }
455
456 k := make([]int64, 8, 8)
457 s := make([]float64, 8, 8)
458
459 a := archsimd.LoadFloat64x8([]float64{2, 0, 2, 0, 2, 0, 2, 0})
460 b := archsimd.LoadFloat64x8([]float64{1, 1, 1, 1, 1, 1, 1, 1})
461 c := archsimd.LoadFloat64x8([]float64{1, 2, 3, 4, 5, 6, 7, 8})
462 d := archsimd.LoadFloat64x8([]float64{2, 4, 6, 8, 10, 12, 14, 16})
463 g := a.Greater(b)
464 e := c.Add(d).Masked(g)
465 e.Store(s)
466 g.ToInt64x8().Store(k)
467 checkSlices[int64](t, k, []int64{-1, 0, -1, 0, -1, 0, -1, 0})
468 checkSlices[float64](t, s, []float64{3, 0, 9, 0, 15, 0, 21, 0})
469 }
470
471
472
473
474
475 func flattenedTranspose(x, y archsimd.Int32x4) (a, b archsimd.Int32x4) {
476 return x.InterleaveLo(y), x.InterleaveHi(y)
477 }
478
479 func TestFlattenedTranspose(t *testing.T) {
480 r := make([]int32, 4, 4)
481 s := make([]int32, 4, 4)
482
483 x := archsimd.LoadInt32x4([]int32{0xA, 0xB, 0xC, 0xD})
484 y := archsimd.LoadInt32x4([]int32{1, 2, 3, 4})
485 a, b := flattenedTranspose(x, y)
486
487 a.Store(r)
488 b.Store(s)
489
490 checkSlices[int32](t, r, []int32{0xA, 1, 0xB, 2})
491 checkSlices[int32](t, s, []int32{0xC, 3, 0xD, 4})
492
493 }
494
495 func TestClearAVXUpperBits(t *testing.T) {
496
497
498 if !archsimd.X86.AVX2() {
499 t.Skip("Test requires X86.AVX2, not available on this hardware")
500 return
501 }
502
503 r := make([]int64, 4)
504 s := make([]int64, 4)
505
506 x := archsimd.LoadInt64x4([]int64{10, 20, 30, 40})
507 y := archsimd.LoadInt64x4([]int64{1, 2, 3, 4})
508
509 x.Add(y).Store(r)
510 archsimd.ClearAVXUpperBits()
511 x.Sub(y).Store(s)
512
513 checkSlices[int64](t, r, []int64{11, 22, 33, 44})
514 checkSlices[int64](t, s, []int64{9, 18, 27, 36})
515 }
516
517 func TestLeadingZeros(t *testing.T) {
518 if !archsimd.X86.AVX512() {
519 t.Skip("Test requires X86.AVX512, not available on this hardware")
520 return
521 }
522
523 src := []uint64{0b1111, 0}
524 want := []uint64{60, 64}
525 got := make([]uint64, 2)
526 archsimd.LoadUint64x2(src).LeadingZeros().Store(got)
527 for i := range 2 {
528 if want[i] != got[i] {
529 t.Errorf("Result incorrect at %d: want %d, got %d", i, want[i], got[i])
530 }
531 }
532 }
533
534 func TestIsZero(t *testing.T) {
535 v1 := archsimd.LoadUint64x2([]uint64{0, 1})
536 v2 := archsimd.LoadUint64x2([]uint64{0, 0})
537 if v1.IsZero() {
538 t.Errorf("Result incorrect, want false, got true")
539 }
540 if !v2.IsZero() {
541 t.Errorf("Result incorrect, want true, got false")
542 }
543 if !v1.And(v2).IsZero() {
544 t.Errorf("Result incorrect, want true, got false")
545 }
546 if v1.AndNot(v2).IsZero() {
547 t.Errorf("Result incorrect, want false, got true")
548 }
549 if !v2.And(v1).IsZero() {
550 t.Errorf("Result incorrect, want true, got false")
551 }
552 if !v2.AndNot(v1).IsZero() {
553 t.Errorf("Result incorrect, want true, got false")
554 }
555 }
556
557 func TestSelect4FromPairConst(t *testing.T) {
558 x := archsimd.LoadInt32x4([]int32{0, 1, 2, 3})
559 y := archsimd.LoadInt32x4([]int32{4, 5, 6, 7})
560
561 llll := x.ConcatPermuteScalars(0, 1, 2, 3, y)
562 hhhh := x.ConcatPermuteScalars(4, 5, 6, 7, y)
563 llhh := x.ConcatPermuteScalars(0, 1, 6, 7, y)
564 hhll := x.ConcatPermuteScalars(6, 7, 0, 1, y)
565
566 lllh := x.ConcatPermuteScalars(0, 1, 2, 7, y)
567 llhl := x.ConcatPermuteScalars(0, 1, 7, 2, y)
568 lhll := x.ConcatPermuteScalars(0, 7, 1, 2, y)
569 hlll := x.ConcatPermuteScalars(7, 0, 1, 2, y)
570
571 hhhl := x.ConcatPermuteScalars(4, 5, 6, 0, y)
572 hhlh := x.ConcatPermuteScalars(4, 5, 0, 6, y)
573 hlhh := x.ConcatPermuteScalars(4, 0, 5, 6, y)
574 lhhh := x.ConcatPermuteScalars(0, 4, 5, 6, y)
575
576 lhlh := x.ConcatPermuteScalars(0, 4, 1, 5, y)
577 hlhl := x.ConcatPermuteScalars(4, 0, 5, 1, y)
578 lhhl := x.ConcatPermuteScalars(0, 4, 5, 1, y)
579 hllh := x.ConcatPermuteScalars(4, 0, 1, 5, y)
580
581 r := make([]int32, 4, 4)
582
583 foo := func(v archsimd.Int32x4, a, b, c, d int32) {
584 v.Store(r)
585 checkSlices[int32](t, r, []int32{a, b, c, d})
586 }
587
588 foo(llll, 0, 1, 2, 3)
589 foo(hhhh, 4, 5, 6, 7)
590 foo(llhh, 0, 1, 6, 7)
591 foo(hhll, 6, 7, 0, 1)
592
593 foo(lllh, 0, 1, 2, 7)
594 foo(llhl, 0, 1, 7, 2)
595 foo(lhll, 0, 7, 1, 2)
596 foo(hlll, 7, 0, 1, 2)
597
598 foo(hhhl, 4, 5, 6, 0)
599 foo(hhlh, 4, 5, 0, 6)
600 foo(hlhh, 4, 0, 5, 6)
601 foo(lhhh, 0, 4, 5, 6)
602
603 foo(lhlh, 0, 4, 1, 5)
604 foo(hlhl, 4, 0, 5, 1)
605 foo(lhhl, 0, 4, 5, 1)
606 foo(hllh, 4, 0, 1, 5)
607 }
608
609
610 func selectFromPairInt32x4(x archsimd.Int32x4, a, b, c, d uint8, y archsimd.Int32x4) archsimd.Int32x4 {
611 return x.ConcatPermuteScalars(a, b, c, d, y)
612 }
613
614 func TestSelect4FromPairVar(t *testing.T) {
615 x := archsimd.LoadInt32x4([]int32{0, 1, 2, 3})
616 y := archsimd.LoadInt32x4([]int32{4, 5, 6, 7})
617
618 llll := selectFromPairInt32x4(x, 0, 1, 2, 3, y)
619 hhhh := selectFromPairInt32x4(x, 4, 5, 6, 7, y)
620 llhh := selectFromPairInt32x4(x, 0, 1, 6, 7, y)
621 hhll := selectFromPairInt32x4(x, 6, 7, 0, 1, y)
622
623 lllh := selectFromPairInt32x4(x, 0, 1, 2, 7, y)
624 llhl := selectFromPairInt32x4(x, 0, 1, 7, 2, y)
625 lhll := selectFromPairInt32x4(x, 0, 7, 1, 2, y)
626 hlll := selectFromPairInt32x4(x, 7, 0, 1, 2, y)
627
628 hhhl := selectFromPairInt32x4(x, 4, 5, 6, 0, y)
629 hhlh := selectFromPairInt32x4(x, 4, 5, 0, 6, y)
630 hlhh := selectFromPairInt32x4(x, 4, 0, 5, 6, y)
631 lhhh := selectFromPairInt32x4(x, 0, 4, 5, 6, y)
632
633 lhlh := selectFromPairInt32x4(x, 0, 4, 1, 5, y)
634 hlhl := selectFromPairInt32x4(x, 4, 0, 5, 1, y)
635 lhhl := selectFromPairInt32x4(x, 0, 4, 5, 1, y)
636 hllh := selectFromPairInt32x4(x, 4, 0, 1, 5, y)
637
638 r := make([]int32, 4, 4)
639
640 foo := func(v archsimd.Int32x4, a, b, c, d int32) {
641 v.Store(r)
642 checkSlices[int32](t, r, []int32{a, b, c, d})
643 }
644
645 foo(llll, 0, 1, 2, 3)
646 foo(hhhh, 4, 5, 6, 7)
647 foo(llhh, 0, 1, 6, 7)
648 foo(hhll, 6, 7, 0, 1)
649
650 foo(lllh, 0, 1, 2, 7)
651 foo(llhl, 0, 1, 7, 2)
652 foo(lhll, 0, 7, 1, 2)
653 foo(hlll, 7, 0, 1, 2)
654
655 foo(hhhl, 4, 5, 6, 0)
656 foo(hhlh, 4, 5, 0, 6)
657 foo(hlhh, 4, 0, 5, 6)
658 foo(lhhh, 0, 4, 5, 6)
659
660 foo(lhlh, 0, 4, 1, 5)
661 foo(hlhl, 4, 0, 5, 1)
662 foo(lhhl, 0, 4, 5, 1)
663 foo(hllh, 4, 0, 1, 5)
664 }
665
666 func TestSelect4FromPairConstGrouped(t *testing.T) {
667 x := archsimd.LoadFloat32x8([]float32{0, 1, 2, 3, 10, 11, 12, 13})
668 y := archsimd.LoadFloat32x8([]float32{4, 5, 6, 7, 14, 15, 16, 17})
669
670 llll := x.ConcatPermuteScalarsGrouped(0, 1, 2, 3, y)
671 hhhh := x.ConcatPermuteScalarsGrouped(4, 5, 6, 7, y)
672 llhh := x.ConcatPermuteScalarsGrouped(0, 1, 6, 7, y)
673 hhll := x.ConcatPermuteScalarsGrouped(6, 7, 0, 1, y)
674
675 lllh := x.ConcatPermuteScalarsGrouped(0, 1, 2, 7, y)
676 llhl := x.ConcatPermuteScalarsGrouped(0, 1, 7, 2, y)
677 lhll := x.ConcatPermuteScalarsGrouped(0, 7, 1, 2, y)
678 hlll := x.ConcatPermuteScalarsGrouped(7, 0, 1, 2, y)
679
680 hhhl := x.ConcatPermuteScalarsGrouped(4, 5, 6, 0, y)
681 hhlh := x.ConcatPermuteScalarsGrouped(4, 5, 0, 6, y)
682 hlhh := x.ConcatPermuteScalarsGrouped(4, 0, 5, 6, y)
683 lhhh := x.ConcatPermuteScalarsGrouped(0, 4, 5, 6, y)
684
685 lhlh := x.ConcatPermuteScalarsGrouped(0, 4, 1, 5, y)
686 hlhl := x.ConcatPermuteScalarsGrouped(4, 0, 5, 1, y)
687 lhhl := x.ConcatPermuteScalarsGrouped(0, 4, 5, 1, y)
688 hllh := x.ConcatPermuteScalarsGrouped(4, 0, 1, 5, y)
689
690 r := make([]float32, 8, 8)
691
692 foo := func(v archsimd.Float32x8, a, b, c, d float32) {
693 v.Store(r)
694 checkSlices[float32](t, r, []float32{a, b, c, d, 10 + a, 10 + b, 10 + c, 10 + d})
695 }
696
697 foo(llll, 0, 1, 2, 3)
698 foo(hhhh, 4, 5, 6, 7)
699 foo(llhh, 0, 1, 6, 7)
700 foo(hhll, 6, 7, 0, 1)
701
702 foo(lllh, 0, 1, 2, 7)
703 foo(llhl, 0, 1, 7, 2)
704 foo(lhll, 0, 7, 1, 2)
705 foo(hlll, 7, 0, 1, 2)
706
707 foo(hhhl, 4, 5, 6, 0)
708 foo(hhlh, 4, 5, 0, 6)
709 foo(hlhh, 4, 0, 5, 6)
710 foo(lhhh, 0, 4, 5, 6)
711
712 foo(lhlh, 0, 4, 1, 5)
713 foo(hlhl, 4, 0, 5, 1)
714 foo(lhhl, 0, 4, 5, 1)
715 foo(hllh, 4, 0, 1, 5)
716 }
717
718 func TestConcatPermuteScalarsConstGroupedUint32x16(t *testing.T) {
719 if !archsimd.X86.AVX512() {
720 t.Skip("Test requires X86.AVX512, not available on this hardware")
721 return
722 }
723 x := archsimd.LoadUint32x16([]uint32{0, 1, 2, 3, 10, 11, 12, 13, 20, 21, 22, 23, 30, 31, 32, 33})
724 y := archsimd.LoadUint32x16([]uint32{4, 5, 6, 7, 14, 15, 16, 17, 24, 25, 26, 27, 34, 35, 36, 37})
725
726 llll := x.ConcatPermuteScalarsGrouped(0, 1, 2, 3, y)
727 hhhh := x.ConcatPermuteScalarsGrouped(4, 5, 6, 7, y)
728 llhh := x.ConcatPermuteScalarsGrouped(0, 1, 6, 7, y)
729 hhll := x.ConcatPermuteScalarsGrouped(6, 7, 0, 1, y)
730
731 lllh := x.ConcatPermuteScalarsGrouped(0, 1, 2, 7, y)
732 llhl := x.ConcatPermuteScalarsGrouped(0, 1, 7, 2, y)
733 lhll := x.ConcatPermuteScalarsGrouped(0, 7, 1, 2, y)
734 hlll := x.ConcatPermuteScalarsGrouped(7, 0, 1, 2, y)
735
736 hhhl := x.ConcatPermuteScalarsGrouped(4, 5, 6, 0, y)
737 hhlh := x.ConcatPermuteScalarsGrouped(4, 5, 0, 6, y)
738 hlhh := x.ConcatPermuteScalarsGrouped(4, 0, 5, 6, y)
739 lhhh := x.ConcatPermuteScalarsGrouped(0, 4, 5, 6, y)
740
741 lhlh := x.ConcatPermuteScalarsGrouped(0, 4, 1, 5, y)
742 hlhl := x.ConcatPermuteScalarsGrouped(4, 0, 5, 1, y)
743 lhhl := x.ConcatPermuteScalarsGrouped(0, 4, 5, 1, y)
744 hllh := x.ConcatPermuteScalarsGrouped(4, 0, 1, 5, y)
745
746 r := make([]uint32, 16, 16)
747
748 foo := func(v archsimd.Uint32x16, a, b, c, d uint32) {
749 v.Store(r)
750 checkSlices[uint32](t, r, []uint32{a, b, c, d,
751 10 + a, 10 + b, 10 + c, 10 + d,
752 20 + a, 20 + b, 20 + c, 20 + d,
753 30 + a, 30 + b, 30 + c, 30 + d,
754 })
755 }
756
757 foo(llll, 0, 1, 2, 3)
758 foo(hhhh, 4, 5, 6, 7)
759 foo(llhh, 0, 1, 6, 7)
760 foo(hhll, 6, 7, 0, 1)
761
762 foo(lllh, 0, 1, 2, 7)
763 foo(llhl, 0, 1, 7, 2)
764 foo(lhll, 0, 7, 1, 2)
765 foo(hlll, 7, 0, 1, 2)
766
767 foo(hhhl, 4, 5, 6, 0)
768 foo(hhlh, 4, 5, 0, 6)
769 foo(hlhh, 4, 0, 5, 6)
770 foo(lhhh, 0, 4, 5, 6)
771
772 foo(lhlh, 0, 4, 1, 5)
773 foo(hlhl, 4, 0, 5, 1)
774 foo(lhhl, 0, 4, 5, 1)
775 foo(hllh, 4, 0, 1, 5)
776 }
777
778 func TestConcatPermute128Scalars(t *testing.T) {
779 x := archsimd.LoadUint64x4([]uint64{0, 1, 2, 3})
780 y := archsimd.LoadUint64x4([]uint64{4, 5, 6, 7})
781
782 aa := x.ConcatPermute128Scalars(0, 0, y)
783 ab := x.ConcatPermute128Scalars(0, 1, y)
784 bc := x.ConcatPermute128Scalars(1, 2, y)
785 cd := x.ConcatPermute128Scalars(2, 3, y)
786 da := x.ConcatPermute128Scalars(3, 0, y)
787 dc := x.ConcatPermute128Scalars(3, 2, y)
788
789 r := make([]uint64, 4, 4)
790
791 foo := func(v archsimd.Uint64x4, a, b uint64) {
792 a, b = 2*a, 2*b
793 v.Store(r)
794 checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
795 }
796
797 foo(aa, 0, 0)
798 foo(ab, 0, 1)
799 foo(bc, 1, 2)
800 foo(cd, 2, 3)
801 foo(da, 3, 0)
802 foo(dc, 3, 2)
803 }
804
805 func TestConcatPermute128ScalarsError(t *testing.T) {
806 x := archsimd.LoadUint64x4([]uint64{0, 1, 2, 3})
807 y := archsimd.LoadUint64x4([]uint64{4, 5, 6, 7})
808
809 defer func() {
810 if r := recover(); r != nil {
811 t.Logf("Saw expected panic %v", r)
812 }
813 }()
814 _ = x.ConcatPermute128Scalars(0, 4, y)
815
816 t.Errorf("Should have panicked")
817 }
818
819
820 func select128FromPair(x archsimd.Uint64x4, lo, hi uint8, y archsimd.Uint64x4) archsimd.Uint64x4 {
821 return x.ConcatPermute128Scalars(lo, hi, y)
822 }
823
824 func TestConcatPermute128ScalarsVar(t *testing.T) {
825 x := archsimd.LoadUint64x4([]uint64{0, 1, 2, 3})
826 y := archsimd.LoadUint64x4([]uint64{4, 5, 6, 7})
827
828 aa := select128FromPair(x, 0, 0, y)
829 ab := select128FromPair(x, 0, 1, y)
830 bc := select128FromPair(x, 1, 2, y)
831 cd := select128FromPair(x, 2, 3, y)
832 da := select128FromPair(x, 3, 0, y)
833 dc := select128FromPair(x, 3, 2, y)
834
835 r := make([]uint64, 4, 4)
836
837 foo := func(v archsimd.Uint64x4, a, b uint64) {
838 a, b = 2*a, 2*b
839 v.Store(r)
840 checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
841 }
842
843 foo(aa, 0, 0)
844 foo(ab, 0, 1)
845 foo(bc, 1, 2)
846 foo(cd, 2, 3)
847 foo(da, 3, 0)
848 foo(dc, 3, 2)
849 }
850
851 func TestSelect2FromPairConst(t *testing.T) {
852 x := archsimd.LoadUint64x2([]uint64{0, 1})
853 y := archsimd.LoadUint64x2([]uint64{2, 3})
854
855 ll := x.ConcatPermuteScalars(0, 1, y)
856 hh := x.ConcatPermuteScalars(3, 2, y)
857 lh := x.ConcatPermuteScalars(0, 3, y)
858 hl := x.ConcatPermuteScalars(2, 1, y)
859
860 r := make([]uint64, 2, 2)
861
862 foo := func(v archsimd.Uint64x2, a, b uint64) {
863 v.Store(r)
864 checkSlices[uint64](t, r, []uint64{a, b})
865 }
866
867 foo(ll, 0, 1)
868 foo(hh, 3, 2)
869 foo(lh, 0, 3)
870 foo(hl, 2, 1)
871 }
872
873 func TestSelect2FromPairConstGroupedUint(t *testing.T) {
874 x := archsimd.LoadUint64x4([]uint64{0, 1, 10, 11})
875 y := archsimd.LoadUint64x4([]uint64{2, 3, 12, 13})
876
877 ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
878 hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
879 lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
880 hl := x.ConcatPermuteScalarsGrouped(2, 1, y)
881
882 r := make([]uint64, 4, 4)
883
884 foo := func(v archsimd.Uint64x4, a, b uint64) {
885 v.Store(r)
886 checkSlices[uint64](t, r, []uint64{a, b, a + 10, b + 10})
887 }
888
889 foo(ll, 0, 1)
890 foo(hh, 3, 2)
891 foo(lh, 0, 3)
892 foo(hl, 2, 1)
893 }
894
895 func TestSelect2FromPairConstGroupedFloat(t *testing.T) {
896 x := archsimd.LoadFloat64x4([]float64{0, 1, 10, 11})
897 y := archsimd.LoadFloat64x4([]float64{2, 3, 12, 13})
898
899 ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
900 hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
901 lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
902 hl := x.ConcatPermuteScalarsGrouped(2, 1, y)
903
904 r := make([]float64, 4, 4)
905
906 foo := func(v archsimd.Float64x4, a, b float64) {
907 v.Store(r)
908 checkSlices[float64](t, r, []float64{a, b, a + 10, b + 10})
909 }
910
911 foo(ll, 0, 1)
912 foo(hh, 3, 2)
913 foo(lh, 0, 3)
914 foo(hl, 2, 1)
915 }
916
917 func TestSelect2FromPairConstGroupedInt(t *testing.T) {
918 x := archsimd.LoadInt64x4([]int64{0, 1, 10, 11})
919 y := archsimd.LoadInt64x4([]int64{2, 3, 12, 13})
920
921 ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
922 hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
923 lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
924 hl := x.ConcatPermuteScalarsGrouped(2, 1, y)
925
926 r := make([]int64, 4, 4)
927
928 foo := func(v archsimd.Int64x4, a, b int64) {
929 v.Store(r)
930 checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10})
931 }
932
933 foo(ll, 0, 1)
934 foo(hh, 3, 2)
935 foo(lh, 0, 3)
936 foo(hl, 2, 1)
937 }
938
939 func TestSelect2FromPairConstGroupedInt512(t *testing.T) {
940 if !archsimd.X86.AVX512() {
941 t.Skip("Test requires X86.AVX512, not available on this hardware")
942 return
943 }
944
945 x := archsimd.LoadInt64x8([]int64{0, 1, 10, 11, 20, 21, 30, 31})
946 y := archsimd.LoadInt64x8([]int64{2, 3, 12, 13, 22, 23, 32, 33})
947
948 ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
949 hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
950 lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
951 hl := x.ConcatPermuteScalarsGrouped(2, 1, y)
952
953 r := make([]int64, 8, 8)
954
955 foo := func(v archsimd.Int64x8, a, b int64) {
956 v.Store(r)
957 checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10, a + 20, b + 20, a + 30, b + 30})
958 }
959
960 foo(ll, 0, 1)
961 foo(hh, 3, 2)
962 foo(lh, 0, 3)
963 foo(hl, 2, 1)
964 }
965
966 func TestStringAMD64(t *testing.T) {
967 x := archsimd.LoadUint32x4([]uint32{0, 1, 2, 3})
968 y := archsimd.LoadInt64x4([]int64{-4, -5, -6, -7})
969 z := archsimd.LoadFloat32x4([]float32{0.5, 1.5, -2.5, 3.5e9})
970 w := archsimd.LoadFloat64x4([]float64{0.5, 1.5, -2.5, 3.5e9})
971
972 sx := "{0,1,2,3}"
973 sy := "{-4,-5,-6,-7}"
974 sz := "{0.5,1.5,-2.5,3.5e+09}"
975 sw := sz
976
977 if x.String() != sx {
978 t.Errorf("x=%s wanted %s", x, sx)
979 }
980 if y.String() != sy {
981 t.Errorf("y=%s wanted %s", y, sy)
982 }
983 if z.String() != sz {
984 t.Errorf("z=%s wanted %s", z, sz)
985 }
986 if w.String() != sw {
987 t.Errorf("w=%s wanted %s", w, sw)
988 }
989 t.Logf("w=%s", w)
990 t.Logf("x=%s", x)
991 t.Logf("y=%s", y)
992 t.Logf("z=%s", z)
993 }
994
995 func TestMaskString(t *testing.T) {
996 x := archsimd.LoadUint32x4([]uint32{0, 1, 2, 3})
997 var y archsimd.Uint32x4
998
999 m := x.Equal(y)
1000
1001 w := "{1,0,0,0}"
1002
1003 if g := m.String(); g != w {
1004 t.Errorf("got=%s wanted %s", g, w)
1005 }
1006 }
1007
1008
// a returns a fresh zeroed []int32 with length and capacity 16.
func a() []int32 {
	const lanes = 16
	return make([]int32, lanes)
}
1012
1013
1014
1015 func applyTo3(x, y, z archsimd.Int32x16, f func(x, y, z int32) int32) []int32 {
1016 ax, ay, az := a(), a(), a()
1017 x.Store(ax)
1018 y.Store(ay)
1019 z.Store(az)
1020
1021 r := a()
1022 for i := range r {
1023 r[i] = f(ax[i], ay[i], az[i])
1024 }
1025 return r
1026 }
1027
1028
1029
1030 func applyTo4(x, y, z, w archsimd.Int32x16, f func(x, y, z, w int32) int32) []int32 {
1031 ax, ay, az, aw := a(), a(), a(), a()
1032 x.Store(ax)
1033 y.Store(ay)
1034 z.Store(az)
1035 w.Store(aw)
1036
1037 r := make([]int32, len(ax), len(ax))
1038 for i := range r {
1039 r[i] = f(ax[i], ay[i], az[i], aw[i])
1040 }
1041 return r
1042 }
1043
1044 func TestSelectTernOptInt32x16(t *testing.T) {
1045 if !archsimd.X86.AVX512() {
1046 t.Skip("Test requires X86.AVX512, not available on this hardware")
1047 return
1048 }
1049 ax := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
1050 ay := []int32{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}
1051 az := []int32{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}
1052 aw := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
1053 am := []int32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
1054
1055 x := archsimd.LoadInt32x16(ax)
1056 y := archsimd.LoadInt32x16(ay)
1057 z := archsimd.LoadInt32x16(az)
1058 w := archsimd.LoadInt32x16(aw)
1059 m := archsimd.LoadInt32x16(am)
1060
1061 foo := func(v archsimd.Int32x16, s []int32) {
1062 r := make([]int32, 16, 16)
1063 v.Store(r)
1064 checkSlices[int32](t, r, s)
1065 }
1066
1067 t0 := w.Xor(y).Xor(z)
1068 ft0 := func(w, y, z int32) int32 {
1069 return w ^ y ^ z
1070 }
1071 foo(t0, applyTo3(w, y, z, ft0))
1072
1073 t1 := m.And(w.Xor(y).Xor(z.Not()))
1074 ft1 := func(m, w, y, z int32) int32 {
1075 return m & (w ^ y ^ ^z)
1076 }
1077 foo(t1, applyTo4(m, w, y, z, ft1))
1078
1079 t2 := x.Xor(y).Xor(z).And(x.Xor(y).Xor(z.Not()))
1080 ft2 := func(x, y, z int32) int32 {
1081 return (x ^ y ^ z) & (x ^ y ^ ^z)
1082 }
1083 foo(t2, applyTo3(x, y, z, ft2))
1084 }
1085
1086 func TestMaskedMerge(t *testing.T) {
1087 if !archsimd.X86.AVX2() {
1088 t.Skip("Test requires X86.AVX2, not available on this hardware")
1089 return
1090 }
1091 x := archsimd.LoadInt64x4([]int64{1, 2, 3, 4})
1092 y := archsimd.LoadInt64x4([]int64{5, 6, 1, 1})
1093 z := archsimd.LoadInt64x4([]int64{-1, -2, -3, -4})
1094 res := make([]int64, 4)
1095 expected := []int64{6, 8, -3, -4}
1096 mask := x.Less(y)
1097 if archsimd.X86.AVX512() {
1098 x.Add(y).Merge(z, mask).Store(res)
1099 } else {
1100 x.Add(y).Merge(z, mask).Store(res)
1101 }
1102 for i := range 4 {
1103 if res[i] != expected[i] {
1104 t.Errorf("got %d wanted %d", res[i], expected[i])
1105 }
1106 }
1107 }
1108
1109 func TestMaskedIfElse(t *testing.T) {
1110 if !archsimd.X86.AVX2() {
1111 t.Skip("Test requires X86.AVX2, not available on this hardware")
1112 return
1113 }
1114 x := archsimd.LoadInt64x4([]int64{1, 2, 3, 4})
1115 y := archsimd.LoadInt64x4([]int64{5, 6, 1, 1})
1116 z := archsimd.LoadInt64x4([]int64{-1, -2, -3, -4})
1117 res := make([]int64, 4)
1118 expected := []int64{6, 8, -3, -4}
1119 mask := x.Less(y)
1120 if archsimd.X86.AVX512() {
1121 x.Add(y).IfElse(mask, z).Store(res)
1122 } else {
1123 x.Add(y).IfElse(mask, z).Store(res)
1124 }
1125 for i := range 4 {
1126 if res[i] != expected[i] {
1127 t.Errorf("got %d wanted %d", res[i], expected[i])
1128 }
1129 }
1130 }
1131
1132 func TestPermuteScalars(t *testing.T) {
1133 x := []int32{11, 12, 13, 14}
1134 want := []int32{12, 13, 14, 11}
1135 got := make([]int32, 4)
1136 archsimd.LoadInt32x4(x).PermuteScalars(1, 2, 3, 0).Store(got)
1137 checkSlices(t, got, want)
1138 }
1139
1140 func TestPermuteScalarsGrouped(t *testing.T) {
1141 if !archsimd.X86.AVX2() {
1142 t.Skip("Test requires X86.AVX2, not available on this hardware")
1143 return
1144 }
1145 x := []int32{11, 12, 13, 14, 21, 22, 23, 24}
1146 want := []int32{12, 13, 14, 11, 22, 23, 24, 21}
1147 got := make([]int32, 8)
1148 archsimd.LoadInt32x8(x).PermuteScalarsGrouped(1, 2, 3, 0).Store(got)
1149 checkSlices(t, got, want)
1150 }
1151
1152 func TestPermuteScalarsHi(t *testing.T) {
1153 x := []int16{-1, -2, -3, -4, 11, 12, 13, 14}
1154 want := []int16{-1, -2, -3, -4, 12, 13, 14, 11}
1155 got := make([]int16, len(x))
1156 archsimd.LoadInt16x8(x).PermuteScalarsHi(1, 2, 3, 0).Store(got)
1157 checkSlices(t, got, want)
1158 }
1159
1160 func TestPermuteScalarsLo(t *testing.T) {
1161 x := []int16{11, 12, 13, 14, 4, 5, 6, 7}
1162 want := []int16{12, 13, 14, 11, 4, 5, 6, 7}
1163 got := make([]int16, len(x))
1164 archsimd.LoadInt16x8(x).PermuteScalarsLo(1, 2, 3, 0).Store(got)
1165 checkSlices(t, got, want)
1166 }
1167
1168 func TestPermuteScalarsHiGrouped(t *testing.T) {
1169 if !archsimd.X86.AVX2() {
1170 t.Skip("Test requires X86.AVX2, not available on this hardware")
1171 return
1172 }
1173 x := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114}
1174 want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111}
1175 got := make([]int16, len(x))
1176 archsimd.LoadInt16x16(x).PermuteScalarsHiGrouped(1, 2, 3, 0).Store(got)
1177 checkSlices(t, got, want)
1178 }
1179
1180 func TestPermuteScalarsLoGrouped(t *testing.T) {
1181 if !archsimd.X86.AVX2() {
1182 t.Skip("Test requires X86.AVX2, not available on this hardware")
1183 return
1184 }
1185 x := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17}
1186 want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17}
1187 got := make([]int16, len(x))
1188 archsimd.LoadInt16x16(x).PermuteScalarsLoGrouped(1, 2, 3, 0).Store(got)
1189 checkSlices(t, got, want)
1190 }
1191
1192 func TestClMul(t *testing.T) {
1193 var x = archsimd.LoadUint64x2([]uint64{1, 5})
1194 var y = archsimd.LoadUint64x2([]uint64{3, 9})
1195
1196 foo := func(v archsimd.Uint64x2, s []uint64) {
1197 r := make([]uint64, 2, 2)
1198 v.Store(r)
1199 checkSlices[uint64](t, r, s)
1200 }
1201
1202 foo(x.CarrylessMultiplyEven(y), []uint64{3, 0})
1203 foo(x.CarrylessMultiplyEvenOdd(y), []uint64{9, 0})
1204 foo(x.CarrylessMultiplyOddEven(y), []uint64{15, 0})
1205 foo(x.CarrylessMultiplyOdd(y), []uint64{45, 0})
1206 foo(y.CarrylessMultiplyEven(y), []uint64{5, 0})
1207
1208 }
1209
1210 func addPairsSlice[T number](a, b []T) []T {
1211 r := make([]T, len(a))
1212 for i := range len(a) / 2 {
1213 r[i] = a[2*i] + a[2*i+1]
1214 r[i+len(a)/2] = b[2*i] + b[2*i+1]
1215 }
1216 return r
1217 }
1218
1219 func subPairsSlice[T number](a, b []T) []T {
1220 r := make([]T, len(a))
1221 for i := range len(a) / 2 {
1222 r[i] = a[2*i] - a[2*i+1]
1223 r[i+len(a)/2] = b[2*i] - b[2*i+1]
1224 }
1225 return r
1226 }
1227
1228 func addPairsGroupedSlice[T number](a, b []T) []T {
1229 group := int(128 / unsafe.Sizeof(a[0]))
1230 r := make([]T, 0, len(a))
1231 for i := range len(a) / group {
1232 r = append(r, addPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
1233 }
1234 return r
1235 }
1236
1237 func subPairsGroupedSlice[T number](a, b []T) []T {
1238 group := int(128 / unsafe.Sizeof(a[0]))
1239 r := make([]T, 0, len(a))
1240 for i := range len(a) / group {
1241 r = append(r, subPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
1242 }
1243 return r
1244 }
1245
1246 func TestAddSubPairs(t *testing.T) {
1247 testInt16x8Binary(t, archsimd.Int16x8.ConcatAddPairs, addPairsSlice[int16])
1248 testInt16x8Binary(t, archsimd.Int16x8.ConcatSubPairs, subPairsSlice[int16])
1249 testUint16x8Binary(t, archsimd.Uint16x8.ConcatAddPairs, addPairsSlice[uint16])
1250 testUint16x8Binary(t, archsimd.Uint16x8.ConcatSubPairs, subPairsSlice[uint16])
1251 testInt32x4Binary(t, archsimd.Int32x4.ConcatAddPairs, addPairsSlice[int32])
1252 testInt32x4Binary(t, archsimd.Int32x4.ConcatSubPairs, subPairsSlice[int32])
1253 testUint32x4Binary(t, archsimd.Uint32x4.ConcatAddPairs, addPairsSlice[uint32])
1254 testUint32x4Binary(t, archsimd.Uint32x4.ConcatSubPairs, subPairsSlice[uint32])
1255 testFloat32x4Binary(t, archsimd.Float32x4.ConcatAddPairs, addPairsSlice[float32])
1256 testFloat32x4Binary(t, archsimd.Float32x4.ConcatSubPairs, subPairsSlice[float32])
1257 testFloat64x2Binary(t, archsimd.Float64x2.ConcatAddPairs, addPairsSlice[float64])
1258 testFloat64x2Binary(t, archsimd.Float64x2.ConcatSubPairs, subPairsSlice[float64])
1259
1260
1261 if archsimd.X86.AVX2() {
1262 testInt16x16Binary(t, archsimd.Int16x16.ConcatAddPairsGrouped, addPairsGroupedSlice[int16])
1263 testInt16x16Binary(t, archsimd.Int16x16.ConcatSubPairsGrouped, subPairsGroupedSlice[int16])
1264 testUint16x16Binary(t, archsimd.Uint16x16.ConcatAddPairsGrouped, addPairsGroupedSlice[uint16])
1265 testUint16x16Binary(t, archsimd.Uint16x16.ConcatSubPairsGrouped, subPairsGroupedSlice[uint16])
1266 testInt32x8Binary(t, archsimd.Int32x8.ConcatAddPairsGrouped, addPairsGroupedSlice[int32])
1267 testInt32x8Binary(t, archsimd.Int32x8.ConcatSubPairsGrouped, subPairsGroupedSlice[int32])
1268 testUint32x8Binary(t, archsimd.Uint32x8.ConcatAddPairsGrouped, addPairsGroupedSlice[uint32])
1269 testUint32x8Binary(t, archsimd.Uint32x8.ConcatSubPairsGrouped, subPairsGroupedSlice[uint32])
1270 testFloat32x8Binary(t, archsimd.Float32x8.ConcatAddPairsGrouped, addPairsGroupedSlice[float32])
1271 testFloat32x8Binary(t, archsimd.Float32x8.ConcatSubPairsGrouped, subPairsGroupedSlice[float32])
1272 testFloat64x4Binary(t, archsimd.Float64x4.ConcatAddPairsGrouped, addPairsGroupedSlice[float64])
1273 testFloat64x4Binary(t, archsimd.Float64x4.ConcatSubPairsGrouped, subPairsGroupedSlice[float64])
1274 }
1275 }
1276
1277 func convConcatSlice[T, U number](a, b []T, conv func(T) U) []U {
1278 r := make([]U, len(a)+len(b))
1279 for i, v := range a {
1280 r[i] = conv(v)
1281 }
1282 for i, v := range b {
1283 r[len(a)+i] = conv(v)
1284 }
1285 return r
1286 }
1287
1288 func convConcatGroupedSlice[T, U number](a, b []T, conv func(T) U) []U {
1289 group := int(128 / unsafe.Sizeof(a[0]))
1290 r := make([]U, 0, len(a)+len(b))
1291 for i := 0; i < len(a)/group; i++ {
1292 r = append(r, convConcatSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group], conv)...)
1293 }
1294 return r
1295 }
1296
1297 func TestSaturateConcat(t *testing.T) {
1298
1299 forSlicePair(t, int32s, 4, func(x, y []int32) bool {
1300 a, b := archsimd.LoadInt32x4(x), archsimd.LoadInt32x4(y)
1301 var out [8]int16
1302 a.SaturateToInt16Concat(b).StoreArray(&out)
1303 want := convConcatSlice(x, y, satToInt16)
1304 return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
1305 })
1306
1307 forSlicePair(t, int32s, 4, func(x, y []int32) bool {
1308 a, b := archsimd.LoadInt32x4(x), archsimd.LoadInt32x4(y)
1309 var out [8]uint16
1310 a.SaturateToUint16Concat(b).StoreArray(&out)
1311 want := convConcatSlice(x, y, satToUint16)
1312 return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
1313 })
1314
1315 if archsimd.X86.AVX2() {
1316
1317 forSlicePair(t, int32s, 8, func(x, y []int32) bool {
1318 a, b := archsimd.LoadInt32x8(x), archsimd.LoadInt32x8(y)
1319 var out [16]int16
1320 a.SaturateToInt16ConcatGrouped(b).StoreArray(&out)
1321 want := convConcatGroupedSlice(x, y, satToInt16)
1322 return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
1323 })
1324
1325 forSlicePair(t, int32s, 8, func(x, y []int32) bool {
1326 a, b := archsimd.LoadInt32x8(x), archsimd.LoadInt32x8(y)
1327 var out [16]uint16
1328 a.SaturateToUint16ConcatGrouped(b).StoreArray(&out)
1329 want := convConcatGroupedSlice(x, y, satToUint16)
1330 return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
1331 })
1332 }
1333
1334 if archsimd.X86.AVX512() {
1335
1336 forSlicePair(t, int32s, 16, func(x, y []int32) bool {
1337 a, b := archsimd.LoadInt32x16(x), archsimd.LoadInt32x16(y)
1338 var out [32]int16
1339 a.SaturateToInt16ConcatGrouped(b).StoreArray(&out)
1340 want := convConcatGroupedSlice(x, y, satToInt16)
1341 return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
1342 })
1343
1344 forSlicePair(t, int32s, 16, func(x, y []int32) bool {
1345 a, b := archsimd.LoadInt32x16(x), archsimd.LoadInt32x16(y)
1346 var out [32]uint16
1347 a.SaturateToUint16ConcatGrouped(b).StoreArray(&out)
1348 want := convConcatGroupedSlice(x, y, satToUint16)
1349 return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
1350 })
1351 }
1352 }
1353
1354 func testMaskOr8x64(t *testing.T) {
1355 if !archsimd.X86.AVX512() {
1356 return
1357 }
1358 s := make([]int8, 64)
1359 want := []int8{-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0,
1360 -1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0,
1361 -1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, 0,
1362 -1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, -1}
1363 var a archsimd.Int8x64
1364 b := archsimd.LoadInt8x64(want)
1365 m1 := a.Less(a)
1366 m2 := b.Less(a)
1367 m3 := m1.Or(m2)
1368 c := m3.ToInt8x64()
1369 c.Store(s)
1370 checkSlices(t, s, want)
1371 }
1372
1373 func testMaskOr16x32(t *testing.T) {
1374 if !archsimd.X86.AVX512() {
1375 return
1376 }
1377 s := make([]int16, 32)
1378 want := []int16{-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, 0,
1379 -1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, -1}
1380 var a archsimd.Int16x32
1381 b := archsimd.LoadInt16x32(want)
1382 m1 := a.Less(a)
1383 m2 := b.Less(a)
1384 m3 := m1.Or(m2)
1385 c := m3.ToInt16x32()
1386 c.Store(s)
1387 checkSlices(t, s, want)
1388 }
1389
1390 func testMaskOr32x16(t *testing.T) {
1391 if !archsimd.X86.AVX512() {
1392 return
1393 }
1394 s := make([]int32, 16)
1395 want := []int32{-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, -1}
1396 var a archsimd.Int32x16
1397 b := archsimd.LoadInt32x16(want)
1398 m1 := a.Less(a)
1399 m2 := b.Less(a)
1400 m3 := m1.Or(m2)
1401 c := m3.ToInt32x16()
1402 c.Store(s)
1403 checkSlices(t, s, want)
1404 }
1405
1406 func testMaskOr64x8(t *testing.T) {
1407 if !archsimd.X86.AVX512() {
1408 return
1409 }
1410 s := make([]int64, 8)
1411 want := []int64{-1, 0, 0, 0, 0, 0, -1, -1}
1412 var a archsimd.Int64x8
1413 b := archsimd.LoadInt64x8(want)
1414 m1 := a.Less(a)
1415 m2 := b.Less(a)
1416 m3 := m1.Or(m2)
1417 c := m3.ToInt64x8()
1418 c.Store(s)
1419 checkSlices(t, s, want)
1420 }
1421
1422 func testMaskOr8x32(t *testing.T) {
1423 if !archsimd.X86.AVX512() {
1424 return
1425 }
1426 s := make([]int8, 32)
1427 want := []int8{-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, 0,
1428 -1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, -1}
1429 var a archsimd.Int8x32
1430 b := archsimd.LoadInt8x32(want)
1431 m1 := a.Less(a)
1432 m2 := b.Less(a)
1433 m3 := m1.Or(m2)
1434 c := m3.ToInt8x32()
1435 c.Store(s)
1436 checkSlices(t, s, want)
1437 }
1438
1439 func testMaskOr16x16(t *testing.T) {
1440 if !archsimd.X86.AVX512() {
1441 return
1442 }
1443 s := make([]int16, 16)
1444 want := []int16{-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, -1}
1445 var a archsimd.Int16x16
1446 b := archsimd.LoadInt16x16(want)
1447 m1 := a.Less(a)
1448 m2 := b.Less(a)
1449 m3 := m1.Or(m2)
1450 c := m3.ToInt16x16()
1451 c.Store(s)
1452 checkSlices(t, s, want)
1453 }
1454
1455 func testMaskOr32x8(t *testing.T) {
1456 if !archsimd.X86.AVX512() {
1457 return
1458 }
1459 s := make([]int32, 8)
1460 want := []int32{-1, 0, 0, 0, 0, 0, -1, -1}
1461 var a archsimd.Int32x8
1462 b := archsimd.LoadInt32x8(want)
1463 m1 := a.Less(a)
1464 m2 := b.Less(a)
1465 m3 := m1.Or(m2)
1466 c := m3.ToInt32x8()
1467 c.Store(s)
1468 checkSlices(t, s, want)
1469 }
1470
1471 func testMaskOr64x4(t *testing.T) {
1472 if !archsimd.X86.AVX512() {
1473 return
1474 }
1475 s := make([]int64, 4)
1476 want := []int64{-1, 0, 0, -1}
1477 var a archsimd.Int64x4
1478 b := archsimd.LoadInt64x4(want)
1479 m1 := a.Less(a)
1480 m2 := b.Less(a)
1481 m3 := m1.Or(m2)
1482 c := m3.ToInt64x4()
1483 c.Store(s)
1484 checkSlices(t, s, want)
1485 }
1486
1487 func testMaskOr8x16(t *testing.T) {
1488 if !archsimd.X86.AVX512() {
1489 return
1490 }
1491 s := make([]int8, 16)
1492 want := []int8{-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, -1}
1493 var a archsimd.Int8x16
1494 b := archsimd.LoadInt8x16(want)
1495 m1 := a.Less(a)
1496 m2 := b.Less(a)
1497 m3 := m1.Or(m2)
1498 c := m3.ToInt8x16()
1499 c.Store(s)
1500 checkSlices(t, s, want)
1501 }
1502
1503 func testMaskOr16x8(t *testing.T) {
1504 if !archsimd.X86.AVX512() {
1505 return
1506 }
1507 s := make([]int16, 8)
1508 want := []int16{-1, 0, 0, 0, 0, 0, -1, -1}
1509 var a archsimd.Int16x8
1510 b := archsimd.LoadInt16x8(want)
1511 m1 := a.Less(a)
1512 m2 := b.Less(a)
1513 m3 := m1.Or(m2)
1514 c := m3.ToInt16x8()
1515 c.Store(s)
1516 checkSlices(t, s, want)
1517 }
1518
1519 func testMaskOr32x4(t *testing.T) {
1520 if !archsimd.X86.AVX512() {
1521 return
1522 }
1523 s := make([]int32, 4)
1524 want := []int32{-1, 0, 0, -1}
1525 var a archsimd.Int32x4
1526 b := archsimd.LoadInt32x4(want)
1527 m1 := a.Less(a)
1528 m2 := b.Less(a)
1529 m3 := m1.Or(m2)
1530 c := m3.ToInt32x4()
1531 c.Store(s)
1532 checkSlices(t, s, want)
1533 }
1534
1535 func testMaskOr64x2(t *testing.T) {
1536 if !archsimd.X86.AVX512() {
1537 return
1538 }
1539 s := make([]int64, 2)
1540 want := []int64{-1, 0}
1541 var a archsimd.Int64x2
1542 b := archsimd.LoadInt64x2(want)
1543 m1 := a.Less(a)
1544 m2 := b.Less(a)
1545 m3 := m1.Or(m2)
1546 c := m3.ToInt64x2()
1547 c.Store(s)
1548 checkSlices(t, s, want)
1549 }
1550
1551 func TestMaskOr(t *testing.T) {
1552 if !archsimd.X86.AVX512() {
1553 t.Skip("Test requires X86.AVX512, not available on this hardware")
1554 }
1555 testMaskOr8x64(t)
1556 testMaskOr16x32(t)
1557 testMaskOr32x16(t)
1558 testMaskOr64x8(t)
1559 testMaskOr8x32(t)
1560 testMaskOr16x16(t)
1561 testMaskOr32x8(t)
1562 testMaskOr64x4(t)
1563 testMaskOr8x16(t)
1564 testMaskOr16x8(t)
1565 testMaskOr32x4(t)
1566 testMaskOr64x2(t)
1567 }
1568
View as plain text