Source file src/simd/archsimd/internal/simd_test/simd_amd64_test.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build goexperiment.simd && amd64
     6  
     7  package simd_test
     8  
     9  import (
    10  	"fmt"
    11  	"os"
    12  	"simd/archsimd"
    13  	"slices"
    14  	"testing"
    15  	"unsafe"
    16  )
    17  
    18  func TestMain(m *testing.M) {
    19  	if !archsimd.X86.AVX() {
    20  		fmt.Fprintln(os.Stderr, "Skipping tests: AVX is not available")
    21  		os.Exit(0)
    22  	}
    23  	os.Exit(m.Run())
    24  }
    25  
    26  func TestPermute(t *testing.T) {
    27  	if !archsimd.X86.AVX512() {
    28  		t.Skip("Test requires X86.AVX512, not available on this hardware")
    29  		return
    30  	}
    31  	x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
    32  	indices := []uint64{7, 6, 5, 4, 3, 2, 1, 0}
    33  	want := []int64{8, 7, 6, 5, 4, 3, 2, 1}
    34  	got := make([]int64, 8)
    35  	archsimd.LoadInt64x8(x).Permute(archsimd.LoadUint64x8(indices)).Store(got)
    36  	checkSlices(t, got, want)
    37  }
    38  
    39  func TestPermuteOrZero(t *testing.T) {
    40  	x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
    41  	indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11}
    42  	want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12}
    43  	got := make([]uint8, len(x))
    44  	archsimd.LoadUint8x16(x).PermuteOrZero(archsimd.LoadInt8x16(indices)).Store(got)
    45  	checkSlices(t, got, want)
    46  }
    47  
    48  func TestConcatPermute(t *testing.T) {
    49  	if !archsimd.X86.AVX512() {
    50  		t.Skip("Test requires X86.AVX512, not available on this hardware")
    51  		return
    52  	}
    53  	x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
    54  	y := []int64{-1, -2, -3, -4, -5, -6, -7, -8}
    55  	indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
    56  	want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
    57  	got := make([]int64, 8)
    58  	archsimd.LoadInt64x8(x).ConcatPermute(archsimd.LoadInt64x8(y), archsimd.LoadUint64x8(indices)).Store(got)
    59  	checkSlices(t, got, want)
    60  }
    61  
    62  func TestCompress(t *testing.T) {
    63  	if !archsimd.X86.AVX512() {
    64  		t.Skip("Test requires X86.AVX512, not available on this hardware")
    65  		return
    66  	}
    67  	v1234 := archsimd.LoadInt32x4([]int32{1, 2, 3, 4})
    68  	v2400 := v1234.Compress(archsimd.Mask32x4FromBits(0b1010))
    69  	got := make([]int32, 4)
    70  	v2400.Store(got)
    71  	want := []int32{2, 4, 0, 0}
    72  	if !slices.Equal(got, want) {
    73  		t.Errorf("want and got differ, want=%v, got=%v", want, got)
    74  	}
    75  }
    76  
    77  func TestExpand(t *testing.T) {
    78  	if !archsimd.X86.AVX512() {
    79  		t.Skip("Test requires X86.AVX512, not available on this hardware")
    80  		return
    81  	}
    82  	v3400 := archsimd.LoadInt32x4([]int32{3, 4, 0, 0})
    83  	v2400 := v3400.Expand(archsimd.Mask32x4FromBits(0b1010))
    84  	got := make([]int32, 4)
    85  	v2400.Store(got)
    86  	want := []int32{0, 3, 0, 4}
    87  	if !slices.Equal(got, want) {
    88  		t.Errorf("want and got differ, want=%v, got=%v", want, got)
    89  	}
    90  }
    91  
    92  func TestSlicesInt8(t *testing.T) {
    93  	if !archsimd.X86.AVX2() {
    94  		t.Skip("Test requires X86.AVX2, not available on this hardware")
    95  		return
    96  	}
    97  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
    98  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
    99  	v := archsimd.LoadInt8x32(a)
   100  	b := make([]int8, 32, 32)
   101  	v.Store(b)
   102  	checkSlices(t, a, b)
   103  }
   104  
   105  func TestSlicesInt8TooShortLoad(t *testing.T) {
   106  	if !archsimd.X86.AVX2() {
   107  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   108  		return
   109  	}
   110  	defer func() {
   111  		if r := recover(); r != nil {
   112  			t.Logf("Saw EXPECTED panic %v", r)
   113  		} else {
   114  			t.Errorf("Did not see expected panic")
   115  		}
   116  	}()
   117  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   118  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31} // TOO SHORT, should panic
   119  	v := archsimd.LoadInt8x32(a)
   120  	b := make([]int8, 32, 32)
   121  	v.Store(b)
   122  	checkSlices(t, a, b)
   123  }
   124  
   125  func TestSlicesInt8TooShortStore(t *testing.T) {
   126  	if !archsimd.X86.AVX2() {
   127  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   128  		return
   129  	}
   130  	defer func() {
   131  		if r := recover(); r != nil {
   132  			t.Logf("Saw EXPECTED panic %v", r)
   133  		} else {
   134  			t.Errorf("Did not see expected panic")
   135  		}
   136  	}()
   137  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   138  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
   139  	v := archsimd.LoadInt8x32(a)
   140  	b := make([]int8, 31) // TOO SHORT, should panic
   141  	v.Store(b)
   142  	checkSlices(t, a, b)
   143  }
   144  
   145  func TestSlicesFloat64(t *testing.T) {
   146  	a := []float64{1, 2, 3, 4, 5, 6, 7, 8} // too long, should be fine
   147  	v := archsimd.LoadFloat64x4(a)
   148  	b := make([]float64, 4, 4)
   149  	v.Store(b)
   150  	for i := range b {
   151  		if a[i] != b[i] {
   152  			t.Errorf("a and b differ at index %d, a=%f, b=%f", i, a[i], b[i])
   153  		}
   154  	}
   155  }
   156  
   157  // TODO: try to reduce this test to be smaller.
   158  func TestMergeLocals(t *testing.T) {
   159  	if !archsimd.X86.AVX2() {
   160  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   161  		return
   162  	}
   163  	testMergeLocalswrapper(t, archsimd.Int64x4.Add)
   164  }
   165  
// forceSpill is an empty function that the compiler is told not to
// inline, so calls to it remain real calls. Presumably (judging by the
// name) it is called to force live SIMD values to be spilled across the
// call.
//
//go:noinline
func forceSpill() {}
   168  
   169  func testMergeLocalswrapper(t *testing.T, op func(archsimd.Int64x4, archsimd.Int64x4) archsimd.Int64x4) {
   170  	t.Helper()
   171  	s0 := []int64{0, 1, 2, 3}
   172  	s1 := []int64{-1, 0, -1, 0}
   173  	want := []int64{-1, 1, 1, 3}
   174  	v := archsimd.LoadInt64x4(s0)
   175  	m := archsimd.LoadInt64x4(s1)
   176  	forceSpill()
   177  	got := make([]int64, 4)
   178  	gotv := op(v, m)
   179  	gotv.Store(got)
   180  	for i := range len(want) {
   181  		if !(got[i] == want[i]) {
   182  			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i])
   183  		}
   184  	}
   185  }
   186  
   187  func TestBitMaskFromBits(t *testing.T) {
   188  	if !archsimd.X86.AVX512() {
   189  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   190  		return
   191  	}
   192  	results := [2]int64{}
   193  	want := [2]int64{0, 6}
   194  	m := archsimd.Mask64x2FromBits(0b10)
   195  	archsimd.LoadInt64x2([]int64{1, 2}).Add(archsimd.LoadInt64x2([]int64{3, 4})).Masked(m).StoreArray(&results)
   196  	for i := range 2 {
   197  		if results[i] != want[i] {
   198  			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
   199  		}
   200  	}
   201  }
   202  
   203  var maskForTestBitMaskFromBitsLoad = uint8(0b10)
   204  
   205  func TestBitMaskFromBitsLoad(t *testing.T) {
   206  	if !archsimd.X86.AVX512() {
   207  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   208  		return
   209  	}
   210  	results := [2]int64{}
   211  	want := [2]int64{0, 6}
   212  	m := archsimd.Mask64x2FromBits(maskForTestBitMaskFromBitsLoad)
   213  	archsimd.LoadInt64x2([]int64{1, 2}).Add(archsimd.LoadInt64x2([]int64{3, 4})).Masked(m).StoreArray(&results)
   214  	for i := range 2 {
   215  		if results[i] != want[i] {
   216  			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
   217  		}
   218  	}
   219  }
   220  
   221  func TestBitMaskToBits(t *testing.T) {
   222  	int8s := []int8{
   223  		0, 1, 1, 0, 0, 1, 0, 1,
   224  		1, 0, 1, 1, 0, 0, 1, 0,
   225  		1, 0, 0, 1, 1, 0, 1, 0,
   226  		0, 1, 1, 0, 0, 1, 0, 1,
   227  		1, 0, 0, 1, 0, 1, 1, 0,
   228  		0, 1, 0, 1, 1, 0, 0, 1,
   229  		1, 0, 1, 0, 0, 1, 1, 0,
   230  		0, 1, 1, 0, 1, 0, 0, 1,
   231  	}
   232  	int16s := make([]int16, 32)
   233  	for i := range int16s {
   234  		int16s[i] = int16(int8s[i])
   235  	}
   236  	int32s := make([]int32, 16)
   237  	for i := range int32s {
   238  		int32s[i] = int32(int8s[i])
   239  	}
   240  	int64s := make([]int64, 8)
   241  	for i := range int64s {
   242  		int64s[i] = int64(int8s[i])
   243  	}
   244  	want64 := uint64(0)
   245  	for i := range int8s {
   246  		want64 |= uint64(int8s[i]) << i
   247  	}
   248  	want32 := uint32(want64)
   249  	want16 := uint16(want64)
   250  	want8 := uint8(want64)
   251  	want4 := want8 & 0b1111
   252  	want2 := want4 & 0b11
   253  
   254  	if v := archsimd.LoadInt8x16(int8s[:16]).ToMask().ToBits(); v != want16 {
   255  		t.Errorf("want %b, got %b", want16, v)
   256  	}
   257  	if v := archsimd.LoadInt32x4(int32s[:4]).ToMask().ToBits(); v != want4 {
   258  		t.Errorf("want %b, got %b", want4, v)
   259  	}
   260  	if v := archsimd.LoadInt32x8(int32s[:8]).ToMask().ToBits(); v != want8 {
   261  		t.Errorf("want %b, got %b", want8, v)
   262  	}
   263  	if v := archsimd.LoadInt64x2(int64s[:2]).ToMask().ToBits(); v != want2 {
   264  		t.Errorf("want %b, got %b", want2, v)
   265  	}
   266  	if v := archsimd.LoadInt64x4(int64s[:4]).ToMask().ToBits(); v != want4 {
   267  		t.Errorf("want %b, got %b", want4, v)
   268  	}
   269  
   270  	if archsimd.X86.AVX2() {
   271  		if v := archsimd.LoadInt8x32(int8s[:32]).ToMask().ToBits(); v != want32 {
   272  			t.Errorf("want %b, got %b", want32, v)
   273  		}
   274  	}
   275  
   276  	if archsimd.X86.AVX512() {
   277  		if v := archsimd.LoadInt8x64(int8s).ToMask().ToBits(); v != want64 {
   278  			t.Errorf("want %b, got %b", want64, v)
   279  		}
   280  		if v := archsimd.LoadInt16x8(int16s[:8]).ToMask().ToBits(); v != want8 {
   281  			t.Errorf("want %b, got %b", want8, v)
   282  		}
   283  		if v := archsimd.LoadInt16x16(int16s[:16]).ToMask().ToBits(); v != want16 {
   284  			t.Errorf("want %b, got %b", want16, v)
   285  		}
   286  		if v := archsimd.LoadInt16x32(int16s).ToMask().ToBits(); v != want32 {
   287  			t.Errorf("want %b, got %b", want32, v)
   288  		}
   289  		if v := archsimd.LoadInt32x16(int32s).ToMask().ToBits(); v != want16 {
   290  			t.Errorf("want %b, got %b", want16, v)
   291  		}
   292  		if v := archsimd.LoadInt64x8(int64s).ToMask().ToBits(); v != want8 {
   293  			t.Errorf("want %b, got %b", want8, v)
   294  		}
   295  	}
   296  }
   297  
   298  var maskForTestBitMaskFromBitsStore uint8
   299  
   300  func TestBitMaskToBitsStore(t *testing.T) {
   301  	if !archsimd.X86.AVX512() {
   302  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   303  		return
   304  	}
   305  	maskForTestBitMaskFromBitsStore = archsimd.LoadInt16x8([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits()
   306  	if maskForTestBitMaskFromBitsStore != 0b101 {
   307  		t.Errorf("Want 0b101, got %b", maskForTestBitMaskFromBitsStore)
   308  	}
   309  }
   310  
   311  func TestMergeFloat(t *testing.T) {
   312  	if !archsimd.X86.AVX2() {
   313  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   314  		return
   315  	}
   316  	k := make([]int64, 4, 4)
   317  	s := make([]float64, 4, 4)
   318  
   319  	a := archsimd.LoadFloat64x4([]float64{1, 2, 3, 4})
   320  	b := archsimd.LoadFloat64x4([]float64{4, 2, 3, 1})
   321  	g := a.Greater(b)
   322  	g.ToInt64x4().Store(k)
   323  	c := a.Merge(b, g)
   324  
   325  	c.Store(s)
   326  
   327  	checkSlices[int64](t, k, []int64{0, 0, 0, -1})
   328  	checkSlices[float64](t, s, []float64{4, 2, 3, 4})
   329  }
   330  
   331  func TestIfElseFloat(t *testing.T) {
   332  	if !archsimd.X86.AVX2() {
   333  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   334  		return
   335  	}
   336  	k := make([]int64, 4, 4)
   337  	s := make([]float64, 4, 4)
   338  
   339  	a := archsimd.LoadFloat64x4([]float64{1, 2, 3, 4})
   340  	b := archsimd.LoadFloat64x4([]float64{4, 2, 3, 1})
   341  	g := a.Greater(b)
   342  	g.ToInt64x4().Store(k)
   343  	c := a.IfElse(g, b)
   344  
   345  	c.Store(s)
   346  
   347  	checkSlices[int64](t, k, []int64{0, 0, 0, -1})
   348  	checkSlices[float64](t, s, []float64{4, 2, 3, 4})
   349  }
   350  
   351  func TestMergeFloat512(t *testing.T) {
   352  	if !archsimd.X86.AVX512() {
   353  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   354  		return
   355  	}
   356  
   357  	k := make([]int64, 8, 8)
   358  	s := make([]float64, 8, 8)
   359  
   360  	a := archsimd.LoadFloat64x8([]float64{1, 2, 3, 4, 5, 6, 7, 8})
   361  	b := archsimd.LoadFloat64x8([]float64{8, 7, 6, 5, 4, 2, 3, 1})
   362  	g := a.Greater(b)
   363  	g.ToInt64x8().Store(k)
   364  	c := a.Merge(b, g)
   365  	d := a.Masked(g)
   366  
   367  	checkSlices[int64](t, k, []int64{0, 0, 0, 0, -1, -1, -1, -1})
   368  
   369  	c.Store(s)
   370  	checkSlices[float64](t, s, []float64{8, 7, 6, 5, 5, 6, 7, 8})
   371  
   372  	d.Store(s)
   373  	checkSlices[float64](t, s, []float64{0, 0, 0, 0, 5, 6, 7, 8})
   374  }
   375  
   376  func TestIfElseFloat512(t *testing.T) {
   377  	if !archsimd.X86.AVX512() {
   378  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   379  		return
   380  	}
   381  
   382  	k := make([]int64, 8, 8)
   383  	s := make([]float64, 8, 8)
   384  
   385  	a := archsimd.LoadFloat64x8([]float64{1, 2, 3, 4, 5, 6, 7, 8})
   386  	b := archsimd.LoadFloat64x8([]float64{8, 7, 6, 5, 4, 2, 3, 1})
   387  	g := a.Greater(b)
   388  	g.ToInt64x8().Store(k)
   389  	c := a.IfElse(g, b)
   390  	d := a.Masked(g)
   391  
   392  	checkSlices[int64](t, k, []int64{0, 0, 0, 0, -1, -1, -1, -1})
   393  
   394  	c.Store(s)
   395  	checkSlices[float64](t, s, []float64{8, 7, 6, 5, 5, 6, 7, 8})
   396  
   397  	d.Store(s)
   398  	checkSlices[float64](t, s, []float64{0, 0, 0, 0, 5, 6, 7, 8})
   399  }
   400  
   401  var ro uint64 = 2
   402  var roBig uint64 = 1024 + 2
   403  
   404  func TestRotateAllVariable(t *testing.T) {
   405  	got := make([]int32, 4)
   406  	archsimd.LoadInt32x4([]int32{0b11, 0b11, 0b11, 0b11}).RotateAllLeft(ro).Store(got)
   407  	for _, v := range got {
   408  		if v != 0b1100 {
   409  			t.Errorf("Want 0b1100, got %b", v)
   410  		}
   411  	}
   412  	archsimd.LoadInt32x4([]int32{0b11, 0b11, 0b11, 0b11}).RotateAllLeft(roBig).Store(got)
   413  	for _, v := range got {
   414  		if v != 0b1100 {
   415  			t.Errorf("Want 0b1100, got %b", v)
   416  		}
   417  	}
   418  }
   419  
   420  func TestRotateAllConst(t *testing.T) {
   421  	got := make([]int32, 4)
   422  	archsimd.LoadInt32x4([]int32{0b11, 0b11, 0b11, 0b11}).RotateAllLeft(2).Store(got)
   423  	for _, v := range got {
   424  		if v != 0b1100 {
   425  			t.Errorf("Want 0b1100, got %b", v)
   426  		}
   427  	}
   428  }
   429  
   430  func TestBroadcastFloat32x8(t *testing.T) {
   431  	s := make([]float32, 8, 8)
   432  	archsimd.BroadcastFloat32x8(123456789).Store(s)
   433  	checkSlices(t, s, []float32{123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789})
   434  }
   435  
   436  func TestBroadcastInt8x32(t *testing.T) {
   437  	if !archsimd.X86.AVX2() {
   438  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   439  		return
   440  	}
   441  	s := make([]int8, 32, 32)
   442  	archsimd.BroadcastInt8x32(-123).Store(s)
   443  	checkSlices(t, s, []int8{-123, -123, -123, -123, -123, -123, -123, -123,
   444  		-123, -123, -123, -123, -123, -123, -123, -123,
   445  		-123, -123, -123, -123, -123, -123, -123, -123,
   446  		-123, -123, -123, -123, -123, -123, -123, -123,
   447  	})
   448  }
   449  
   450  func TestMaskOpt512(t *testing.T) {
   451  	if !archsimd.X86.AVX512() {
   452  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   453  		return
   454  	}
   455  
   456  	k := make([]int64, 8, 8)
   457  	s := make([]float64, 8, 8)
   458  
   459  	a := archsimd.LoadFloat64x8([]float64{2, 0, 2, 0, 2, 0, 2, 0})
   460  	b := archsimd.LoadFloat64x8([]float64{1, 1, 1, 1, 1, 1, 1, 1})
   461  	c := archsimd.LoadFloat64x8([]float64{1, 2, 3, 4, 5, 6, 7, 8})
   462  	d := archsimd.LoadFloat64x8([]float64{2, 4, 6, 8, 10, 12, 14, 16})
   463  	g := a.Greater(b)
   464  	e := c.Add(d).Masked(g)
   465  	e.Store(s)
   466  	g.ToInt64x8().Store(k)
   467  	checkSlices[int64](t, k, []int64{-1, 0, -1, 0, -1, 0, -1, 0})
   468  	checkSlices[float64](t, s, []float64{3, 0, 9, 0, 15, 0, 21, 0})
   469  }
   470  
   471  // flattenedTranspose tranposes x and y, regarded as a pair of 2x2
   472  // matrices, but then flattens the rows in order, i.e
   473  // x: ABCD ==> a: A1B2
   474  // y: 1234     b: C3D4
   475  func flattenedTranspose(x, y archsimd.Int32x4) (a, b archsimd.Int32x4) {
   476  	return x.InterleaveLo(y), x.InterleaveHi(y)
   477  }
   478  
   479  func TestFlattenedTranspose(t *testing.T) {
   480  	r := make([]int32, 4, 4)
   481  	s := make([]int32, 4, 4)
   482  
   483  	x := archsimd.LoadInt32x4([]int32{0xA, 0xB, 0xC, 0xD})
   484  	y := archsimd.LoadInt32x4([]int32{1, 2, 3, 4})
   485  	a, b := flattenedTranspose(x, y)
   486  
   487  	a.Store(r)
   488  	b.Store(s)
   489  
   490  	checkSlices[int32](t, r, []int32{0xA, 1, 0xB, 2})
   491  	checkSlices[int32](t, s, []int32{0xC, 3, 0xD, 4})
   492  
   493  }
   494  
   495  func TestClearAVXUpperBits(t *testing.T) {
   496  	// Test that ClearAVXUpperBits is safe even if there are SIMD values
   497  	// alive (although usually one should not do this).
   498  	if !archsimd.X86.AVX2() {
   499  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   500  		return
   501  	}
   502  
   503  	r := make([]int64, 4)
   504  	s := make([]int64, 4)
   505  
   506  	x := archsimd.LoadInt64x4([]int64{10, 20, 30, 40})
   507  	y := archsimd.LoadInt64x4([]int64{1, 2, 3, 4})
   508  
   509  	x.Add(y).Store(r)
   510  	archsimd.ClearAVXUpperBits()
   511  	x.Sub(y).Store(s)
   512  
   513  	checkSlices[int64](t, r, []int64{11, 22, 33, 44})
   514  	checkSlices[int64](t, s, []int64{9, 18, 27, 36})
   515  }
   516  
   517  func TestLeadingZeros(t *testing.T) {
   518  	if !archsimd.X86.AVX512() {
   519  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   520  		return
   521  	}
   522  
   523  	src := []uint64{0b1111, 0}
   524  	want := []uint64{60, 64}
   525  	got := make([]uint64, 2)
   526  	archsimd.LoadUint64x2(src).LeadingZeros().Store(got)
   527  	for i := range 2 {
   528  		if want[i] != got[i] {
   529  			t.Errorf("Result incorrect at %d: want %d, got %d", i, want[i], got[i])
   530  		}
   531  	}
   532  }
   533  
   534  func TestIsZero(t *testing.T) {
   535  	v1 := archsimd.LoadUint64x2([]uint64{0, 1})
   536  	v2 := archsimd.LoadUint64x2([]uint64{0, 0})
   537  	if v1.IsZero() {
   538  		t.Errorf("Result incorrect, want false, got true")
   539  	}
   540  	if !v2.IsZero() {
   541  		t.Errorf("Result incorrect, want true, got false")
   542  	}
   543  	if !v1.And(v2).IsZero() {
   544  		t.Errorf("Result incorrect, want true, got false")
   545  	}
   546  	if v1.AndNot(v2).IsZero() {
   547  		t.Errorf("Result incorrect, want false, got true")
   548  	}
   549  	if !v2.And(v1).IsZero() {
   550  		t.Errorf("Result incorrect, want true, got false")
   551  	}
   552  	if !v2.AndNot(v1).IsZero() {
   553  		t.Errorf("Result incorrect, want true, got false")
   554  	}
   555  }
   556  
   557  func TestSelect4FromPairConst(t *testing.T) {
   558  	x := archsimd.LoadInt32x4([]int32{0, 1, 2, 3})
   559  	y := archsimd.LoadInt32x4([]int32{4, 5, 6, 7})
   560  
   561  	llll := x.ConcatPermuteScalars(0, 1, 2, 3, y)
   562  	hhhh := x.ConcatPermuteScalars(4, 5, 6, 7, y)
   563  	llhh := x.ConcatPermuteScalars(0, 1, 6, 7, y)
   564  	hhll := x.ConcatPermuteScalars(6, 7, 0, 1, y)
   565  
   566  	lllh := x.ConcatPermuteScalars(0, 1, 2, 7, y)
   567  	llhl := x.ConcatPermuteScalars(0, 1, 7, 2, y)
   568  	lhll := x.ConcatPermuteScalars(0, 7, 1, 2, y)
   569  	hlll := x.ConcatPermuteScalars(7, 0, 1, 2, y)
   570  
   571  	hhhl := x.ConcatPermuteScalars(4, 5, 6, 0, y)
   572  	hhlh := x.ConcatPermuteScalars(4, 5, 0, 6, y)
   573  	hlhh := x.ConcatPermuteScalars(4, 0, 5, 6, y)
   574  	lhhh := x.ConcatPermuteScalars(0, 4, 5, 6, y)
   575  
   576  	lhlh := x.ConcatPermuteScalars(0, 4, 1, 5, y)
   577  	hlhl := x.ConcatPermuteScalars(4, 0, 5, 1, y)
   578  	lhhl := x.ConcatPermuteScalars(0, 4, 5, 1, y)
   579  	hllh := x.ConcatPermuteScalars(4, 0, 1, 5, y)
   580  
   581  	r := make([]int32, 4, 4)
   582  
   583  	foo := func(v archsimd.Int32x4, a, b, c, d int32) {
   584  		v.Store(r)
   585  		checkSlices[int32](t, r, []int32{a, b, c, d})
   586  	}
   587  
   588  	foo(llll, 0, 1, 2, 3)
   589  	foo(hhhh, 4, 5, 6, 7)
   590  	foo(llhh, 0, 1, 6, 7)
   591  	foo(hhll, 6, 7, 0, 1)
   592  
   593  	foo(lllh, 0, 1, 2, 7)
   594  	foo(llhl, 0, 1, 7, 2)
   595  	foo(lhll, 0, 7, 1, 2)
   596  	foo(hlll, 7, 0, 1, 2)
   597  
   598  	foo(hhhl, 4, 5, 6, 0)
   599  	foo(hhlh, 4, 5, 0, 6)
   600  	foo(hlhh, 4, 0, 5, 6)
   601  	foo(lhhh, 0, 4, 5, 6)
   602  
   603  	foo(lhlh, 0, 4, 1, 5)
   604  	foo(hlhl, 4, 0, 5, 1)
   605  	foo(lhhl, 0, 4, 5, 1)
   606  	foo(hllh, 4, 0, 1, 5)
   607  }
   608  
   609  //go:noinline
   610  func selectFromPairInt32x4(x archsimd.Int32x4, a, b, c, d uint8, y archsimd.Int32x4) archsimd.Int32x4 {
   611  	return x.ConcatPermuteScalars(a, b, c, d, y)
   612  }
   613  
   614  func TestSelect4FromPairVar(t *testing.T) {
   615  	x := archsimd.LoadInt32x4([]int32{0, 1, 2, 3})
   616  	y := archsimd.LoadInt32x4([]int32{4, 5, 6, 7})
   617  
   618  	llll := selectFromPairInt32x4(x, 0, 1, 2, 3, y)
   619  	hhhh := selectFromPairInt32x4(x, 4, 5, 6, 7, y)
   620  	llhh := selectFromPairInt32x4(x, 0, 1, 6, 7, y)
   621  	hhll := selectFromPairInt32x4(x, 6, 7, 0, 1, y)
   622  
   623  	lllh := selectFromPairInt32x4(x, 0, 1, 2, 7, y)
   624  	llhl := selectFromPairInt32x4(x, 0, 1, 7, 2, y)
   625  	lhll := selectFromPairInt32x4(x, 0, 7, 1, 2, y)
   626  	hlll := selectFromPairInt32x4(x, 7, 0, 1, 2, y)
   627  
   628  	hhhl := selectFromPairInt32x4(x, 4, 5, 6, 0, y)
   629  	hhlh := selectFromPairInt32x4(x, 4, 5, 0, 6, y)
   630  	hlhh := selectFromPairInt32x4(x, 4, 0, 5, 6, y)
   631  	lhhh := selectFromPairInt32x4(x, 0, 4, 5, 6, y)
   632  
   633  	lhlh := selectFromPairInt32x4(x, 0, 4, 1, 5, y)
   634  	hlhl := selectFromPairInt32x4(x, 4, 0, 5, 1, y)
   635  	lhhl := selectFromPairInt32x4(x, 0, 4, 5, 1, y)
   636  	hllh := selectFromPairInt32x4(x, 4, 0, 1, 5, y)
   637  
   638  	r := make([]int32, 4, 4)
   639  
   640  	foo := func(v archsimd.Int32x4, a, b, c, d int32) {
   641  		v.Store(r)
   642  		checkSlices[int32](t, r, []int32{a, b, c, d})
   643  	}
   644  
   645  	foo(llll, 0, 1, 2, 3)
   646  	foo(hhhh, 4, 5, 6, 7)
   647  	foo(llhh, 0, 1, 6, 7)
   648  	foo(hhll, 6, 7, 0, 1)
   649  
   650  	foo(lllh, 0, 1, 2, 7)
   651  	foo(llhl, 0, 1, 7, 2)
   652  	foo(lhll, 0, 7, 1, 2)
   653  	foo(hlll, 7, 0, 1, 2)
   654  
   655  	foo(hhhl, 4, 5, 6, 0)
   656  	foo(hhlh, 4, 5, 0, 6)
   657  	foo(hlhh, 4, 0, 5, 6)
   658  	foo(lhhh, 0, 4, 5, 6)
   659  
   660  	foo(lhlh, 0, 4, 1, 5)
   661  	foo(hlhl, 4, 0, 5, 1)
   662  	foo(lhhl, 0, 4, 5, 1)
   663  	foo(hllh, 4, 0, 1, 5)
   664  }
   665  
   666  func TestSelect4FromPairConstGrouped(t *testing.T) {
   667  	x := archsimd.LoadFloat32x8([]float32{0, 1, 2, 3, 10, 11, 12, 13})
   668  	y := archsimd.LoadFloat32x8([]float32{4, 5, 6, 7, 14, 15, 16, 17})
   669  
   670  	llll := x.ConcatPermuteScalarsGrouped(0, 1, 2, 3, y)
   671  	hhhh := x.ConcatPermuteScalarsGrouped(4, 5, 6, 7, y)
   672  	llhh := x.ConcatPermuteScalarsGrouped(0, 1, 6, 7, y)
   673  	hhll := x.ConcatPermuteScalarsGrouped(6, 7, 0, 1, y)
   674  
   675  	lllh := x.ConcatPermuteScalarsGrouped(0, 1, 2, 7, y)
   676  	llhl := x.ConcatPermuteScalarsGrouped(0, 1, 7, 2, y)
   677  	lhll := x.ConcatPermuteScalarsGrouped(0, 7, 1, 2, y)
   678  	hlll := x.ConcatPermuteScalarsGrouped(7, 0, 1, 2, y)
   679  
   680  	hhhl := x.ConcatPermuteScalarsGrouped(4, 5, 6, 0, y)
   681  	hhlh := x.ConcatPermuteScalarsGrouped(4, 5, 0, 6, y)
   682  	hlhh := x.ConcatPermuteScalarsGrouped(4, 0, 5, 6, y)
   683  	lhhh := x.ConcatPermuteScalarsGrouped(0, 4, 5, 6, y)
   684  
   685  	lhlh := x.ConcatPermuteScalarsGrouped(0, 4, 1, 5, y)
   686  	hlhl := x.ConcatPermuteScalarsGrouped(4, 0, 5, 1, y)
   687  	lhhl := x.ConcatPermuteScalarsGrouped(0, 4, 5, 1, y)
   688  	hllh := x.ConcatPermuteScalarsGrouped(4, 0, 1, 5, y)
   689  
   690  	r := make([]float32, 8, 8)
   691  
   692  	foo := func(v archsimd.Float32x8, a, b, c, d float32) {
   693  		v.Store(r)
   694  		checkSlices[float32](t, r, []float32{a, b, c, d, 10 + a, 10 + b, 10 + c, 10 + d})
   695  	}
   696  
   697  	foo(llll, 0, 1, 2, 3)
   698  	foo(hhhh, 4, 5, 6, 7)
   699  	foo(llhh, 0, 1, 6, 7)
   700  	foo(hhll, 6, 7, 0, 1)
   701  
   702  	foo(lllh, 0, 1, 2, 7)
   703  	foo(llhl, 0, 1, 7, 2)
   704  	foo(lhll, 0, 7, 1, 2)
   705  	foo(hlll, 7, 0, 1, 2)
   706  
   707  	foo(hhhl, 4, 5, 6, 0)
   708  	foo(hhlh, 4, 5, 0, 6)
   709  	foo(hlhh, 4, 0, 5, 6)
   710  	foo(lhhh, 0, 4, 5, 6)
   711  
   712  	foo(lhlh, 0, 4, 1, 5)
   713  	foo(hlhl, 4, 0, 5, 1)
   714  	foo(lhhl, 0, 4, 5, 1)
   715  	foo(hllh, 4, 0, 1, 5)
   716  }
   717  
   718  func TestConcatPermuteScalarsConstGroupedUint32x16(t *testing.T) {
   719  	if !archsimd.X86.AVX512() {
   720  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   721  		return
   722  	}
   723  	x := archsimd.LoadUint32x16([]uint32{0, 1, 2, 3, 10, 11, 12, 13, 20, 21, 22, 23, 30, 31, 32, 33})
   724  	y := archsimd.LoadUint32x16([]uint32{4, 5, 6, 7, 14, 15, 16, 17, 24, 25, 26, 27, 34, 35, 36, 37})
   725  
   726  	llll := x.ConcatPermuteScalarsGrouped(0, 1, 2, 3, y)
   727  	hhhh := x.ConcatPermuteScalarsGrouped(4, 5, 6, 7, y)
   728  	llhh := x.ConcatPermuteScalarsGrouped(0, 1, 6, 7, y)
   729  	hhll := x.ConcatPermuteScalarsGrouped(6, 7, 0, 1, y)
   730  
   731  	lllh := x.ConcatPermuteScalarsGrouped(0, 1, 2, 7, y)
   732  	llhl := x.ConcatPermuteScalarsGrouped(0, 1, 7, 2, y)
   733  	lhll := x.ConcatPermuteScalarsGrouped(0, 7, 1, 2, y)
   734  	hlll := x.ConcatPermuteScalarsGrouped(7, 0, 1, 2, y)
   735  
   736  	hhhl := x.ConcatPermuteScalarsGrouped(4, 5, 6, 0, y)
   737  	hhlh := x.ConcatPermuteScalarsGrouped(4, 5, 0, 6, y)
   738  	hlhh := x.ConcatPermuteScalarsGrouped(4, 0, 5, 6, y)
   739  	lhhh := x.ConcatPermuteScalarsGrouped(0, 4, 5, 6, y)
   740  
   741  	lhlh := x.ConcatPermuteScalarsGrouped(0, 4, 1, 5, y)
   742  	hlhl := x.ConcatPermuteScalarsGrouped(4, 0, 5, 1, y)
   743  	lhhl := x.ConcatPermuteScalarsGrouped(0, 4, 5, 1, y)
   744  	hllh := x.ConcatPermuteScalarsGrouped(4, 0, 1, 5, y)
   745  
   746  	r := make([]uint32, 16, 16)
   747  
   748  	foo := func(v archsimd.Uint32x16, a, b, c, d uint32) {
   749  		v.Store(r)
   750  		checkSlices[uint32](t, r, []uint32{a, b, c, d,
   751  			10 + a, 10 + b, 10 + c, 10 + d,
   752  			20 + a, 20 + b, 20 + c, 20 + d,
   753  			30 + a, 30 + b, 30 + c, 30 + d,
   754  		})
   755  	}
   756  
   757  	foo(llll, 0, 1, 2, 3)
   758  	foo(hhhh, 4, 5, 6, 7)
   759  	foo(llhh, 0, 1, 6, 7)
   760  	foo(hhll, 6, 7, 0, 1)
   761  
   762  	foo(lllh, 0, 1, 2, 7)
   763  	foo(llhl, 0, 1, 7, 2)
   764  	foo(lhll, 0, 7, 1, 2)
   765  	foo(hlll, 7, 0, 1, 2)
   766  
   767  	foo(hhhl, 4, 5, 6, 0)
   768  	foo(hhlh, 4, 5, 0, 6)
   769  	foo(hlhh, 4, 0, 5, 6)
   770  	foo(lhhh, 0, 4, 5, 6)
   771  
   772  	foo(lhlh, 0, 4, 1, 5)
   773  	foo(hlhl, 4, 0, 5, 1)
   774  	foo(lhhl, 0, 4, 5, 1)
   775  	foo(hllh, 4, 0, 1, 5)
   776  }
   777  
   778  func TestConcatPermute128Scalars(t *testing.T) {
   779  	x := archsimd.LoadUint64x4([]uint64{0, 1, 2, 3})
   780  	y := archsimd.LoadUint64x4([]uint64{4, 5, 6, 7})
   781  
   782  	aa := x.ConcatPermute128Scalars(0, 0, y)
   783  	ab := x.ConcatPermute128Scalars(0, 1, y)
   784  	bc := x.ConcatPermute128Scalars(1, 2, y)
   785  	cd := x.ConcatPermute128Scalars(2, 3, y)
   786  	da := x.ConcatPermute128Scalars(3, 0, y)
   787  	dc := x.ConcatPermute128Scalars(3, 2, y)
   788  
   789  	r := make([]uint64, 4, 4)
   790  
   791  	foo := func(v archsimd.Uint64x4, a, b uint64) {
   792  		a, b = 2*a, 2*b
   793  		v.Store(r)
   794  		checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
   795  	}
   796  
   797  	foo(aa, 0, 0)
   798  	foo(ab, 0, 1)
   799  	foo(bc, 1, 2)
   800  	foo(cd, 2, 3)
   801  	foo(da, 3, 0)
   802  	foo(dc, 3, 2)
   803  }
   804  
   805  func TestConcatPermute128ScalarsError(t *testing.T) {
   806  	x := archsimd.LoadUint64x4([]uint64{0, 1, 2, 3})
   807  	y := archsimd.LoadUint64x4([]uint64{4, 5, 6, 7})
   808  
   809  	defer func() {
   810  		if r := recover(); r != nil {
   811  			t.Logf("Saw expected panic %v", r)
   812  		}
   813  	}()
   814  	_ = x.ConcatPermute128Scalars(0, 4, y)
   815  
   816  	t.Errorf("Should have panicked")
   817  }
   818  
   819  //go:noinline
   820  func select128FromPair(x archsimd.Uint64x4, lo, hi uint8, y archsimd.Uint64x4) archsimd.Uint64x4 {
   821  	return x.ConcatPermute128Scalars(lo, hi, y)
   822  }
   823  
   824  func TestConcatPermute128ScalarsVar(t *testing.T) {
   825  	x := archsimd.LoadUint64x4([]uint64{0, 1, 2, 3})
   826  	y := archsimd.LoadUint64x4([]uint64{4, 5, 6, 7})
   827  
   828  	aa := select128FromPair(x, 0, 0, y)
   829  	ab := select128FromPair(x, 0, 1, y)
   830  	bc := select128FromPair(x, 1, 2, y)
   831  	cd := select128FromPair(x, 2, 3, y)
   832  	da := select128FromPair(x, 3, 0, y)
   833  	dc := select128FromPair(x, 3, 2, y)
   834  
   835  	r := make([]uint64, 4, 4)
   836  
   837  	foo := func(v archsimd.Uint64x4, a, b uint64) {
   838  		a, b = 2*a, 2*b
   839  		v.Store(r)
   840  		checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
   841  	}
   842  
   843  	foo(aa, 0, 0)
   844  	foo(ab, 0, 1)
   845  	foo(bc, 1, 2)
   846  	foo(cd, 2, 3)
   847  	foo(da, 3, 0)
   848  	foo(dc, 3, 2)
   849  }
   850  
   851  func TestSelect2FromPairConst(t *testing.T) {
   852  	x := archsimd.LoadUint64x2([]uint64{0, 1})
   853  	y := archsimd.LoadUint64x2([]uint64{2, 3})
   854  
   855  	ll := x.ConcatPermuteScalars(0, 1, y)
   856  	hh := x.ConcatPermuteScalars(3, 2, y)
   857  	lh := x.ConcatPermuteScalars(0, 3, y)
   858  	hl := x.ConcatPermuteScalars(2, 1, y)
   859  
   860  	r := make([]uint64, 2, 2)
   861  
   862  	foo := func(v archsimd.Uint64x2, a, b uint64) {
   863  		v.Store(r)
   864  		checkSlices[uint64](t, r, []uint64{a, b})
   865  	}
   866  
   867  	foo(ll, 0, 1)
   868  	foo(hh, 3, 2)
   869  	foo(lh, 0, 3)
   870  	foo(hl, 2, 1)
   871  }
   872  
   873  func TestSelect2FromPairConstGroupedUint(t *testing.T) {
   874  	x := archsimd.LoadUint64x4([]uint64{0, 1, 10, 11})
   875  	y := archsimd.LoadUint64x4([]uint64{2, 3, 12, 13})
   876  
   877  	ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
   878  	hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
   879  	lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
   880  	hl := x.ConcatPermuteScalarsGrouped(2, 1, y)
   881  
   882  	r := make([]uint64, 4, 4)
   883  
   884  	foo := func(v archsimd.Uint64x4, a, b uint64) {
   885  		v.Store(r)
   886  		checkSlices[uint64](t, r, []uint64{a, b, a + 10, b + 10})
   887  	}
   888  
   889  	foo(ll, 0, 1)
   890  	foo(hh, 3, 2)
   891  	foo(lh, 0, 3)
   892  	foo(hl, 2, 1)
   893  }
   894  
   895  func TestSelect2FromPairConstGroupedFloat(t *testing.T) {
   896  	x := archsimd.LoadFloat64x4([]float64{0, 1, 10, 11})
   897  	y := archsimd.LoadFloat64x4([]float64{2, 3, 12, 13})
   898  
   899  	ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
   900  	hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
   901  	lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
   902  	hl := x.ConcatPermuteScalarsGrouped(2, 1, y)
   903  
   904  	r := make([]float64, 4, 4)
   905  
   906  	foo := func(v archsimd.Float64x4, a, b float64) {
   907  		v.Store(r)
   908  		checkSlices[float64](t, r, []float64{a, b, a + 10, b + 10})
   909  	}
   910  
   911  	foo(ll, 0, 1)
   912  	foo(hh, 3, 2)
   913  	foo(lh, 0, 3)
   914  	foo(hl, 2, 1)
   915  }
   916  
   917  func TestSelect2FromPairConstGroupedInt(t *testing.T) {
   918  	x := archsimd.LoadInt64x4([]int64{0, 1, 10, 11})
   919  	y := archsimd.LoadInt64x4([]int64{2, 3, 12, 13})
   920  
   921  	ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
   922  	hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
   923  	lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
   924  	hl := x.ConcatPermuteScalarsGrouped(2, 1, y)
   925  
   926  	r := make([]int64, 4, 4)
   927  
   928  	foo := func(v archsimd.Int64x4, a, b int64) {
   929  		v.Store(r)
   930  		checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10})
   931  	}
   932  
   933  	foo(ll, 0, 1)
   934  	foo(hh, 3, 2)
   935  	foo(lh, 0, 3)
   936  	foo(hl, 2, 1)
   937  }
   938  
   939  func TestSelect2FromPairConstGroupedInt512(t *testing.T) {
   940  	if !archsimd.X86.AVX512() {
   941  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   942  		return
   943  	}
   944  
   945  	x := archsimd.LoadInt64x8([]int64{0, 1, 10, 11, 20, 21, 30, 31})
   946  	y := archsimd.LoadInt64x8([]int64{2, 3, 12, 13, 22, 23, 32, 33})
   947  
   948  	ll := x.ConcatPermuteScalarsGrouped(0, 1, y)
   949  	hh := x.ConcatPermuteScalarsGrouped(3, 2, y)
   950  	lh := x.ConcatPermuteScalarsGrouped(0, 3, y)
   951  	hl := x.ConcatPermuteScalarsGrouped(2, 1, y)
   952  
   953  	r := make([]int64, 8, 8)
   954  
   955  	foo := func(v archsimd.Int64x8, a, b int64) {
   956  		v.Store(r)
   957  		checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10, a + 20, b + 20, a + 30, b + 30})
   958  	}
   959  
   960  	foo(ll, 0, 1)
   961  	foo(hh, 3, 2)
   962  	foo(lh, 0, 3)
   963  	foo(hl, 2, 1)
   964  }
   965  
   966  func TestStringAMD64(t *testing.T) {
   967  	x := archsimd.LoadUint32x4([]uint32{0, 1, 2, 3})
   968  	y := archsimd.LoadInt64x4([]int64{-4, -5, -6, -7})
   969  	z := archsimd.LoadFloat32x4([]float32{0.5, 1.5, -2.5, 3.5e9})
   970  	w := archsimd.LoadFloat64x4([]float64{0.5, 1.5, -2.5, 3.5e9})
   971  
   972  	sx := "{0,1,2,3}"
   973  	sy := "{-4,-5,-6,-7}"
   974  	sz := "{0.5,1.5,-2.5,3.5e+09}"
   975  	sw := sz
   976  
   977  	if x.String() != sx {
   978  		t.Errorf("x=%s wanted %s", x, sx)
   979  	}
   980  	if y.String() != sy {
   981  		t.Errorf("y=%s wanted %s", y, sy)
   982  	}
   983  	if z.String() != sz {
   984  		t.Errorf("z=%s wanted %s", z, sz)
   985  	}
   986  	if w.String() != sw {
   987  		t.Errorf("w=%s wanted %s", w, sw)
   988  	}
   989  	t.Logf("w=%s", w)
   990  	t.Logf("x=%s", x)
   991  	t.Logf("y=%s", y)
   992  	t.Logf("z=%s", z)
   993  }
   994  
   995  func TestMaskString(t *testing.T) {
   996  	x := archsimd.LoadUint32x4([]uint32{0, 1, 2, 3})
   997  	var y archsimd.Uint32x4
   998  
   999  	m := x.Equal(y)
  1000  
  1001  	w := "{1,0,0,0}"
  1002  
  1003  	if g := m.String(); g != w {
  1004  		t.Errorf("got=%s wanted %s", g, w)
  1005  	}
  1006  }
  1007  
  1008  // a returns an slice of 16 int32
  1009  func a() []int32 {
  1010  	return make([]int32, 16, 16)
  1011  }
  1012  
  1013  // applyTo3 returns a 16-element slice of the results of
  1014  // applying f to the respective elements of vectors x, y, and z.
  1015  func applyTo3(x, y, z archsimd.Int32x16, f func(x, y, z int32) int32) []int32 {
  1016  	ax, ay, az := a(), a(), a()
  1017  	x.Store(ax)
  1018  	y.Store(ay)
  1019  	z.Store(az)
  1020  
  1021  	r := a()
  1022  	for i := range r {
  1023  		r[i] = f(ax[i], ay[i], az[i])
  1024  	}
  1025  	return r
  1026  }
  1027  
  1028  // applyTo4 returns a 16-element slice of the results of
  1029  // applying f to the respective elements of vectors x, y, z, and w.
  1030  func applyTo4(x, y, z, w archsimd.Int32x16, f func(x, y, z, w int32) int32) []int32 {
  1031  	ax, ay, az, aw := a(), a(), a(), a()
  1032  	x.Store(ax)
  1033  	y.Store(ay)
  1034  	z.Store(az)
  1035  	w.Store(aw)
  1036  
  1037  	r := make([]int32, len(ax), len(ax))
  1038  	for i := range r {
  1039  		r[i] = f(ax[i], ay[i], az[i], aw[i])
  1040  	}
  1041  	return r
  1042  }
  1043  
  1044  func TestSelectTernOptInt32x16(t *testing.T) {
  1045  	if !archsimd.X86.AVX512() {
  1046  		t.Skip("Test requires X86.AVX512, not available on this hardware")
  1047  		return
  1048  	}
  1049  	ax := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
  1050  	ay := []int32{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}
  1051  	az := []int32{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}
  1052  	aw := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
  1053  	am := []int32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
  1054  
  1055  	x := archsimd.LoadInt32x16(ax)
  1056  	y := archsimd.LoadInt32x16(ay)
  1057  	z := archsimd.LoadInt32x16(az)
  1058  	w := archsimd.LoadInt32x16(aw)
  1059  	m := archsimd.LoadInt32x16(am)
  1060  
  1061  	foo := func(v archsimd.Int32x16, s []int32) {
  1062  		r := make([]int32, 16, 16)
  1063  		v.Store(r)
  1064  		checkSlices[int32](t, r, s)
  1065  	}
  1066  
  1067  	t0 := w.Xor(y).Xor(z)
  1068  	ft0 := func(w, y, z int32) int32 {
  1069  		return w ^ y ^ z
  1070  	}
  1071  	foo(t0, applyTo3(w, y, z, ft0))
  1072  
  1073  	t1 := m.And(w.Xor(y).Xor(z.Not()))
  1074  	ft1 := func(m, w, y, z int32) int32 {
  1075  		return m & (w ^ y ^ ^z)
  1076  	}
  1077  	foo(t1, applyTo4(m, w, y, z, ft1))
  1078  
  1079  	t2 := x.Xor(y).Xor(z).And(x.Xor(y).Xor(z.Not()))
  1080  	ft2 := func(x, y, z int32) int32 {
  1081  		return (x ^ y ^ z) & (x ^ y ^ ^z)
  1082  	}
  1083  	foo(t2, applyTo3(x, y, z, ft2))
  1084  }
  1085  
  1086  func TestMaskedMerge(t *testing.T) {
  1087  	if !archsimd.X86.AVX2() {
  1088  		t.Skip("Test requires X86.AVX2, not available on this hardware")
  1089  		return
  1090  	}
  1091  	x := archsimd.LoadInt64x4([]int64{1, 2, 3, 4})
  1092  	y := archsimd.LoadInt64x4([]int64{5, 6, 1, 1})
  1093  	z := archsimd.LoadInt64x4([]int64{-1, -2, -3, -4})
  1094  	res := make([]int64, 4)
  1095  	expected := []int64{6, 8, -3, -4}
  1096  	mask := x.Less(y)
  1097  	if archsimd.X86.AVX512() {
  1098  		x.Add(y).Merge(z, mask).Store(res)
  1099  	} else {
  1100  		x.Add(y).Merge(z, mask).Store(res)
  1101  	}
  1102  	for i := range 4 {
  1103  		if res[i] != expected[i] {
  1104  			t.Errorf("got %d wanted %d", res[i], expected[i])
  1105  		}
  1106  	}
  1107  }
  1108  
  1109  func TestMaskedIfElse(t *testing.T) {
  1110  	if !archsimd.X86.AVX2() {
  1111  		t.Skip("Test requires X86.AVX2, not available on this hardware")
  1112  		return
  1113  	}
  1114  	x := archsimd.LoadInt64x4([]int64{1, 2, 3, 4})
  1115  	y := archsimd.LoadInt64x4([]int64{5, 6, 1, 1})
  1116  	z := archsimd.LoadInt64x4([]int64{-1, -2, -3, -4})
  1117  	res := make([]int64, 4)
  1118  	expected := []int64{6, 8, -3, -4}
  1119  	mask := x.Less(y)
  1120  	if archsimd.X86.AVX512() {
  1121  		x.Add(y).IfElse(mask, z).Store(res)
  1122  	} else {
  1123  		x.Add(y).IfElse(mask, z).Store(res)
  1124  	}
  1125  	for i := range 4 {
  1126  		if res[i] != expected[i] {
  1127  			t.Errorf("got %d wanted %d", res[i], expected[i])
  1128  		}
  1129  	}
  1130  }
  1131  
  1132  func TestPermuteScalars(t *testing.T) {
  1133  	x := []int32{11, 12, 13, 14}
  1134  	want := []int32{12, 13, 14, 11}
  1135  	got := make([]int32, 4)
  1136  	archsimd.LoadInt32x4(x).PermuteScalars(1, 2, 3, 0).Store(got)
  1137  	checkSlices(t, got, want)
  1138  }
  1139  
  1140  func TestPermuteScalarsGrouped(t *testing.T) {
  1141  	if !archsimd.X86.AVX2() {
  1142  		t.Skip("Test requires X86.AVX2, not available on this hardware")
  1143  		return
  1144  	}
  1145  	x := []int32{11, 12, 13, 14, 21, 22, 23, 24}
  1146  	want := []int32{12, 13, 14, 11, 22, 23, 24, 21}
  1147  	got := make([]int32, 8)
  1148  	archsimd.LoadInt32x8(x).PermuteScalarsGrouped(1, 2, 3, 0).Store(got)
  1149  	checkSlices(t, got, want)
  1150  }
  1151  
  1152  func TestPermuteScalarsHi(t *testing.T) {
  1153  	x := []int16{-1, -2, -3, -4, 11, 12, 13, 14}
  1154  	want := []int16{-1, -2, -3, -4, 12, 13, 14, 11}
  1155  	got := make([]int16, len(x))
  1156  	archsimd.LoadInt16x8(x).PermuteScalarsHi(1, 2, 3, 0).Store(got)
  1157  	checkSlices(t, got, want)
  1158  }
  1159  
  1160  func TestPermuteScalarsLo(t *testing.T) {
  1161  	x := []int16{11, 12, 13, 14, 4, 5, 6, 7}
  1162  	want := []int16{12, 13, 14, 11, 4, 5, 6, 7}
  1163  	got := make([]int16, len(x))
  1164  	archsimd.LoadInt16x8(x).PermuteScalarsLo(1, 2, 3, 0).Store(got)
  1165  	checkSlices(t, got, want)
  1166  }
  1167  
  1168  func TestPermuteScalarsHiGrouped(t *testing.T) {
  1169  	if !archsimd.X86.AVX2() {
  1170  		t.Skip("Test requires X86.AVX2, not available on this hardware")
  1171  		return
  1172  	}
  1173  	x := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114}
  1174  	want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111}
  1175  	got := make([]int16, len(x))
  1176  	archsimd.LoadInt16x16(x).PermuteScalarsHiGrouped(1, 2, 3, 0).Store(got)
  1177  	checkSlices(t, got, want)
  1178  }
  1179  
  1180  func TestPermuteScalarsLoGrouped(t *testing.T) {
  1181  	if !archsimd.X86.AVX2() {
  1182  		t.Skip("Test requires X86.AVX2, not available on this hardware")
  1183  		return
  1184  	}
  1185  	x := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17}
  1186  	want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17}
  1187  	got := make([]int16, len(x))
  1188  	archsimd.LoadInt16x16(x).PermuteScalarsLoGrouped(1, 2, 3, 0).Store(got)
  1189  	checkSlices(t, got, want)
  1190  }
  1191  
  1192  func TestClMul(t *testing.T) {
  1193  	var x = archsimd.LoadUint64x2([]uint64{1, 5})
  1194  	var y = archsimd.LoadUint64x2([]uint64{3, 9})
  1195  
  1196  	foo := func(v archsimd.Uint64x2, s []uint64) {
  1197  		r := make([]uint64, 2, 2)
  1198  		v.Store(r)
  1199  		checkSlices[uint64](t, r, s)
  1200  	}
  1201  
  1202  	foo(x.CarrylessMultiplyEven(y), []uint64{3, 0})
  1203  	foo(x.CarrylessMultiplyEvenOdd(y), []uint64{9, 0})
  1204  	foo(x.CarrylessMultiplyOddEven(y), []uint64{15, 0})
  1205  	foo(x.CarrylessMultiplyOdd(y), []uint64{45, 0})
  1206  	foo(y.CarrylessMultiplyEven(y), []uint64{5, 0})
  1207  
  1208  }
  1209  
  1210  func addPairsSlice[T number](a, b []T) []T {
  1211  	r := make([]T, len(a))
  1212  	for i := range len(a) / 2 {
  1213  		r[i] = a[2*i] + a[2*i+1]
  1214  		r[i+len(a)/2] = b[2*i] + b[2*i+1]
  1215  	}
  1216  	return r
  1217  }
  1218  
  1219  func subPairsSlice[T number](a, b []T) []T {
  1220  	r := make([]T, len(a))
  1221  	for i := range len(a) / 2 {
  1222  		r[i] = a[2*i] - a[2*i+1]
  1223  		r[i+len(a)/2] = b[2*i] - b[2*i+1]
  1224  	}
  1225  	return r
  1226  }
  1227  
  1228  func addPairsGroupedSlice[T number](a, b []T) []T {
  1229  	group := int(128 / unsafe.Sizeof(a[0]))
  1230  	r := make([]T, 0, len(a))
  1231  	for i := range len(a) / group {
  1232  		r = append(r, addPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
  1233  	}
  1234  	return r
  1235  }
  1236  
  1237  func subPairsGroupedSlice[T number](a, b []T) []T {
  1238  	group := int(128 / unsafe.Sizeof(a[0]))
  1239  	r := make([]T, 0, len(a))
  1240  	for i := range len(a) / group {
  1241  		r = append(r, subPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
  1242  	}
  1243  	return r
  1244  }
  1245  
  1246  func TestAddSubPairs(t *testing.T) {
  1247  	testInt16x8Binary(t, archsimd.Int16x8.ConcatAddPairs, addPairsSlice[int16])
  1248  	testInt16x8Binary(t, archsimd.Int16x8.ConcatSubPairs, subPairsSlice[int16])
  1249  	testUint16x8Binary(t, archsimd.Uint16x8.ConcatAddPairs, addPairsSlice[uint16])
  1250  	testUint16x8Binary(t, archsimd.Uint16x8.ConcatSubPairs, subPairsSlice[uint16])
  1251  	testInt32x4Binary(t, archsimd.Int32x4.ConcatAddPairs, addPairsSlice[int32])
  1252  	testInt32x4Binary(t, archsimd.Int32x4.ConcatSubPairs, subPairsSlice[int32])
  1253  	testUint32x4Binary(t, archsimd.Uint32x4.ConcatAddPairs, addPairsSlice[uint32])
  1254  	testUint32x4Binary(t, archsimd.Uint32x4.ConcatSubPairs, subPairsSlice[uint32])
  1255  	testFloat32x4Binary(t, archsimd.Float32x4.ConcatAddPairs, addPairsSlice[float32])
  1256  	testFloat32x4Binary(t, archsimd.Float32x4.ConcatSubPairs, subPairsSlice[float32])
  1257  	testFloat64x2Binary(t, archsimd.Float64x2.ConcatAddPairs, addPairsSlice[float64])
  1258  	testFloat64x2Binary(t, archsimd.Float64x2.ConcatSubPairs, subPairsSlice[float64])
  1259  
  1260  	// Grouped versions
  1261  	if archsimd.X86.AVX2() {
  1262  		testInt16x16Binary(t, archsimd.Int16x16.ConcatAddPairsGrouped, addPairsGroupedSlice[int16])
  1263  		testInt16x16Binary(t, archsimd.Int16x16.ConcatSubPairsGrouped, subPairsGroupedSlice[int16])
  1264  		testUint16x16Binary(t, archsimd.Uint16x16.ConcatAddPairsGrouped, addPairsGroupedSlice[uint16])
  1265  		testUint16x16Binary(t, archsimd.Uint16x16.ConcatSubPairsGrouped, subPairsGroupedSlice[uint16])
  1266  		testInt32x8Binary(t, archsimd.Int32x8.ConcatAddPairsGrouped, addPairsGroupedSlice[int32])
  1267  		testInt32x8Binary(t, archsimd.Int32x8.ConcatSubPairsGrouped, subPairsGroupedSlice[int32])
  1268  		testUint32x8Binary(t, archsimd.Uint32x8.ConcatAddPairsGrouped, addPairsGroupedSlice[uint32])
  1269  		testUint32x8Binary(t, archsimd.Uint32x8.ConcatSubPairsGrouped, subPairsGroupedSlice[uint32])
  1270  		testFloat32x8Binary(t, archsimd.Float32x8.ConcatAddPairsGrouped, addPairsGroupedSlice[float32])
  1271  		testFloat32x8Binary(t, archsimd.Float32x8.ConcatSubPairsGrouped, subPairsGroupedSlice[float32])
  1272  		testFloat64x4Binary(t, archsimd.Float64x4.ConcatAddPairsGrouped, addPairsGroupedSlice[float64])
  1273  		testFloat64x4Binary(t, archsimd.Float64x4.ConcatSubPairsGrouped, subPairsGroupedSlice[float64])
  1274  	}
  1275  }
  1276  
  1277  func convConcatSlice[T, U number](a, b []T, conv func(T) U) []U {
  1278  	r := make([]U, len(a)+len(b))
  1279  	for i, v := range a {
  1280  		r[i] = conv(v)
  1281  	}
  1282  	for i, v := range b {
  1283  		r[len(a)+i] = conv(v)
  1284  	}
  1285  	return r
  1286  }
  1287  
  1288  func convConcatGroupedSlice[T, U number](a, b []T, conv func(T) U) []U {
  1289  	group := int(128 / unsafe.Sizeof(a[0]))
  1290  	r := make([]U, 0, len(a)+len(b))
  1291  	for i := 0; i < len(a)/group; i++ {
  1292  		r = append(r, convConcatSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group], conv)...)
  1293  	}
  1294  	return r
  1295  }
  1296  
  1297  func TestSaturateConcat(t *testing.T) {
  1298  	// Int32x4.SaturateToInt16Concat
  1299  	forSlicePair(t, int32s, 4, func(x, y []int32) bool {
  1300  		a, b := archsimd.LoadInt32x4(x), archsimd.LoadInt32x4(y)
  1301  		var out [8]int16
  1302  		a.SaturateToInt16Concat(b).StoreArray(&out)
  1303  		want := convConcatSlice(x, y, satToInt16)
  1304  		return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
  1305  	})
  1306  	// Int32x4.SaturateToUint16Concat
  1307  	forSlicePair(t, int32s, 4, func(x, y []int32) bool {
  1308  		a, b := archsimd.LoadInt32x4(x), archsimd.LoadInt32x4(y)
  1309  		var out [8]uint16
  1310  		a.SaturateToUint16Concat(b).StoreArray(&out)
  1311  		want := convConcatSlice(x, y, satToUint16)
  1312  		return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
  1313  	})
  1314  
  1315  	if archsimd.X86.AVX2() {
  1316  		// Int32x8.SaturateToInt16ConcatGrouped
  1317  		forSlicePair(t, int32s, 8, func(x, y []int32) bool {
  1318  			a, b := archsimd.LoadInt32x8(x), archsimd.LoadInt32x8(y)
  1319  			var out [16]int16
  1320  			a.SaturateToInt16ConcatGrouped(b).StoreArray(&out)
  1321  			want := convConcatGroupedSlice(x, y, satToInt16)
  1322  			return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
  1323  		})
  1324  		// Int32x8.SaturateToUint16ConcatGrouped
  1325  		forSlicePair(t, int32s, 8, func(x, y []int32) bool {
  1326  			a, b := archsimd.LoadInt32x8(x), archsimd.LoadInt32x8(y)
  1327  			var out [16]uint16
  1328  			a.SaturateToUint16ConcatGrouped(b).StoreArray(&out)
  1329  			want := convConcatGroupedSlice(x, y, satToUint16)
  1330  			return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
  1331  		})
  1332  	}
  1333  
  1334  	if archsimd.X86.AVX512() {
  1335  		// Int32x16.SaturateToInt16ConcatGrouped
  1336  		forSlicePair(t, int32s, 16, func(x, y []int32) bool {
  1337  			a, b := archsimd.LoadInt32x16(x), archsimd.LoadInt32x16(y)
  1338  			var out [32]int16
  1339  			a.SaturateToInt16ConcatGrouped(b).StoreArray(&out)
  1340  			want := convConcatGroupedSlice(x, y, satToInt16)
  1341  			return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
  1342  		})
  1343  		// Int32x16.SaturateToUint16ConcatGrouped
  1344  		forSlicePair(t, int32s, 16, func(x, y []int32) bool {
  1345  			a, b := archsimd.LoadInt32x16(x), archsimd.LoadInt32x16(y)
  1346  			var out [32]uint16
  1347  			a.SaturateToUint16ConcatGrouped(b).StoreArray(&out)
  1348  			want := convConcatGroupedSlice(x, y, satToUint16)
  1349  			return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
  1350  		})
  1351  	}
  1352  }
  1353  
  1354  func testMaskOr8x64(t *testing.T) {
  1355  	if !archsimd.X86.AVX512() {
  1356  		return // compiler needs to see check+return to ensure mask register use
  1357  	}
  1358  	s := make([]int8, 64)
  1359  	want := []int8{-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0,
  1360  		-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0,
  1361  		-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, 0,
  1362  		-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, -1}
  1363  	var a archsimd.Int8x64
  1364  	b := archsimd.LoadInt8x64(want)
  1365  	m1 := a.Less(a)
  1366  	m2 := b.Less(a)
  1367  	m3 := m1.Or(m2)
  1368  	c := m3.ToInt8x64()
  1369  	c.Store(s)
  1370  	checkSlices(t, s, want)
  1371  }
  1372  
  1373  func testMaskOr16x32(t *testing.T) {
  1374  	if !archsimd.X86.AVX512() {
  1375  		return // compiler needs to see check+return to ensure mask register use
  1376  	}
  1377  	s := make([]int16, 32)
  1378  	want := []int16{-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, 0,
  1379  		-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, -1}
  1380  	var a archsimd.Int16x32
  1381  	b := archsimd.LoadInt16x32(want)
  1382  	m1 := a.Less(a)
  1383  	m2 := b.Less(a)
  1384  	m3 := m1.Or(m2)
  1385  	c := m3.ToInt16x32()
  1386  	c.Store(s)
  1387  	checkSlices(t, s, want)
  1388  }
  1389  
  1390  func testMaskOr32x16(t *testing.T) {
  1391  	if !archsimd.X86.AVX512() {
  1392  		return // compiler needs to see check+return to ensure mask register use
  1393  	}
  1394  	s := make([]int32, 16)
  1395  	want := []int32{-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, -1}
  1396  	var a archsimd.Int32x16
  1397  	b := archsimd.LoadInt32x16(want)
  1398  	m1 := a.Less(a)
  1399  	m2 := b.Less(a)
  1400  	m3 := m1.Or(m2)
  1401  	c := m3.ToInt32x16()
  1402  	c.Store(s)
  1403  	checkSlices(t, s, want)
  1404  }
  1405  
  1406  func testMaskOr64x8(t *testing.T) {
  1407  	if !archsimd.X86.AVX512() {
  1408  		return // compiler needs to see check+return to ensure mask register use
  1409  	}
  1410  	s := make([]int64, 8)
  1411  	want := []int64{-1, 0, 0, 0, 0, 0, -1, -1}
  1412  	var a archsimd.Int64x8
  1413  	b := archsimd.LoadInt64x8(want)
  1414  	m1 := a.Less(a)
  1415  	m2 := b.Less(a)
  1416  	m3 := m1.Or(m2)
  1417  	c := m3.ToInt64x8()
  1418  	c.Store(s)
  1419  	checkSlices(t, s, want)
  1420  }
  1421  
  1422  func testMaskOr8x32(t *testing.T) {
  1423  	if !archsimd.X86.AVX512() {
  1424  		return // compiler needs to see check+return to ensure mask register use
  1425  	}
  1426  	s := make([]int8, 32)
  1427  	want := []int8{-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, 0,
  1428  		-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, -1}
  1429  	var a archsimd.Int8x32
  1430  	b := archsimd.LoadInt8x32(want)
  1431  	m1 := a.Less(a)
  1432  	m2 := b.Less(a)
  1433  	m3 := m1.Or(m2)
  1434  	c := m3.ToInt8x32()
  1435  	c.Store(s)
  1436  	checkSlices(t, s, want)
  1437  }
  1438  
  1439  func testMaskOr16x16(t *testing.T) {
  1440  	if !archsimd.X86.AVX512() {
  1441  		return // compiler needs to see check+return to ensure mask register use
  1442  	}
  1443  	s := make([]int16, 16)
  1444  	want := []int16{-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, -1}
  1445  	var a archsimd.Int16x16
  1446  	b := archsimd.LoadInt16x16(want)
  1447  	m1 := a.Less(a)
  1448  	m2 := b.Less(a)
  1449  	m3 := m1.Or(m2)
  1450  	c := m3.ToInt16x16()
  1451  	c.Store(s)
  1452  	checkSlices(t, s, want)
  1453  }
  1454  
  1455  func testMaskOr32x8(t *testing.T) {
  1456  	if !archsimd.X86.AVX512() {
  1457  		return // compiler needs to see check+return to ensure mask register use
  1458  	}
  1459  	s := make([]int32, 8)
  1460  	want := []int32{-1, 0, 0, 0, 0, 0, -1, -1}
  1461  	var a archsimd.Int32x8
  1462  	b := archsimd.LoadInt32x8(want)
  1463  	m1 := a.Less(a)
  1464  	m2 := b.Less(a)
  1465  	m3 := m1.Or(m2)
  1466  	c := m3.ToInt32x8()
  1467  	c.Store(s)
  1468  	checkSlices(t, s, want)
  1469  }
  1470  
  1471  func testMaskOr64x4(t *testing.T) {
  1472  	if !archsimd.X86.AVX512() {
  1473  		return // compiler needs to see check+return to ensure mask register use
  1474  	}
  1475  	s := make([]int64, 4)
  1476  	want := []int64{-1, 0, 0, -1}
  1477  	var a archsimd.Int64x4
  1478  	b := archsimd.LoadInt64x4(want)
  1479  	m1 := a.Less(a)
  1480  	m2 := b.Less(a)
  1481  	m3 := m1.Or(m2)
  1482  	c := m3.ToInt64x4()
  1483  	c.Store(s)
  1484  	checkSlices(t, s, want)
  1485  }
  1486  
  1487  func testMaskOr8x16(t *testing.T) {
  1488  	if !archsimd.X86.AVX512() {
  1489  		return // compiler needs to see check+return to ensure mask register use
  1490  	}
  1491  	s := make([]int8, 16)
  1492  	want := []int8{-1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, -1}
  1493  	var a archsimd.Int8x16
  1494  	b := archsimd.LoadInt8x16(want)
  1495  	m1 := a.Less(a)
  1496  	m2 := b.Less(a)
  1497  	m3 := m1.Or(m2)
  1498  	c := m3.ToInt8x16()
  1499  	c.Store(s)
  1500  	checkSlices(t, s, want)
  1501  }
  1502  
  1503  func testMaskOr16x8(t *testing.T) {
  1504  	if !archsimd.X86.AVX512() {
  1505  		return // compiler needs to see check+return to ensure mask register use
  1506  	}
  1507  	s := make([]int16, 8)
  1508  	want := []int16{-1, 0, 0, 0, 0, 0, -1, -1}
  1509  	var a archsimd.Int16x8
  1510  	b := archsimd.LoadInt16x8(want)
  1511  	m1 := a.Less(a)
  1512  	m2 := b.Less(a)
  1513  	m3 := m1.Or(m2)
  1514  	c := m3.ToInt16x8()
  1515  	c.Store(s)
  1516  	checkSlices(t, s, want)
  1517  }
  1518  
  1519  func testMaskOr32x4(t *testing.T) {
  1520  	if !archsimd.X86.AVX512() {
  1521  		return // compiler needs to see check+return to ensure mask register use
  1522  	}
  1523  	s := make([]int32, 4)
  1524  	want := []int32{-1, 0, 0, -1}
  1525  	var a archsimd.Int32x4
  1526  	b := archsimd.LoadInt32x4(want)
  1527  	m1 := a.Less(a)
  1528  	m2 := b.Less(a)
  1529  	m3 := m1.Or(m2)
  1530  	c := m3.ToInt32x4()
  1531  	c.Store(s)
  1532  	checkSlices(t, s, want)
  1533  }
  1534  
  1535  func testMaskOr64x2(t *testing.T) {
  1536  	if !archsimd.X86.AVX512() {
  1537  		return // compiler needs to see check+return to ensure mask register use
  1538  	}
  1539  	s := make([]int64, 2)
  1540  	want := []int64{-1, 0}
  1541  	var a archsimd.Int64x2
  1542  	b := archsimd.LoadInt64x2(want)
  1543  	m1 := a.Less(a)
  1544  	m2 := b.Less(a)
  1545  	m3 := m1.Or(m2)
  1546  	c := m3.ToInt64x2()
  1547  	c.Store(s)
  1548  	checkSlices(t, s, want)
  1549  }
  1550  
  1551  func TestMaskOr(t *testing.T) {
  1552  	if !archsimd.X86.AVX512() {
  1553  		t.Skip("Test requires X86.AVX512, not available on this hardware")
  1554  	}
  1555  	testMaskOr8x64(t)
  1556  	testMaskOr16x32(t)
  1557  	testMaskOr32x16(t)
  1558  	testMaskOr64x8(t)
  1559  	testMaskOr8x32(t)
  1560  	testMaskOr16x16(t)
  1561  	testMaskOr32x8(t)
  1562  	testMaskOr64x4(t)
  1563  	testMaskOr8x16(t)
  1564  	testMaskOr16x8(t)
  1565  	testMaskOr32x4(t)
  1566  	testMaskOr64x2(t)
  1567  }
  1568  

View as plain text