Source file src/simd/simd_emulated.go

     1  // Copyright 2026 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build goexperiment.simd && !(amd64 || wasm || arm64)
     6  
     7  package simd
     8  
     9  import (
    10  	"fmt"
    11  	"math"
    12  	"math/bits"
    13  )
    14  
    15  // VectorSize returns the bit length of the emulated vector (fixed to 128).
    16  func VectorBitSize() int {
    17  	return 128
    18  }
    19  
    20  // Emulated returns whether simd is emulated.
    21  func Emulated() bool {
    22  	return true
    23  }
    24  
    25  // HasHardwareCarrylessMultiply returns whether this platform
    26  // has a hardware-implemented version of carryless multiply.
    27  // With default GODEBUG=simd settings, if this is false,
    28  // it is emulated and merely slow, but with non-default settings
    29  // this can indicate the possibility of a missing instruction
    30  // that will fail ("SIGILL") if it is executed.
    31  func HasHardwareCarrylessMultiply() bool {
    32  	return false
    33  }
    34  
    35  // LoadInt8s loads a slice of int8 into an Int8s vector.
    36  func LoadInt8s(s []int8) Int8s {
    37  	var a, b uint64
    38  	for i := 0; i < 16; i++ {
    39  		val := uint64(uint8(s[i]))
    40  		if i < 8 {
    41  			a |= val << (8 * i)
    42  		} else {
    43  			b |= val << (8 * (i - 8))
    44  		}
    45  	}
    46  	return Int8s{a: a, b: b}
    47  }
    48  
    49  // LoadInt8sPart loads a partial slice of int8 into an Int8s vector.
    50  func LoadInt8sPart(s []int8) (Int8s, int) {
    51  	var a, b uint64
    52  	n := len(s)
    53  	if n > 16 {
    54  		n = 16
    55  	}
    56  	for i := 0; i < n; i++ {
    57  		val := uint64(uint8(s[i]))
    58  		if i < 8 {
    59  			a |= val << (8 * i)
    60  		} else {
    61  			b |= val << (8 * (i - 8))
    62  		}
    63  	}
    64  	return Int8s{a: a, b: b}, n
    65  }
    66  
    67  func (x Int8s) get(i int) int8 {
    68  	if i < 8 {
    69  		return int8(x.a >> (8 * i))
    70  	}
    71  	return int8(x.b >> (8 * (i - 8)))
    72  }
    73  
    74  func (x *Int8s) set(i int, v int8) {
    75  	val := uint64(uint8(v))
    76  	if i < 8 {
    77  		mask := uint64(0xff) << (8 * i)
    78  		x.a = (x.a &^ mask) | (val << (8 * i))
    79  	} else {
    80  		mask := uint64(0xff) << (8 * (i - 8))
    81  		x.b = (x.b &^ mask) | (val << (8 * (i - 8)))
    82  	}
    83  }
    84  
    85  // Abs returns the element-wise absolute value of x.
    86  func (x Int8s) Abs() Int8s {
    87  	var res Int8s
    88  	for i := 0; i < 16; i++ {
    89  		v := x.get(i)
    90  		if v < 0 {
    91  			res.set(i, -v)
    92  		} else {
    93  			res.set(i, v)
    94  		}
    95  	}
    96  	return res
    97  }
    98  
    99  // Add returns the element-wise sum of x and y.
   100  func (x Int8s) Add(y Int8s) Int8s {
   101  	var res Int8s
   102  	for i := 0; i < 16; i++ {
   103  		res.set(i, x.get(i)+y.get(i))
   104  	}
   105  	return res
   106  }
   107  
   108  // AddSaturated returns the element-wise saturated sum of x and y.
   109  func (x Int8s) AddSaturated(y Int8s) Int8s {
   110  	var res Int8s
   111  	for i := 0; i < 16; i++ {
   112  		sum := int(x.get(i)) + int(y.get(i))
   113  		if sum > math.MaxInt8 {
   114  			res.set(i, math.MaxInt8)
   115  		} else if sum < math.MinInt8 {
   116  			res.set(i, math.MinInt8)
   117  		} else {
   118  			res.set(i, int8(sum))
   119  		}
   120  	}
   121  	return res
   122  }
   123  
   124  // And returns the bitwise AND of x and y.
   125  func (x Int8s) And(y Int8s) Int8s {
   126  	return Int8s{a: x.a & y.a, b: x.b & y.b}
   127  }
   128  
   129  // AndNot returns the bitwise AND NOT of x and y.
   130  func (x Int8s) AndNot(y Int8s) Int8s {
   131  	return Int8s{a: x.a &^ y.a, b: x.b &^ y.b}
   132  }
   133  
   134  // Equal returns a mask indicating where x and y are equal.
   135  func (x Int8s) Equal(y Int8s) Mask8s {
   136  	var res Mask8s
   137  	for i := 0; i < 16; i++ {
   138  		if x.get(i) == y.get(i) {
   139  			res.set(i, true)
   140  		}
   141  	}
   142  	return res
   143  }
   144  
   145  // Greater returns a mask indicating where x is greater than y.
   146  func (x Int8s) Greater(y Int8s) Mask8s {
   147  	var res Mask8s
   148  	for i := 0; i < 16; i++ {
   149  		if x.get(i) > y.get(i) {
   150  			res.set(i, true)
   151  		}
   152  	}
   153  	return res
   154  }
   155  
   156  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
   157  func (x Int8s) GreaterEqual(y Int8s) Mask8s {
   158  	var res Mask8s
   159  	for i := 0; i < 16; i++ {
   160  		if x.get(i) >= y.get(i) {
   161  			res.set(i, true)
   162  		}
   163  	}
   164  	return res
   165  }
   166  
   167  // Less returns a mask indicating where x is less than y.
   168  func (x Int8s) Less(y Int8s) Mask8s {
   169  	var res Mask8s
   170  	for i := 0; i < 16; i++ {
   171  		if x.get(i) < y.get(i) {
   172  			res.set(i, true)
   173  		}
   174  	}
   175  	return res
   176  }
   177  
   178  // LessEqual returns a mask indicating where x is less than or equal to y.
   179  func (x Int8s) LessEqual(y Int8s) Mask8s {
   180  	var res Mask8s
   181  	for i := 0; i < 16; i++ {
   182  		if x.get(i) <= y.get(i) {
   183  			res.set(i, true)
   184  		}
   185  	}
   186  	return res
   187  }
   188  
   189  // NotEqual returns a mask indicating where x and y are not equal.
   190  func (x Int8s) NotEqual(y Int8s) Mask8s {
   191  	var res Mask8s
   192  	for i := 0; i < 16; i++ {
   193  		if x.get(i) != y.get(i) {
   194  			res.set(i, true)
   195  		}
   196  	}
   197  	return res
   198  }
   199  
   200  // Len returns the number of elements in the vector.
   201  func (x Int8s) Len() int {
   202  	return 16
   203  }
   204  
   205  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
   206  func (x Int8s) Masked(mask Mask8s) Int8s {
   207  	return Int8s{a: x.a & mask.a, b: x.b & mask.b}
   208  }
   209  
   210  // Max returns the element-wise maximum of x and y.
   211  func (x Int8s) Max(y Int8s) Int8s {
   212  	var res Int8s
   213  	for i := 0; i < 16; i++ {
   214  		vx := x.get(i)
   215  		vy := y.get(i)
   216  		if vx > vy {
   217  			res.set(i, vx)
   218  		} else {
   219  			res.set(i, vy)
   220  		}
   221  	}
   222  	return res
   223  }
   224  
   225  // Mul returns the element-wise product of x and y.
   226  func (x Int8s) Mul(y Int8s) Int8s {
   227  	var res Int8s
   228  	for i := 0; i < 16; i++ {
   229  		res.set(i, x.get(i)*y.get(i))
   230  	}
   231  	return res
   232  }
   233  
   234  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
   235  func (x Int8s) IfElse(mask Mask8s, y Int8s) Int8s {
   236  	return Int8s{
   237  		a: (x.a & mask.a) | (y.a &^ mask.a),
   238  		b: (x.b & mask.b) | (y.b &^ mask.b),
   239  	}
   240  }
   241  
   242  // Min returns the element-wise minimum of x and y.
   243  func (x Int8s) Min(y Int8s) Int8s {
   244  	var res Int8s
   245  	for i := 0; i < 16; i++ {
   246  		vx := x.get(i)
   247  		vy := y.get(i)
   248  		if vx < vy {
   249  			res.set(i, vx)
   250  		} else {
   251  			res.set(i, vy)
   252  		}
   253  	}
   254  	return res
   255  }
   256  
   257  // Neg returns the element-wise negation of x.
   258  func (x Int8s) Neg() Int8s {
   259  	var res Int8s
   260  	for i := 0; i < 16; i++ {
   261  		res.set(i, -x.get(i))
   262  	}
   263  	return res
   264  }
   265  
   266  // Not returns the bitwise NOT of x.
   267  func (x Int8s) Not() Int8s {
   268  	return Int8s{a: ^x.a, b: ^x.b}
   269  }
   270  
   271  // Or returns the bitwise OR of x and y.
   272  func (x Int8s) Or(y Int8s) Int8s {
   273  	return Int8s{a: x.a | y.a, b: x.b | y.b}
   274  }
   275  
   276  // Store stores the vector elements into the slice s.
   277  func (x Int8s) Store(s []int8) {
   278  	for i := 0; i < 16 && i < len(s); i++ {
   279  		s[i] = x.get(i)
   280  	}
   281  }
   282  
   283  // StorePart stores a partial vector into the slice s.
   284  func (x Int8s) StorePart(s []int8) int {
   285  	x.Store(s)
   286  	return min(len(s), x.Len())
   287  }
   288  
   289  // String returns a string representation of the vector.
   290  func (x Int8s) String() string {
   291  	var parts [16]int8
   292  	for i := 0; i < 16; i++ {
   293  		parts[i] = x.get(i)
   294  	}
   295  	return fmt.Sprint(parts)
   296  }
   297  
   298  // Sub returns the element-wise difference of x and y.
   299  func (x Int8s) Sub(y Int8s) Int8s {
   300  	var res Int8s
   301  	for i := 0; i < 16; i++ {
   302  		res.set(i, x.get(i)-y.get(i))
   303  	}
   304  	return res
   305  }
   306  
   307  // SubSaturated returns the element-wise saturated difference of x and y.
   308  func (x Int8s) SubSaturated(y Int8s) Int8s {
   309  	var res Int8s
   310  	for i := 0; i < 16; i++ {
   311  		diff := int(x.get(i)) - int(y.get(i))
   312  		if diff > math.MaxInt8 {
   313  			res.set(i, math.MaxInt8)
   314  		} else if diff < math.MinInt8 {
   315  			res.set(i, math.MinInt8)
   316  		} else {
   317  			res.set(i, int8(diff))
   318  		}
   319  	}
   320  	return res
   321  }
   322  
   323  // ToMask returns a mask representation of the vector.
   324  func (x Int8s) ToMask() Mask8s {
   325  	var res Mask8s
   326  	for i := 0; i < 16; i++ {
   327  		if x.get(i) != 0 {
   328  			res.set(i, true)
   329  		}
   330  	}
   331  	return res
   332  }
   333  
   334  // Xor returns the bitwise XOR of x and y.
   335  func (x Int8s) Xor(y Int8s) Int8s {
   336  	return Int8s{a: x.a ^ y.a, b: x.b ^ y.b}
   337  }
   338  
   339  // ConvertToUint8 converts the vector elements to uint8.
   340  func (x Int8s) ConvertToUint8() Uint8s {
   341  	return Uint8s{a: x.a, b: x.b}
   342  }
   343  
   344  // ToBits reinterprets the vector bits as a Uint8s vector.
   345  func (x Int8s) ToBits() Uint8s {
   346  	return Uint8s{a: x.a, b: x.b}
   347  }
   348  
   349  // LoadInt16s loads a slice of int16 into an Int16s vector.
   350  func LoadInt16s(s []int16) Int16s {
   351  	var a, b uint64
   352  	for i := 0; i < 8; i++ {
   353  		val := uint64(uint16(s[i]))
   354  		if i < 4 {
   355  			a |= val << (16 * i)
   356  		} else {
   357  			b |= val << (16 * (i - 4))
   358  		}
   359  	}
   360  	return Int16s{a: a, b: b}
   361  }
   362  
   363  // LoadInt16sPart loads a partial slice of int16 into an Int16s vector.
   364  func LoadInt16sPart(s []int16) (Int16s, int) {
   365  	var a, b uint64
   366  	n := len(s)
   367  	if n > 8 {
   368  		n = 8
   369  	}
   370  	for i := 0; i < n; i++ {
   371  		val := uint64(uint16(s[i]))
   372  		if i < 4 {
   373  			a |= val << (16 * i)
   374  		} else {
   375  			b |= val << (16 * (i - 4))
   376  		}
   377  	}
   378  	return Int16s{a: a, b: b}, n
   379  }
   380  
   381  func (x Int16s) get(i int) int16 {
   382  	if i < 4 {
   383  		return int16(x.a >> (16 * i))
   384  	}
   385  	return int16(x.b >> (16 * (i - 4)))
   386  }
   387  
   388  func (x *Int16s) set(i int, v int16) {
   389  	val := uint64(uint16(v))
   390  	if i < 4 {
   391  		mask := uint64(0xffff) << (16 * i)
   392  		x.a = (x.a &^ mask) | (val << (16 * i))
   393  	} else {
   394  		mask := uint64(0xffff) << (16 * (i - 4))
   395  		x.b = (x.b &^ mask) | (val << (16 * (i - 4)))
   396  	}
   397  }
   398  
   399  // Abs returns the element-wise absolute value of x.
   400  func (x Int16s) Abs() Int16s {
   401  	var res Int16s
   402  	for i := 0; i < 8; i++ {
   403  		v := x.get(i)
   404  		if v < 0 {
   405  			res.set(i, -v)
   406  		} else {
   407  			res.set(i, v)
   408  		}
   409  	}
   410  	return res
   411  }
   412  
   413  // Add returns the element-wise sum of x and y.
   414  func (x Int16s) Add(y Int16s) Int16s {
   415  	var res Int16s
   416  	for i := 0; i < 8; i++ {
   417  		res.set(i, x.get(i)+y.get(i))
   418  	}
   419  	return res
   420  }
   421  
   422  // AddSaturated returns the element-wise saturated sum of x and y.
   423  func (x Int16s) AddSaturated(y Int16s) Int16s {
   424  	var res Int16s
   425  	for i := 0; i < 8; i++ {
   426  		sum := int(x.get(i)) + int(y.get(i))
   427  		if sum > math.MaxInt16 {
   428  			res.set(i, math.MaxInt16)
   429  		} else if sum < math.MinInt16 {
   430  			res.set(i, math.MinInt16)
   431  		} else {
   432  			res.set(i, int16(sum))
   433  		}
   434  	}
   435  	return res
   436  }
   437  
   438  // And returns the bitwise AND of x and y.
   439  func (x Int16s) And(y Int16s) Int16s {
   440  	return Int16s{a: x.a & y.a, b: x.b & y.b}
   441  }
   442  
   443  // AndNot returns the bitwise AND NOT of x and y.
   444  func (x Int16s) AndNot(y Int16s) Int16s {
   445  	return Int16s{a: x.a &^ y.a, b: x.b &^ y.b}
   446  }
   447  
   448  // Equal returns a mask indicating where x and y are equal.
   449  func (x Int16s) Equal(y Int16s) Mask16s {
   450  	var res Mask16s
   451  	for i := 0; i < 8; i++ {
   452  		if x.get(i) == y.get(i) {
   453  			res.set(i, true)
   454  		}
   455  	}
   456  	return res
   457  }
   458  
   459  // Greater returns a mask indicating where x is greater than y.
   460  func (x Int16s) Greater(y Int16s) Mask16s {
   461  	var res Mask16s
   462  	for i := 0; i < 8; i++ {
   463  		if x.get(i) > y.get(i) {
   464  			res.set(i, true)
   465  		}
   466  	}
   467  	return res
   468  }
   469  
   470  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
   471  func (x Int16s) GreaterEqual(y Int16s) Mask16s {
   472  	var res Mask16s
   473  	for i := 0; i < 8; i++ {
   474  		if x.get(i) >= y.get(i) {
   475  			res.set(i, true)
   476  		}
   477  	}
   478  	return res
   479  }
   480  
   481  // Less returns a mask indicating where x is less than y.
   482  func (x Int16s) Less(y Int16s) Mask16s {
   483  	var res Mask16s
   484  	for i := 0; i < 8; i++ {
   485  		if x.get(i) < y.get(i) {
   486  			res.set(i, true)
   487  		}
   488  	}
   489  	return res
   490  }
   491  
   492  // LessEqual returns a mask indicating where x is less than or equal to y.
   493  func (x Int16s) LessEqual(y Int16s) Mask16s {
   494  	var res Mask16s
   495  	for i := 0; i < 8; i++ {
   496  		if x.get(i) <= y.get(i) {
   497  			res.set(i, true)
   498  		}
   499  	}
   500  	return res
   501  }
   502  
   503  // NotEqual returns a mask indicating where x and y are not equal.
   504  func (x Int16s) NotEqual(y Int16s) Mask16s {
   505  	var res Mask16s
   506  	for i := 0; i < 8; i++ {
   507  		if x.get(i) != y.get(i) {
   508  			res.set(i, true)
   509  		}
   510  	}
   511  	return res
   512  }
   513  
   514  // Len returns the number of elements in the vector.
   515  func (x Int16s) Len() int {
   516  	return 8
   517  }
   518  
   519  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
   520  func (x Int16s) Masked(mask Mask16s) Int16s {
   521  	return Int16s{a: x.a & mask.a, b: x.b & mask.b}
   522  }
   523  
   524  // Max returns the element-wise maximum of x and y.
   525  func (x Int16s) Max(y Int16s) Int16s {
   526  	var res Int16s
   527  	for i := 0; i < 8; i++ {
   528  		vx := x.get(i)
   529  		vy := y.get(i)
   530  		if vx > vy {
   531  			res.set(i, vx)
   532  		} else {
   533  			res.set(i, vy)
   534  		}
   535  	}
   536  	return res
   537  }
   538  
   539  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
   540  func (x Int16s) IfElse(mask Mask16s, y Int16s) Int16s {
   541  	return Int16s{
   542  		a: (x.a & mask.a) | (y.a &^ mask.a),
   543  		b: (x.b & mask.b) | (y.b &^ mask.b),
   544  	}
   545  }
   546  
   547  // Min returns the element-wise minimum of x and y.
   548  func (x Int16s) Min(y Int16s) Int16s {
   549  	var res Int16s
   550  	for i := 0; i < 8; i++ {
   551  		vx := x.get(i)
   552  		vy := y.get(i)
   553  		if vx < vy {
   554  			res.set(i, vx)
   555  		} else {
   556  			res.set(i, vy)
   557  		}
   558  	}
   559  	return res
   560  }
   561  
   562  // Mul returns the element-wise product of x and y.
   563  func (x Int16s) Mul(y Int16s) Int16s {
   564  	var res Int16s
   565  	for i := 0; i < 8; i++ {
   566  		res.set(i, x.get(i)*y.get(i))
   567  	}
   568  	return res
   569  }
   570  
   571  // Neg returns the element-wise negation of x.
   572  func (x Int16s) Neg() Int16s {
   573  	var res Int16s
   574  	for i := 0; i < 8; i++ {
   575  		res.set(i, -x.get(i))
   576  	}
   577  	return res
   578  }
   579  
   580  // Not returns the bitwise NOT of x.
   581  func (x Int16s) Not() Int16s {
   582  	return Int16s{a: ^x.a, b: ^x.b}
   583  }
   584  
   585  // Or returns the bitwise OR of x and y.
   586  func (x Int16s) Or(y Int16s) Int16s {
   587  	return Int16s{a: x.a | y.a, b: x.b | y.b}
   588  }
   589  
   590  // ShiftAllLeft shifts all elements left by y bits.
   591  func (x Int16s) ShiftAllLeft(y uint8) Int16s {
   592  	var res Int16s
   593  	for i := 0; i < 8; i++ {
   594  		res.set(i, x.get(i)<<y)
   595  	}
   596  	return res
   597  }
   598  
   599  // ShiftAllRight shifts all elements right by y bits.
   600  func (x Int16s) ShiftAllRight(y uint8) Int16s {
   601  	var res Int16s
   602  	for i := 0; i < 8; i++ {
   603  		res.set(i, x.get(i)>>y)
   604  	}
   605  	return res
   606  }
   607  
   608  // RotateAllLeft rotates all elements left by dist bits.
   609  func (x Int16s) RotateAllLeft(dist uint64) Int16s {
   610  	var res Int16s
   611  	d := dist & 15
   612  	for i := 0; i < 8; i++ {
   613  		u := uint16(x.get(i))
   614  		r := (u << d) | (u >> ((16 - d) & 15))
   615  		res.set(i, int16(r))
   616  	}
   617  	return res
   618  }
   619  
   620  // RotateAllRight rotates all elements right by dist bits.
   621  func (x Int16s) RotateAllRight(dist uint64) Int16s {
   622  	var res Int16s
   623  	d := dist & 15
   624  	for i := 0; i < 8; i++ {
   625  		u := uint16(x.get(i))
   626  		r := (u >> d) | (u << ((16 - d) & 15))
   627  		res.set(i, int16(r))
   628  	}
   629  	return res
   630  }
   631  
   632  // Store stores the vector elements into the slice s.
   633  func (x Int16s) Store(s []int16) {
   634  	for i := 0; i < 8 && i < len(s); i++ {
   635  		s[i] = x.get(i)
   636  	}
   637  }
   638  
   639  // StorePart stores a partial vector into the slice s.
   640  func (x Int16s) StorePart(s []int16) int {
   641  	x.Store(s)
   642  	return min(len(s), x.Len())
   643  }
   644  
   645  // String returns a string representation of the vector.
   646  func (x Int16s) String() string {
   647  	var parts [8]int16
   648  	for i := 0; i < 8; i++ {
   649  		parts[i] = x.get(i)
   650  	}
   651  	return fmt.Sprint(parts)
   652  }
   653  
   654  // Sub returns the element-wise difference of x and y.
   655  func (x Int16s) Sub(y Int16s) Int16s {
   656  	var res Int16s
   657  	for i := 0; i < 8; i++ {
   658  		res.set(i, x.get(i)-y.get(i))
   659  	}
   660  	return res
   661  }
   662  
   663  // SubSaturated returns the element-wise saturated difference of x and y.
   664  func (x Int16s) SubSaturated(y Int16s) Int16s {
   665  	var res Int16s
   666  	for i := 0; i < 8; i++ {
   667  		diff := int(x.get(i)) - int(y.get(i))
   668  		if diff > math.MaxInt16 {
   669  			res.set(i, math.MaxInt16)
   670  		} else if diff < math.MinInt16 {
   671  			res.set(i, math.MinInt16)
   672  		} else {
   673  			res.set(i, int16(diff))
   674  		}
   675  	}
   676  	return res
   677  }
   678  
   679  // ToMask returns a mask representation of the vector.
   680  func (x Int16s) ToMask() Mask16s {
   681  	var res Mask16s
   682  	for i := 0; i < 8; i++ {
   683  		if x.get(i) != 0 {
   684  			res.set(i, true)
   685  		}
   686  	}
   687  	return res
   688  }
   689  
   690  // Xor returns the bitwise XOR of x and y.
   691  func (x Int16s) Xor(y Int16s) Int16s {
   692  	return Int16s{a: x.a ^ y.a, b: x.b ^ y.b}
   693  }
   694  
   695  // ConvertToUint16 converts the vector elements to uint16.
   696  func (x Int16s) ConvertToUint16() Uint16s {
   697  	return Uint16s{a: x.a, b: x.b}
   698  }
   699  
   700  // ToBits reinterprets the vector bits as a Uint16s vector.
   701  func (x Int16s) ToBits() Uint16s {
   702  	return Uint16s{a: x.a, b: x.b}
   703  }
   704  
   705  // LoadInt32s loads a slice of int32 into an Int32s vector.
   706  func LoadInt32s(s []int32) Int32s {
   707  	var a, b uint64
   708  	for i := 0; i < 4; i++ {
   709  		val := uint64(uint32(s[i]))
   710  		if i < 2 {
   711  			a |= val << (32 * i)
   712  		} else {
   713  			b |= val << (32 * (i - 2))
   714  		}
   715  	}
   716  	return Int32s{a: a, b: b}
   717  }
   718  
   719  // LoadInt32sPart loads a partial slice of int32 into an Int32s vector.
   720  func LoadInt32sPart(s []int32) (Int32s, int) {
   721  	var a, b uint64
   722  	n := len(s)
   723  	if n > 4 {
   724  		n = 4
   725  	}
   726  	for i := 0; i < n; i++ {
   727  		val := uint64(uint32(s[i]))
   728  		if i < 2 {
   729  			a |= val << (32 * i)
   730  		} else {
   731  			b |= val << (32 * (i - 2))
   732  		}
   733  	}
   734  	return Int32s{a: a, b: b}, n
   735  }
   736  
   737  func (x Int32s) get(i int) int32 {
   738  	if i < 2 {
   739  		return int32(x.a >> (32 * i))
   740  	}
   741  	return int32(x.b >> (32 * (i - 2)))
   742  }
   743  
   744  func (x *Int32s) set(i int, v int32) {
   745  	val := uint64(uint32(v))
   746  	if i < 2 {
   747  		mask := uint64(0xffffffff) << (32 * i)
   748  		x.a = (x.a &^ mask) | (val << (32 * i))
   749  	} else {
   750  		mask := uint64(0xffffffff) << (32 * (i - 2))
   751  		x.b = (x.b &^ mask) | (val << (32 * (i - 2)))
   752  	}
   753  }
   754  
   755  // Abs returns the element-wise absolute value of x.
   756  func (x Int32s) Abs() Int32s {
   757  	var res Int32s
   758  	for i := 0; i < 4; i++ {
   759  		v := x.get(i)
   760  		if v < 0 {
   761  			res.set(i, -v)
   762  		} else {
   763  			res.set(i, v)
   764  		}
   765  	}
   766  	return res
   767  }
   768  
   769  // Add returns the element-wise sum of x and y.
   770  func (x Int32s) Add(y Int32s) Int32s {
   771  	var res Int32s
   772  	for i := 0; i < 4; i++ {
   773  		res.set(i, x.get(i)+y.get(i))
   774  	}
   775  	return res
   776  }
   777  
   778  // And returns the bitwise AND of x and y.
   779  func (x Int32s) And(y Int32s) Int32s {
   780  	return Int32s{a: x.a & y.a, b: x.b & y.b}
   781  }
   782  
   783  // AndNot returns the bitwise AND NOT of x and y.
   784  func (x Int32s) AndNot(y Int32s) Int32s {
   785  	return Int32s{a: x.a &^ y.a, b: x.b &^ y.b}
   786  }
   787  
   788  // ConvertToFloat32 converts the vector elements to float32.
   789  func (x Int32s) ConvertToFloat32() Float32s {
   790  	var res Float32s
   791  	for i := 0; i < 4; i++ {
   792  		res.set(i, float32(x.get(i)))
   793  	}
   794  	return res
   795  }
   796  
   797  // Equal returns a mask indicating where x and y are equal.
   798  func (x Int32s) Equal(y Int32s) Mask32s {
   799  	var res Mask32s
   800  	for i := 0; i < 4; i++ {
   801  		if x.get(i) == y.get(i) {
   802  			res.set(i, true)
   803  		}
   804  	}
   805  	return res
   806  }
   807  
   808  // Greater returns a mask indicating where x is greater than y.
   809  func (x Int32s) Greater(y Int32s) Mask32s {
   810  	var res Mask32s
   811  	for i := 0; i < 4; i++ {
   812  		if x.get(i) > y.get(i) {
   813  			res.set(i, true)
   814  		}
   815  	}
   816  	return res
   817  }
   818  
   819  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
   820  func (x Int32s) GreaterEqual(y Int32s) Mask32s {
   821  	var res Mask32s
   822  	for i := 0; i < 4; i++ {
   823  		if x.get(i) >= y.get(i) {
   824  			res.set(i, true)
   825  		}
   826  	}
   827  	return res
   828  }
   829  
   830  // Less returns a mask indicating where x is less than y.
   831  func (x Int32s) Less(y Int32s) Mask32s {
   832  	var res Mask32s
   833  	for i := 0; i < 4; i++ {
   834  		if x.get(i) < y.get(i) {
   835  			res.set(i, true)
   836  		}
   837  	}
   838  	return res
   839  }
   840  
   841  // LessEqual returns a mask indicating where x is less than or equal to y.
   842  func (x Int32s) LessEqual(y Int32s) Mask32s {
   843  	var res Mask32s
   844  	for i := 0; i < 4; i++ {
   845  		if x.get(i) <= y.get(i) {
   846  			res.set(i, true)
   847  		}
   848  	}
   849  	return res
   850  }
   851  
   852  // NotEqual returns a mask indicating where x and y are not equal.
   853  func (x Int32s) NotEqual(y Int32s) Mask32s {
   854  	var res Mask32s
   855  	for i := 0; i < 4; i++ {
   856  		if x.get(i) != y.get(i) {
   857  			res.set(i, true)
   858  		}
   859  	}
   860  	return res
   861  }
   862  
   863  // Len returns the number of elements in the vector.
   864  func (x Int32s) Len() int {
   865  	return 4
   866  }
   867  
   868  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
   869  func (x Int32s) Masked(mask Mask32s) Int32s {
   870  	return Int32s{a: x.a & mask.a, b: x.b & mask.b}
   871  }
   872  
   873  // Max returns the element-wise maximum of x and y.
   874  func (x Int32s) Max(y Int32s) Int32s {
   875  	var res Int32s
   876  	for i := 0; i < 4; i++ {
   877  		vx := x.get(i)
   878  		vy := y.get(i)
   879  		if vx > vy {
   880  			res.set(i, vx)
   881  		} else {
   882  			res.set(i, vy)
   883  		}
   884  	}
   885  	return res
   886  }
   887  
   888  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
   889  func (x Int32s) IfElse(mask Mask32s, y Int32s) Int32s {
   890  	return Int32s{
   891  		a: (x.a & mask.a) | (y.a &^ mask.a),
   892  		b: (x.b & mask.b) | (y.b &^ mask.b),
   893  	}
   894  }
   895  
   896  // Min returns the element-wise minimum of x and y.
   897  func (x Int32s) Min(y Int32s) Int32s {
   898  	var res Int32s
   899  	for i := 0; i < 4; i++ {
   900  		vx := x.get(i)
   901  		vy := y.get(i)
   902  		if vx < vy {
   903  			res.set(i, vx)
   904  		} else {
   905  			res.set(i, vy)
   906  		}
   907  	}
   908  	return res
   909  }
   910  
   911  // Mul returns the element-wise product of x and y.
   912  func (x Int32s) Mul(y Int32s) Int32s {
   913  	var res Int32s
   914  	for i := 0; i < 4; i++ {
   915  		res.set(i, x.get(i)*y.get(i))
   916  	}
   917  	return res
   918  }
   919  
   920  // Neg returns the element-wise negation of x.
   921  func (x Int32s) Neg() Int32s {
   922  	var res Int32s
   923  	for i := 0; i < 4; i++ {
   924  		res.set(i, -x.get(i))
   925  	}
   926  	return res
   927  }
   928  
   929  // Not returns the bitwise NOT of x.
   930  func (x Int32s) Not() Int32s {
   931  	return Int32s{a: ^x.a, b: ^x.b}
   932  }
   933  
   934  // Or returns the bitwise OR of x and y.
   935  func (x Int32s) Or(y Int32s) Int32s {
   936  	return Int32s{a: x.a | y.a, b: x.b | y.b}
   937  }
   938  
   939  // ShiftAllLeft shifts all elements left by y bits.
   940  func (x Int32s) ShiftAllLeft(y uint8) Int32s {
   941  	var res Int32s
   942  	for i := 0; i < 4; i++ {
   943  		res.set(i, x.get(i)<<y)
   944  	}
   945  	return res
   946  }
   947  
   948  // ShiftAllRight shifts all elements right by y bits.
   949  func (x Int32s) ShiftAllRight(y uint8) Int32s {
   950  	var res Int32s
   951  	for i := 0; i < 4; i++ {
   952  		res.set(i, x.get(i)>>y)
   953  	}
   954  	return res
   955  }
   956  
   957  // RotateAllLeft rotates all elements left by dist bits.
   958  func (x Int32s) RotateAllLeft(dist uint64) Int32s {
   959  	var res Int32s
   960  	d := dist & 31
   961  	for i := 0; i < 4; i++ {
   962  		u := uint32(x.get(i))
   963  		r := (u << d) | (u >> ((32 - d) & 31))
   964  		res.set(i, int32(r))
   965  	}
   966  	return res
   967  }
   968  
   969  // RotateAllRight rotates all elements right by dist bits.
   970  func (x Int32s) RotateAllRight(dist uint64) Int32s {
   971  	var res Int32s
   972  	d := dist & 31
   973  	for i := 0; i < 4; i++ {
   974  		u := uint32(x.get(i))
   975  		r := (u >> d) | (u << ((32 - d) & 31))
   976  		res.set(i, int32(r))
   977  	}
   978  	return res
   979  }
   980  
   981  // Store stores the vector elements into the slice s.
   982  func (x Int32s) Store(s []int32) {
   983  	for i := 0; i < 4 && i < len(s); i++ {
   984  		s[i] = x.get(i)
   985  	}
   986  }
   987  
   988  // StorePart stores a partial vector into the slice s.
   989  func (x Int32s) StorePart(s []int32) int {
   990  	x.Store(s)
   991  	return min(len(s), x.Len())
   992  }
   993  
   994  // String returns a string representation of the vector.
   995  func (x Int32s) String() string {
   996  	var parts [4]int32
   997  	for i := 0; i < 4; i++ {
   998  		parts[i] = x.get(i)
   999  	}
  1000  	return fmt.Sprint(parts)
  1001  }
  1002  
  1003  // Sub returns the element-wise difference of x and y.
  1004  func (x Int32s) Sub(y Int32s) Int32s {
  1005  	var res Int32s
  1006  	for i := 0; i < 4; i++ {
  1007  		res.set(i, x.get(i)-y.get(i))
  1008  	}
  1009  	return res
  1010  }
  1011  
  1012  // ToMask returns a mask representation of the vector.
  1013  func (x Int32s) ToMask() Mask32s {
  1014  	var res Mask32s
  1015  	for i := 0; i < 4; i++ {
  1016  		if x.get(i) != 0 {
  1017  			res.set(i, true)
  1018  		}
  1019  	}
  1020  	return res
  1021  }
  1022  
  1023  // Xor returns the bitwise XOR of x and y.
  1024  func (x Int32s) Xor(y Int32s) Int32s {
  1025  	return Int32s{a: x.a ^ y.a, b: x.b ^ y.b}
  1026  }
  1027  
  1028  // ConvertToUint32 converts the vector elements to uint32.
  1029  func (x Int32s) ConvertToUint32() Uint32s {
  1030  	return Uint32s{a: x.a, b: x.b}
  1031  }
  1032  
  1033  // ToBits reinterprets the vector bits as a Uint32s vector.
  1034  func (x Int32s) ToBits() Uint32s {
  1035  	return Uint32s{a: x.a, b: x.b}
  1036  }
  1037  
  1038  // LoadInt64s loads a slice of int64 into an Int64s vector.
  1039  func LoadInt64s(s []int64) Int64s {
  1040  	var a, b uint64
  1041  	a = uint64(s[0])
  1042  	b = uint64(s[1])
  1043  	return Int64s{a: a, b: b}
  1044  }
  1045  
  1046  // LoadInt64sPart loads a partial slice of int64 into an Int64s vector.
  1047  func LoadInt64sPart(s []int64) (Int64s, int) {
  1048  	var a, b uint64
  1049  	if len(s) > 0 {
  1050  		a = uint64(s[0])
  1051  	}
  1052  	if len(s) > 1 {
  1053  		b = uint64(s[1])
  1054  	}
  1055  	return Int64s{a: a, b: b}, len(s)
  1056  }
  1057  
  1058  func (x Int64s) get(i int) int64 {
  1059  	if i == 0 {
  1060  		return int64(x.a)
  1061  	}
  1062  	return int64(x.b)
  1063  }
  1064  
  1065  func (x *Int64s) set(i int, v int64) {
  1066  	if i == 0 {
  1067  		x.a = uint64(v)
  1068  	} else {
  1069  		x.b = uint64(v)
  1070  	}
  1071  }
  1072  
  1073  // Add returns the element-wise sum of x and y.
  1074  func (x Int64s) Add(y Int64s) Int64s {
  1075  	return Int64s{a: x.a + y.a, b: x.b + y.b}
  1076  }
  1077  
  1078  // And returns the bitwise AND of x and y.
  1079  func (x Int64s) And(y Int64s) Int64s {
  1080  	return Int64s{a: x.a & y.a, b: x.b & y.b}
  1081  }
  1082  
  1083  // AndNot returns the bitwise AND NOT of x and y.
  1084  func (x Int64s) AndNot(y Int64s) Int64s {
  1085  	return Int64s{a: x.a &^ y.a, b: x.b &^ y.b}
  1086  }
  1087  
  1088  // Equal returns a mask indicating where x and y are equal.
  1089  func (x Int64s) Equal(y Int64s) Mask64s {
  1090  	var res Mask64s
  1091  	if x.a == y.a {
  1092  		res.a = ^uint64(0)
  1093  	}
  1094  	if x.b == y.b {
  1095  		res.b = ^uint64(0)
  1096  	}
  1097  	return res
  1098  }
  1099  
  1100  // Greater returns a mask indicating where x is greater than y.
  1101  func (x Int64s) Greater(y Int64s) Mask64s {
  1102  	var res Mask64s
  1103  	if int64(x.a) > int64(y.a) {
  1104  		res.a = ^uint64(0)
  1105  	}
  1106  	if int64(x.b) > int64(y.b) {
  1107  		res.b = ^uint64(0)
  1108  	}
  1109  	return res
  1110  }
  1111  
  1112  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
  1113  func (x Int64s) GreaterEqual(y Int64s) Mask64s {
  1114  	var res Mask64s
  1115  	if int64(x.a) >= int64(y.a) {
  1116  		res.a = ^uint64(0)
  1117  	}
  1118  	if int64(x.b) >= int64(y.b) {
  1119  		res.b = ^uint64(0)
  1120  	}
  1121  	return res
  1122  }
  1123  
  1124  // Less returns a mask indicating where x is less than y.
  1125  func (x Int64s) Less(y Int64s) Mask64s {
  1126  	var res Mask64s
  1127  	if int64(x.a) < int64(y.a) {
  1128  		res.a = ^uint64(0)
  1129  	}
  1130  	if int64(x.b) < int64(y.b) {
  1131  		res.b = ^uint64(0)
  1132  	}
  1133  	return res
  1134  }
  1135  
  1136  // LessEqual returns a mask indicating where x is less than or equal to y.
  1137  func (x Int64s) LessEqual(y Int64s) Mask64s {
  1138  	var res Mask64s
  1139  	if int64(x.a) <= int64(y.a) {
  1140  		res.a = ^uint64(0)
  1141  	}
  1142  	if int64(x.b) <= int64(y.b) {
  1143  		res.b = ^uint64(0)
  1144  	}
  1145  	return res
  1146  }
  1147  
  1148  // NotEqual returns a mask indicating where x and y are not equal.
  1149  func (x Int64s) NotEqual(y Int64s) Mask64s {
  1150  	var res Mask64s
  1151  	if x.a != y.a {
  1152  		res.a = ^uint64(0)
  1153  	}
  1154  	if x.b != y.b {
  1155  		res.b = ^uint64(0)
  1156  	}
  1157  	return res
  1158  }
  1159  
  1160  // Len returns the number of elements in the vector.
  1161  func (x Int64s) Len() int {
  1162  	return 2
  1163  }
  1164  
  1165  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  1166  func (x Int64s) Masked(mask Mask64s) Int64s {
  1167  	return Int64s{a: x.a & mask.a, b: x.b & mask.b}
  1168  }
  1169  
  1170  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  1171  func (x Int64s) IfElse(mask Mask64s, y Int64s) Int64s {
  1172  	return Int64s{
  1173  		a: (x.a & mask.a) | (y.a &^ mask.a),
  1174  		b: (x.b & mask.b) | (y.b &^ mask.b),
  1175  	}
  1176  }
  1177  
  1178  // Neg returns the element-wise negation of x.
  1179  func (x Int64s) Neg() Int64s {
  1180  	return Int64s{a: uint64(-int64(x.a)), b: uint64(-int64(x.b))}
  1181  }
  1182  
  1183  // Not returns the bitwise NOT of x.
  1184  func (x Int64s) Not() Int64s {
  1185  	return Int64s{a: ^x.a, b: ^x.b}
  1186  }
  1187  
  1188  // Or returns the bitwise OR of x and y.
  1189  func (x Int64s) Or(y Int64s) Int64s {
  1190  	return Int64s{a: x.a | y.a, b: x.b | y.b}
  1191  }
  1192  
  1193  // ShiftAllLeft shifts all elements left by y bits.
  1194  func (x Int64s) ShiftAllLeft(y uint8) Int64s {
  1195  	return Int64s{a: x.a << y, b: x.b << y}
  1196  }
  1197  
  1198  // RotateAllLeft rotates all elements left by dist bits.
  1199  func (x Int64s) RotateAllLeft(dist uint64) Int64s {
  1200  	d := dist & 63
  1201  	return Int64s{
  1202  		a: (x.a << d) | (x.a >> ((64 - d) & 63)),
  1203  		b: (x.b << d) | (x.b >> ((64 - d) & 63)),
  1204  	}
  1205  }
  1206  
  1207  // RotateAllRight rotates all elements right by dist bits.
  1208  func (x Int64s) RotateAllRight(dist uint64) Int64s {
  1209  	d := dist & 63
  1210  	return Int64s{
  1211  		a: (x.a >> d) | (x.a << ((64 - d) & 63)),
  1212  		b: (x.b >> d) | (x.b << ((64 - d) & 63)),
  1213  	}
  1214  }
  1215  
  1216  // Store stores the vector elements into the slice s.
  1217  func (x Int64s) Store(s []int64) {
  1218  	if len(s) > 0 {
  1219  		s[0] = int64(x.a)
  1220  	}
  1221  	if len(s) > 1 {
  1222  		s[1] = int64(x.b)
  1223  	}
  1224  }
  1225  
  1226  // StorePart stores a partial vector into the slice s.
  1227  func (x Int64s) StorePart(s []int64) int {
  1228  	x.Store(s)
  1229  	return min(len(s), x.Len())
  1230  }
  1231  
  1232  // String returns a string representation of the vector.
  1233  func (x Int64s) String() string {
  1234  	return fmt.Sprint([2]int64{int64(x.a), int64(x.b)})
  1235  }
  1236  
  1237  // Sub returns the element-wise difference of x and y.
  1238  func (x Int64s) Sub(y Int64s) Int64s {
  1239  	return Int64s{a: x.a - y.a, b: x.b - y.b}
  1240  }
  1241  
  1242  // ToMask returns a mask representation of the vector.
  1243  func (x Int64s) ToMask() Mask64s {
  1244  	var res Mask64s
  1245  	if x.a != 0 {
  1246  		res.a = ^uint64(0)
  1247  	}
  1248  	if x.b != 0 {
  1249  		res.b = ^uint64(0)
  1250  	}
  1251  	return res
  1252  }
  1253  
  1254  // Xor returns the bitwise XOR of x and y.
  1255  func (x Int64s) Xor(y Int64s) Int64s {
  1256  	return Int64s{a: x.a ^ y.a, b: x.b ^ y.b}
  1257  }
  1258  
  1259  // ConvertToUint64 converts the vector elements to uint64.
  1260  func (x Int64s) ConvertToUint64() Uint64s {
  1261  	return Uint64s{a: x.a, b: x.b}
  1262  }
  1263  
  1264  // ToBits reinterprets the vector bits as a Uint64s vector.
  1265  func (x Int64s) ToBits() Uint64s {
  1266  	return Uint64s{a: x.a, b: x.b}
  1267  }
  1268  
  1269  // LoadUint8s loads a slice of uint8 into an Uint8s vector.
  1270  func LoadUint8s(s []uint8) Uint8s {
  1271  	var a, b uint64
  1272  	for i := 0; i < 16; i++ {
  1273  		val := uint64(s[i])
  1274  		if i < 8 {
  1275  			a |= val << (8 * i)
  1276  		} else {
  1277  			b |= val << (8 * (i - 8))
  1278  		}
  1279  	}
  1280  	return Uint8s{a: a, b: b}
  1281  }
  1282  
  1283  // LoadUint8sPart loads a partial slice of uint8 into an Uint8s vector.
  1284  func LoadUint8sPart(s []uint8) (Uint8s, int) {
  1285  	var a, b uint64
  1286  	n := len(s)
  1287  	if n > 16 {
  1288  		n = 16
  1289  	}
  1290  	for i := 0; i < n; i++ {
  1291  		val := uint64(s[i])
  1292  		if i < 8 {
  1293  			a |= val << (8 * i)
  1294  		} else {
  1295  			b |= val << (8 * (i - 8))
  1296  		}
  1297  	}
  1298  	return Uint8s{a: a, b: b}, n
  1299  }
  1300  
  1301  func (x Uint8s) get(i int) uint8 {
  1302  	if i < 8 {
  1303  		return uint8(x.a >> (8 * i))
  1304  	}
  1305  	return uint8(x.b >> (8 * (i - 8)))
  1306  }
  1307  
  1308  func (x *Uint8s) set(i int, v uint8) {
  1309  	val := uint64(v)
  1310  	if i < 8 {
  1311  		mask := uint64(0xff) << (8 * i)
  1312  		x.a = (x.a &^ mask) | (val << (8 * i))
  1313  	} else {
  1314  		mask := uint64(0xff) << (8 * (i - 8))
  1315  		x.b = (x.b &^ mask) | (val << (8 * (i - 8)))
  1316  	}
  1317  }
  1318  
  1319  // Add returns the element-wise sum of x and y.
  1320  func (x Uint8s) Add(y Uint8s) Uint8s {
  1321  	var res Uint8s
  1322  	for i := 0; i < 16; i++ {
  1323  		res.set(i, x.get(i)+y.get(i))
  1324  	}
  1325  	return res
  1326  }
  1327  
  1328  // AddSaturated returns the element-wise saturated sum of x and y.
  1329  func (x Uint8s) AddSaturated(y Uint8s) Uint8s {
  1330  	var res Uint8s
  1331  	for i := 0; i < 16; i++ {
  1332  		sum := int(x.get(i)) + int(y.get(i))
  1333  		if sum > math.MaxUint8 {
  1334  			res.set(i, math.MaxUint8)
  1335  		} else {
  1336  			res.set(i, uint8(sum))
  1337  		}
  1338  	}
  1339  	return res
  1340  }
  1341  
  1342  // And returns the bitwise AND of x and y.
  1343  func (x Uint8s) And(y Uint8s) Uint8s {
  1344  	return Uint8s{a: x.a & y.a, b: x.b & y.b}
  1345  }
  1346  
  1347  // AndNot returns the bitwise AND NOT of x and y.
  1348  func (x Uint8s) AndNot(y Uint8s) Uint8s {
  1349  	return Uint8s{a: x.a &^ y.a, b: x.b &^ y.b}
  1350  }
  1351  
  1352  // Average returns the element-wise average of x and y.
  1353  func (x Uint8s) Average(y Uint8s) Uint8s {
  1354  	var res Uint8s
  1355  	for i := 0; i < 16; i++ {
  1356  		res.set(i, uint8((int(x.get(i))+int(y.get(i))+1)>>1))
  1357  	}
  1358  	return res
  1359  }
  1360  
  1361  // Equal returns a mask indicating where x and y are equal.
  1362  func (x Uint8s) Equal(y Uint8s) Mask8s {
  1363  	var res Mask8s
  1364  	for i := 0; i < 16; i++ {
  1365  		if x.get(i) == y.get(i) {
  1366  			res.set(i, true)
  1367  		}
  1368  	}
  1369  	return res
  1370  }
  1371  
  1372  // NotEqual returns a mask indicating where x and y are not equal.
  1373  func (x Uint8s) NotEqual(y Uint8s) Mask8s {
  1374  	var res Mask8s
  1375  	for i := 0; i < 16; i++ {
  1376  		if x.get(i) != y.get(i) {
  1377  			res.set(i, true)
  1378  		}
  1379  	}
  1380  	return res
  1381  }
  1382  
  1383  // Len returns the number of elements in the vector.
  1384  func (x Uint8s) Len() int {
  1385  	return 16
  1386  }
  1387  
  1388  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  1389  func (x Uint8s) Masked(mask Mask8s) Uint8s {
  1390  	return Uint8s{a: x.a & mask.a, b: x.b & mask.b}
  1391  }
  1392  
  1393  // Max returns the element-wise maximum of x and y.
  1394  func (x Uint8s) Max(y Uint8s) Uint8s {
  1395  	var res Uint8s
  1396  	for i := 0; i < 16; i++ {
  1397  		vx := x.get(i)
  1398  		vy := y.get(i)
  1399  		if vx > vy {
  1400  			res.set(i, vx)
  1401  		} else {
  1402  			res.set(i, vy)
  1403  		}
  1404  	}
  1405  	return res
  1406  }
  1407  
  1408  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  1409  func (x Uint8s) IfElse(mask Mask8s, y Uint8s) Uint8s {
  1410  	return Uint8s{
  1411  		a: (x.a & mask.a) | (y.a &^ mask.a),
  1412  		b: (x.b & mask.b) | (y.b &^ mask.b),
  1413  	}
  1414  }
  1415  
  1416  // Min returns the element-wise minimum of x and y.
  1417  func (x Uint8s) Min(y Uint8s) Uint8s {
  1418  	var res Uint8s
  1419  	for i := 0; i < 16; i++ {
  1420  		vx := x.get(i)
  1421  		vy := y.get(i)
  1422  		if vx < vy {
  1423  			res.set(i, vx)
  1424  		} else {
  1425  			res.set(i, vy)
  1426  		}
  1427  	}
  1428  	return res
  1429  }
  1430  
  1431  // Mul returns the element-wise product of x and y.
  1432  func (x Uint8s) Mul(y Uint8s) Uint8s {
  1433  	var res Uint8s
  1434  	for i := 0; i < 16; i++ {
  1435  		res.set(i, x.get(i)*y.get(i))
  1436  	}
  1437  	return res
  1438  }
  1439  
  1440  // Not returns the bitwise NOT of x.
  1441  func (x Uint8s) Not() Uint8s {
  1442  	return Uint8s{a: ^x.a, b: ^x.b}
  1443  }
  1444  
  1445  // Or returns the bitwise OR of x and y.
  1446  func (x Uint8s) Or(y Uint8s) Uint8s {
  1447  	return Uint8s{a: x.a | y.a, b: x.b | y.b}
  1448  }
  1449  
  1450  // Store stores the vector elements into the slice s.
  1451  func (x Uint8s) Store(s []uint8) {
  1452  	for i := 0; i < 16 && i < len(s); i++ {
  1453  		s[i] = x.get(i)
  1454  	}
  1455  }
  1456  
  1457  // StorePart stores a partial vector into the slice s.
  1458  func (x Uint8s) StorePart(s []uint8) int {
  1459  	x.Store(s)
  1460  	return min(len(s), x.Len())
  1461  }
  1462  
  1463  // String returns a string representation of the vector.
  1464  func (x Uint8s) String() string {
  1465  	var parts [16]uint8
  1466  	for i := 0; i < 16; i++ {
  1467  		parts[i] = x.get(i)
  1468  	}
  1469  	return fmt.Sprint(parts)
  1470  }
  1471  
  1472  // Sub returns the element-wise difference of x and y.
  1473  func (x Uint8s) Sub(y Uint8s) Uint8s {
  1474  	var res Uint8s
  1475  	for i := 0; i < 16; i++ {
  1476  		res.set(i, x.get(i)-y.get(i))
  1477  	}
  1478  	return res
  1479  }
  1480  
  1481  // SubSaturated returns the element-wise saturated difference of x and y.
  1482  func (x Uint8s) SubSaturated(y Uint8s) Uint8s {
  1483  	var res Uint8s
  1484  	for i := 0; i < 16; i++ {
  1485  		vx := x.get(i)
  1486  		vy := y.get(i)
  1487  		if vx < vy {
  1488  			res.set(i, 0)
  1489  		} else {
  1490  			res.set(i, vx-vy)
  1491  		}
  1492  	}
  1493  	return res
  1494  }
  1495  
  1496  // Xor returns the bitwise XOR of x and y.
  1497  func (x Uint8s) Xor(y Uint8s) Uint8s {
  1498  	return Uint8s{a: x.a ^ y.a, b: x.b ^ y.b}
  1499  }
  1500  
  1501  // BitsToInt8 reinterprets the vector bits as an Int8s vector.
  1502  func (x Uint8s) BitsToInt8() Int8s {
  1503  	return Int8s{a: x.a, b: x.b}
  1504  }
  1505  
  1506  // ConvertToInt8 converts the vector elements to int8.
  1507  func (x Uint8s) ConvertToInt8() Int8s {
  1508  	return Int8s{a: x.a, b: x.b}
  1509  }
  1510  
  1511  // ReshapeToUint16s reinterprets the vector bits as a Uint16s vector.
  1512  func (x Uint8s) ReshapeToUint16s() Uint16s {
  1513  	return Uint16s{a: x.a, b: x.b}
  1514  }
  1515  
  1516  // ReshapeToUint32s reinterprets the vector bits as a Uint32s vector.
  1517  func (x Uint8s) ReshapeToUint32s() Uint32s {
  1518  	return Uint32s{a: x.a, b: x.b}
  1519  }
  1520  
  1521  // ReshapeToUint64s reinterprets the vector bits as a Uint64s vector.
  1522  func (x Uint8s) ReshapeToUint64s() Uint64s {
  1523  	return Uint64s{a: x.a, b: x.b}
  1524  }
  1525  
  1526  // LoadUint16s loads a slice of uint16 into an Uint16s vector.
  1527  func LoadUint16s(s []uint16) Uint16s {
  1528  	var a, b uint64
  1529  	for i := 0; i < 8; i++ {
  1530  		val := uint64(s[i])
  1531  		if i < 4 {
  1532  			a |= val << (16 * i)
  1533  		} else {
  1534  			b |= val << (16 * (i - 4))
  1535  		}
  1536  	}
  1537  	return Uint16s{a: a, b: b}
  1538  }
  1539  
  1540  // LoadUint16sPart loads a partial slice of uint16 into an Uint16s vector.
  1541  func LoadUint16sPart(s []uint16) (Uint16s, int) {
  1542  	var a, b uint64
  1543  	n := len(s)
  1544  	if n > 8 {
  1545  		n = 8
  1546  	}
  1547  	for i := 0; i < n; i++ {
  1548  		val := uint64(s[i])
  1549  		if i < 4 {
  1550  			a |= val << (16 * i)
  1551  		} else {
  1552  			b |= val << (16 * (i - 4))
  1553  		}
  1554  	}
  1555  	return Uint16s{a: a, b: b}, n
  1556  }
  1557  
  1558  func (x Uint16s) get(i int) uint16 {
  1559  	if i < 4 {
  1560  		return uint16(x.a >> (16 * i))
  1561  	}
  1562  	return uint16(x.b >> (16 * (i - 4)))
  1563  }
  1564  
  1565  func (x *Uint16s) set(i int, v uint16) {
  1566  	val := uint64(v)
  1567  	if i < 4 {
  1568  		mask := uint64(0xffff) << (16 * i)
  1569  		x.a = (x.a &^ mask) | (val << (16 * i))
  1570  	} else {
  1571  		mask := uint64(0xffff) << (16 * (i - 4))
  1572  		x.b = (x.b &^ mask) | (val << (16 * (i - 4)))
  1573  	}
  1574  }
  1575  
  1576  // Add returns the element-wise sum of x and y.
  1577  func (x Uint16s) Add(y Uint16s) Uint16s {
  1578  	var res Uint16s
  1579  	for i := 0; i < 8; i++ {
  1580  		res.set(i, x.get(i)+y.get(i))
  1581  	}
  1582  	return res
  1583  }
  1584  
  1585  // AddSaturated returns the element-wise saturated sum of x and y.
  1586  func (x Uint16s) AddSaturated(y Uint16s) Uint16s {
  1587  	var res Uint16s
  1588  	for i := 0; i < 8; i++ {
  1589  		sum := int(x.get(i)) + int(y.get(i))
  1590  		if sum > math.MaxUint16 {
  1591  			res.set(i, math.MaxUint16)
  1592  		} else {
  1593  			res.set(i, uint16(sum))
  1594  		}
  1595  	}
  1596  	return res
  1597  }
  1598  
  1599  // And returns the bitwise AND of x and y.
  1600  func (x Uint16s) And(y Uint16s) Uint16s {
  1601  	return Uint16s{a: x.a & y.a, b: x.b & y.b}
  1602  }
  1603  
  1604  // AndNot returns the bitwise AND NOT of x and y.
  1605  func (x Uint16s) AndNot(y Uint16s) Uint16s {
  1606  	return Uint16s{a: x.a &^ y.a, b: x.b &^ y.b}
  1607  }
  1608  
  1609  // Average returns the element-wise average of x and y.
  1610  func (x Uint16s) Average(y Uint16s) Uint16s {
  1611  	var res Uint16s
  1612  	for i := 0; i < 8; i++ {
  1613  		res.set(i, uint16((int(x.get(i))+int(y.get(i))+1)>>1))
  1614  	}
  1615  	return res
  1616  }
  1617  
  1618  // Equal returns a mask indicating where x and y are equal.
  1619  func (x Uint16s) Equal(y Uint16s) Mask16s {
  1620  	var res Mask16s
  1621  	for i := 0; i < 8; i++ {
  1622  		if x.get(i) == y.get(i) {
  1623  			res.set(i, true)
  1624  		}
  1625  	}
  1626  	return res
  1627  }
  1628  
  1629  // Greater returns a mask indicating where x is greater than y.
  1630  func (x Uint16s) Greater(y Uint16s) Mask16s {
  1631  	var res Mask16s
  1632  	for i := 0; i < 8; i++ {
  1633  		if x.get(i) > y.get(i) {
  1634  			res.set(i, true)
  1635  		}
  1636  	}
  1637  	return res
  1638  }
  1639  
  1640  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
  1641  func (x Uint16s) GreaterEqual(y Uint16s) Mask16s {
  1642  	var res Mask16s
  1643  	for i := 0; i < 8; i++ {
  1644  		if x.get(i) >= y.get(i) {
  1645  			res.set(i, true)
  1646  		}
  1647  	}
  1648  	return res
  1649  }
  1650  
  1651  // Less returns a mask indicating where x is less than y.
  1652  func (x Uint16s) Less(y Uint16s) Mask16s {
  1653  	var res Mask16s
  1654  	for i := 0; i < 8; i++ {
  1655  		if x.get(i) < y.get(i) {
  1656  			res.set(i, true)
  1657  		}
  1658  	}
  1659  	return res
  1660  }
  1661  
  1662  // LessEqual returns a mask indicating where x is less than or equal to y.
  1663  func (x Uint16s) LessEqual(y Uint16s) Mask16s {
  1664  	var res Mask16s
  1665  	for i := 0; i < 8; i++ {
  1666  		if x.get(i) <= y.get(i) {
  1667  			res.set(i, true)
  1668  		}
  1669  	}
  1670  	return res
  1671  }
  1672  
  1673  // NotEqual returns a mask indicating where x and y are not equal.
  1674  func (x Uint16s) NotEqual(y Uint16s) Mask16s {
  1675  	var res Mask16s
  1676  	for i := 0; i < 8; i++ {
  1677  		if x.get(i) != y.get(i) {
  1678  			res.set(i, true)
  1679  		}
  1680  	}
  1681  	return res
  1682  }
  1683  
  1684  // Len returns the number of elements in the vector.
  1685  func (x Uint16s) Len() int {
  1686  	return 8
  1687  }
  1688  
  1689  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  1690  func (x Uint16s) Masked(mask Mask16s) Uint16s {
  1691  	return Uint16s{a: x.a & mask.a, b: x.b & mask.b}
  1692  }
  1693  
  1694  // Max returns the element-wise maximum of x and y.
  1695  func (x Uint16s) Max(y Uint16s) Uint16s {
  1696  	var res Uint16s
  1697  	for i := 0; i < 8; i++ {
  1698  		vx := x.get(i)
  1699  		vy := y.get(i)
  1700  		if vx > vy {
  1701  			res.set(i, vx)
  1702  		} else {
  1703  			res.set(i, vy)
  1704  		}
  1705  	}
  1706  	return res
  1707  }
  1708  
  1709  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  1710  func (x Uint16s) IfElse(mask Mask16s, y Uint16s) Uint16s {
  1711  	return Uint16s{
  1712  		a: (x.a & mask.a) | (y.a &^ mask.a),
  1713  		b: (x.b & mask.b) | (y.b &^ mask.b),
  1714  	}
  1715  }
  1716  
  1717  // Min returns the element-wise minimum of x and y.
  1718  func (x Uint16s) Min(y Uint16s) Uint16s {
  1719  	var res Uint16s
  1720  	for i := 0; i < 8; i++ {
  1721  		vx := x.get(i)
  1722  		vy := y.get(i)
  1723  		if vx < vy {
  1724  			res.set(i, vx)
  1725  		} else {
  1726  			res.set(i, vy)
  1727  		}
  1728  	}
  1729  	return res
  1730  }
  1731  
  1732  // Mul returns the element-wise product of x and y.
  1733  func (x Uint16s) Mul(y Uint16s) Uint16s {
  1734  	var res Uint16s
  1735  	for i := 0; i < 8; i++ {
  1736  		res.set(i, x.get(i)*y.get(i))
  1737  	}
  1738  	return res
  1739  }
  1740  
  1741  // Not returns the bitwise NOT of x.
  1742  func (x Uint16s) Not() Uint16s {
  1743  	return Uint16s{a: ^x.a, b: ^x.b}
  1744  }
  1745  
  1746  // Or returns the bitwise OR of x and y.
  1747  func (x Uint16s) Or(y Uint16s) Uint16s {
  1748  	return Uint16s{a: x.a | y.a, b: x.b | y.b}
  1749  }
  1750  
  1751  // ShiftAllLeft shifts all elements left by y bits.
  1752  func (x Uint16s) ShiftAllLeft(y uint8) Uint16s {
  1753  	var res Uint16s
  1754  	for i := 0; i < 8; i++ {
  1755  		res.set(i, x.get(i)<<y)
  1756  	}
  1757  	return res
  1758  }
  1759  
  1760  // ShiftAllRight shifts all elements right by y bits.
  1761  func (x Uint16s) ShiftAllRight(y uint8) Uint16s {
  1762  	var res Uint16s
  1763  	for i := 0; i < 8; i++ {
  1764  		res.set(i, x.get(i)>>y)
  1765  	}
  1766  	return res
  1767  }
  1768  
  1769  // RotateAllLeft rotates all elements left by dist bits.
  1770  func (x Uint16s) RotateAllLeft(dist uint64) Uint16s {
  1771  	var res Uint16s
  1772  	d := dist & 15
  1773  	for i := 0; i < 8; i++ {
  1774  		u := x.get(i)
  1775  		r := (u << d) | (u >> ((16 - d) & 15))
  1776  		res.set(i, r)
  1777  	}
  1778  	return res
  1779  }
  1780  
  1781  // RotateAllRight rotates all elements right by dist bits.
  1782  func (x Uint16s) RotateAllRight(dist uint64) Uint16s {
  1783  	var res Uint16s
  1784  	d := dist & 15
  1785  	for i := 0; i < 8; i++ {
  1786  		u := x.get(i)
  1787  		r := (u >> d) | (u << ((16 - d) & 15))
  1788  		res.set(i, r)
  1789  	}
  1790  	return res
  1791  }
  1792  
  1793  // Store stores the vector elements into the slice s.
  1794  func (x Uint16s) Store(s []uint16) {
  1795  	for i := 0; i < 8 && i < len(s); i++ {
  1796  		s[i] = x.get(i)
  1797  	}
  1798  }
  1799  
  1800  // StorePart stores a partial vector into the slice s.
  1801  func (x Uint16s) StorePart(s []uint16) int {
  1802  	x.Store(s)
  1803  	return min(len(s), x.Len())
  1804  }
  1805  
  1806  // String returns a string representation of the vector.
  1807  func (x Uint16s) String() string {
  1808  	var parts [8]uint16
  1809  	for i := 0; i < 8; i++ {
  1810  		parts[i] = x.get(i)
  1811  	}
  1812  	return fmt.Sprint(parts)
  1813  }
  1814  
  1815  // Sub returns the element-wise difference of x and y.
  1816  func (x Uint16s) Sub(y Uint16s) Uint16s {
  1817  	var res Uint16s
  1818  	for i := 0; i < 8; i++ {
  1819  		res.set(i, x.get(i)-y.get(i))
  1820  	}
  1821  	return res
  1822  }
  1823  
  1824  // SubSaturated returns the element-wise saturated difference of x and y.
  1825  func (x Uint16s) SubSaturated(y Uint16s) Uint16s {
  1826  	var res Uint16s
  1827  	for i := 0; i < 8; i++ {
  1828  		vx := x.get(i)
  1829  		vy := y.get(i)
  1830  		if vx < vy {
  1831  			res.set(i, 0)
  1832  		} else {
  1833  			res.set(i, vx-vy)
  1834  		}
  1835  	}
  1836  	return res
  1837  }
  1838  
  1839  // Xor returns the bitwise XOR of x and y.
  1840  func (x Uint16s) Xor(y Uint16s) Uint16s {
  1841  	return Uint16s{a: x.a ^ y.a, b: x.b ^ y.b}
  1842  }
  1843  
  1844  // BitsToInt16 reinterprets the vector bits as an Int16s vector.
  1845  func (x Uint16s) BitsToInt16() Int16s {
  1846  	return Int16s{a: x.a, b: x.b}
  1847  }
  1848  
  1849  // ConvertToInt16 converts the vector elements to int16.
  1850  func (x Uint16s) ConvertToInt16() Int16s {
  1851  	return Int16s{a: x.a, b: x.b}
  1852  }
  1853  
  1854  // ReshapeToUint32s reinterprets the vector bits as a Uint32s vector.
  1855  func (x Uint16s) ReshapeToUint32s() Uint32s {
  1856  	return Uint32s{a: x.a, b: x.b}
  1857  }
  1858  
  1859  // ReshapeToUint64s reinterprets the vector bits as a Uint64s vector.
  1860  func (x Uint16s) ReshapeToUint64s() Uint64s {
  1861  	return Uint64s{a: x.a, b: x.b}
  1862  }
  1863  
  1864  // ReshapeToUint8s reinterprets the vector bits as a Uint8s vector.
  1865  func (x Uint16s) ReshapeToUint8s() Uint8s {
  1866  	return Uint8s{a: x.a, b: x.b}
  1867  }
  1868  
  1869  // LoadUint32s loads a slice of uint32 into an Uint32s vector.
  1870  func LoadUint32s(s []uint32) Uint32s {
  1871  	var a, b uint64
  1872  	for i := 0; i < 4; i++ {
  1873  		val := uint64(s[i])
  1874  		if i < 2 {
  1875  			a |= val << (32 * i)
  1876  		} else {
  1877  			b |= val << (32 * (i - 2))
  1878  		}
  1879  	}
  1880  	return Uint32s{a: a, b: b}
  1881  }
  1882  
  1883  // LoadUint32sPart loads a partial slice of uint32 into an Uint32s vector.
  1884  func LoadUint32sPart(s []uint32) (Uint32s, int) {
  1885  	var a, b uint64
  1886  	n := len(s)
  1887  	if n > 4 {
  1888  		n = 4
  1889  	}
  1890  	for i := 0; i < n; i++ {
  1891  		val := uint64(s[i])
  1892  		if i < 2 {
  1893  			a |= val << (32 * i)
  1894  		} else {
  1895  			b |= val << (32 * (i - 2))
  1896  		}
  1897  	}
  1898  	return Uint32s{a: a, b: b}, n
  1899  }
  1900  
  1901  func (x Uint32s) get(i int) uint32 {
  1902  	if i < 2 {
  1903  		return uint32(x.a >> (32 * i))
  1904  	}
  1905  	return uint32(x.b >> (32 * (i - 2)))
  1906  }
  1907  
  1908  func (x *Uint32s) set(i int, v uint32) {
  1909  	val := uint64(v)
  1910  	if i < 2 {
  1911  		mask := uint64(0xffffffff) << (32 * i)
  1912  		x.a = (x.a &^ mask) | (val << (32 * i))
  1913  	} else {
  1914  		mask := uint64(0xffffffff) << (32 * (i - 2))
  1915  		x.b = (x.b &^ mask) | (val << (32 * (i - 2)))
  1916  	}
  1917  }
  1918  
  1919  // Add returns the element-wise sum of x and y.
  1920  func (x Uint32s) Add(y Uint32s) Uint32s {
  1921  	var res Uint32s
  1922  	for i := 0; i < 4; i++ {
  1923  		res.set(i, x.get(i)+y.get(i))
  1924  	}
  1925  	return res
  1926  }
  1927  
  1928  // And returns the bitwise AND of x and y.
  1929  func (x Uint32s) And(y Uint32s) Uint32s {
  1930  	return Uint32s{a: x.a & y.a, b: x.b & y.b}
  1931  }
  1932  
  1933  // AndNot returns the bitwise AND NOT of x and y.
  1934  func (x Uint32s) AndNot(y Uint32s) Uint32s {
  1935  	return Uint32s{a: x.a &^ y.a, b: x.b &^ y.b}
  1936  }
  1937  
  1938  // Equal returns a mask indicating where x and y are equal.
  1939  func (x Uint32s) Equal(y Uint32s) Mask32s {
  1940  	var res Mask32s
  1941  	for i := 0; i < 4; i++ {
  1942  		if x.get(i) == y.get(i) {
  1943  			res.set(i, true)
  1944  		}
  1945  	}
  1946  	return res
  1947  }
  1948  
  1949  // Greater returns a mask indicating where x is greater than y.
  1950  func (x Uint32s) Greater(y Uint32s) Mask32s {
  1951  	var res Mask32s
  1952  	for i := 0; i < 4; i++ {
  1953  		if x.get(i) > y.get(i) {
  1954  			res.set(i, true)
  1955  		}
  1956  	}
  1957  	return res
  1958  }
  1959  
  1960  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
  1961  func (x Uint32s) GreaterEqual(y Uint32s) Mask32s {
  1962  	var res Mask32s
  1963  	for i := 0; i < 4; i++ {
  1964  		if x.get(i) >= y.get(i) {
  1965  			res.set(i, true)
  1966  		}
  1967  	}
  1968  	return res
  1969  }
  1970  
  1971  // Less returns a mask indicating where x is less than y.
  1972  func (x Uint32s) Less(y Uint32s) Mask32s {
  1973  	var res Mask32s
  1974  	for i := 0; i < 4; i++ {
  1975  		if x.get(i) < y.get(i) {
  1976  			res.set(i, true)
  1977  		}
  1978  	}
  1979  	return res
  1980  }
  1981  
  1982  // LessEqual returns a mask indicating where x is less than or equal to y.
  1983  func (x Uint32s) LessEqual(y Uint32s) Mask32s {
  1984  	var res Mask32s
  1985  	for i := 0; i < 4; i++ {
  1986  		if x.get(i) <= y.get(i) {
  1987  			res.set(i, true)
  1988  		}
  1989  	}
  1990  	return res
  1991  }
  1992  
  1993  // NotEqual returns a mask indicating where x and y are not equal.
  1994  func (x Uint32s) NotEqual(y Uint32s) Mask32s {
  1995  	var res Mask32s
  1996  	for i := 0; i < 4; i++ {
  1997  		if x.get(i) != y.get(i) {
  1998  			res.set(i, true)
  1999  		}
  2000  	}
  2001  	return res
  2002  }
  2003  
  2004  // Len returns the number of elements in the vector.
  2005  func (x Uint32s) Len() int {
  2006  	return 4
  2007  }
  2008  
  2009  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  2010  func (x Uint32s) Masked(mask Mask32s) Uint32s {
  2011  	return Uint32s{a: x.a & mask.a, b: x.b & mask.b}
  2012  }
  2013  
  2014  // Max returns the element-wise maximum of x and y.
  2015  func (x Uint32s) Max(y Uint32s) Uint32s {
  2016  	var res Uint32s
  2017  	for i := 0; i < 4; i++ {
  2018  		vx := x.get(i)
  2019  		vy := y.get(i)
  2020  		if vx > vy {
  2021  			res.set(i, vx)
  2022  		} else {
  2023  			res.set(i, vy)
  2024  		}
  2025  	}
  2026  	return res
  2027  }
  2028  
  2029  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  2030  func (x Uint32s) IfElse(mask Mask32s, y Uint32s) Uint32s {
  2031  	return Uint32s{
  2032  		a: (x.a & mask.a) | (y.a &^ mask.a),
  2033  		b: (x.b & mask.b) | (y.b &^ mask.b),
  2034  	}
  2035  }
  2036  
  2037  // Min returns the element-wise minimum of x and y.
  2038  func (x Uint32s) Min(y Uint32s) Uint32s {
  2039  	var res Uint32s
  2040  	for i := 0; i < 4; i++ {
  2041  		vx := x.get(i)
  2042  		vy := y.get(i)
  2043  		if vx < vy {
  2044  			res.set(i, vx)
  2045  		} else {
  2046  			res.set(i, vy)
  2047  		}
  2048  	}
  2049  	return res
  2050  }
  2051  
  2052  // Mul returns the element-wise product of x and y.
  2053  func (x Uint32s) Mul(y Uint32s) Uint32s {
  2054  	var res Uint32s
  2055  	for i := 0; i < 4; i++ {
  2056  		res.set(i, x.get(i)*y.get(i))
  2057  	}
  2058  	return res
  2059  }
  2060  
  2061  // Not returns the bitwise NOT of x.
  2062  func (x Uint32s) Not() Uint32s {
  2063  	return Uint32s{a: ^x.a, b: ^x.b}
  2064  }
  2065  
  2066  // Or returns the bitwise OR of x and y.
  2067  func (x Uint32s) Or(y Uint32s) Uint32s {
  2068  	return Uint32s{a: x.a | y.a, b: x.b | y.b}
  2069  }
  2070  
  2071  // ShiftAllLeft shifts all elements left by y bits.
  2072  func (x Uint32s) ShiftAllLeft(y uint8) Uint32s {
  2073  	var res Uint32s
  2074  	for i := 0; i < 4; i++ {
  2075  		res.set(i, x.get(i)<<y)
  2076  	}
  2077  	return res
  2078  }
  2079  
  2080  // ShiftAllRight shifts all elements right by y bits.
  2081  func (x Uint32s) ShiftAllRight(y uint8) Uint32s {
  2082  	var res Uint32s
  2083  	for i := 0; i < 4; i++ {
  2084  		res.set(i, x.get(i)>>y)
  2085  	}
  2086  	return res
  2087  }
  2088  
  2089  // RotateAllLeft rotates all elements left by dist bits.
  2090  func (x Uint32s) RotateAllLeft(dist uint64) Uint32s {
  2091  	var res Uint32s
  2092  	d := dist & 31
  2093  	for i := 0; i < 4; i++ {
  2094  		u := x.get(i)
  2095  		r := (u << d) | (u >> ((32 - d) & 31))
  2096  		res.set(i, r)
  2097  	}
  2098  	return res
  2099  }
  2100  
  2101  // RotateAllRight rotates all elements right by dist bits.
  2102  func (x Uint32s) RotateAllRight(dist uint64) Uint32s {
  2103  	var res Uint32s
  2104  	d := dist & 31
  2105  	for i := 0; i < 4; i++ {
  2106  		u := x.get(i)
  2107  		r := (u >> d) | (u << ((32 - d) & 31))
  2108  		res.set(i, r)
  2109  	}
  2110  	return res
  2111  }
  2112  
  2113  // Store stores the vector elements into the slice s.
  2114  func (x Uint32s) Store(s []uint32) {
  2115  	for i := 0; i < 4 && i < len(s); i++ {
  2116  		s[i] = x.get(i)
  2117  	}
  2118  }
  2119  
  2120  // StorePart stores a partial vector into the slice s.
  2121  func (x Uint32s) StorePart(s []uint32) int {
  2122  	x.Store(s)
  2123  	return min(len(s), x.Len())
  2124  }
  2125  
  2126  // String returns a string representation of the vector.
  2127  func (x Uint32s) String() string {
  2128  	var parts [4]uint32
  2129  	for i := 0; i < 4; i++ {
  2130  		parts[i] = x.get(i)
  2131  	}
  2132  	return fmt.Sprint(parts)
  2133  }
  2134  
  2135  // Sub returns the element-wise difference of x and y.
  2136  func (x Uint32s) Sub(y Uint32s) Uint32s {
  2137  	var res Uint32s
  2138  	for i := 0; i < 4; i++ {
  2139  		res.set(i, x.get(i)-y.get(i))
  2140  	}
  2141  	return res
  2142  }
  2143  
  2144  // Xor returns the bitwise XOR of x and y.
  2145  func (x Uint32s) Xor(y Uint32s) Uint32s {
  2146  	return Uint32s{a: x.a ^ y.a, b: x.b ^ y.b}
  2147  }
  2148  
  2149  // BitsToFloat32 reinterprets the vector bits as a Float32s vector.
  2150  func (x Uint32s) BitsToFloat32() Float32s {
  2151  	return Float32s{a: x.a, b: x.b}
  2152  }
  2153  
  2154  // BitsToInt32 reinterprets the vector bits as an Int32s vector.
  2155  func (x Uint32s) BitsToInt32() Int32s {
  2156  	return Int32s{a: x.a, b: x.b}
  2157  }
  2158  
  2159  // ConvertToInt32 converts the vector elements to int32.
  2160  func (x Uint32s) ConvertToInt32() Int32s {
  2161  	return Int32s{a: x.a, b: x.b}
  2162  }
  2163  
  2164  // ReshapeToUint16s reinterprets the vector bits as a Uint16s vector.
  2165  func (x Uint32s) ReshapeToUint16s() Uint16s {
  2166  	return Uint16s{a: x.a, b: x.b}
  2167  }
  2168  
  2169  // ReshapeToUint64s reinterprets the vector bits as a Uint64s vector.
  2170  func (x Uint32s) ReshapeToUint64s() Uint64s {
  2171  	return Uint64s{a: x.a, b: x.b}
  2172  }
  2173  
  2174  // ReshapeToUint8s reinterprets the vector bits as a Uint8s vector.
  2175  func (x Uint32s) ReshapeToUint8s() Uint8s {
  2176  	return Uint8s{a: x.a, b: x.b}
  2177  }
  2178  
  2179  // LoadUint64s loads a slice of uint64 into an Uint64s vector.
  2180  func LoadUint64s(s []uint64) Uint64s {
  2181  	var a, b uint64
  2182  	a = s[0]
  2183  	b = s[1]
  2184  	return Uint64s{a: a, b: b}
  2185  }
  2186  
  2187  // LoadUint64sPart loads a partial slice of uint64 into an Uint64s vector.
  2188  func LoadUint64sPart(s []uint64) (Uint64s, int) {
  2189  	n := len(s)
  2190  	var a, b uint64
  2191  	if n > 0 {
  2192  		a = s[0]
  2193  	}
  2194  	if n > 1 {
  2195  		b = s[1]
  2196  	}
  2197  	return Uint64s{a: a, b: b}, n
  2198  }
  2199  
  2200  func (x Uint64s) get(i int) uint64 {
  2201  	if i == 0 {
  2202  		return x.a
  2203  	}
  2204  	return x.b
  2205  }
  2206  
  2207  func (x *Uint64s) set(i int, v uint64) {
  2208  	if i == 0 {
  2209  		x.a = v
  2210  	} else {
  2211  		x.b = v
  2212  	}
  2213  }
  2214  
  2215  // Add returns the element-wise sum of x and y.
  2216  func (x Uint64s) Add(y Uint64s) Uint64s {
  2217  	return Uint64s{a: x.a + y.a, b: x.b + y.b}
  2218  }
  2219  
  2220  // And returns the bitwise AND of x and y.
  2221  func (x Uint64s) And(y Uint64s) Uint64s {
  2222  	return Uint64s{a: x.a & y.a, b: x.b & y.b}
  2223  }
  2224  
  2225  // AndNot returns the bitwise AND NOT of x and y.
  2226  func (x Uint64s) AndNot(y Uint64s) Uint64s {
  2227  	return Uint64s{a: x.a &^ y.a, b: x.b &^ y.b}
  2228  }
  2229  
  2230  // Equal returns a mask indicating where x and y are equal.
  2231  func (x Uint64s) Equal(y Uint64s) Mask64s {
  2232  	var res Mask64s
  2233  	if x.a == y.a {
  2234  		res.a = ^uint64(0)
  2235  	}
  2236  	if x.b == y.b {
  2237  		res.b = ^uint64(0)
  2238  	}
  2239  	return res
  2240  }
  2241  
  2242  // Greater returns a mask indicating where x is greater than y.
  2243  func (x Uint64s) Greater(y Uint64s) Mask64s {
  2244  	var res Mask64s
  2245  	for i := 0; i < 2; i++ {
  2246  		if x.get(i) > y.get(i) {
  2247  			res.set(i, true)
  2248  		}
  2249  	}
  2250  	return res
  2251  }
  2252  
  2253  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
  2254  func (x Uint64s) GreaterEqual(y Uint64s) Mask64s {
  2255  	var res Mask64s
  2256  	for i := 0; i < 2; i++ {
  2257  		if x.get(i) >= y.get(i) {
  2258  			res.set(i, true)
  2259  		}
  2260  	}
  2261  	return res
  2262  }
  2263  
  2264  // Less returns a mask indicating where x is less than y.
  2265  func (x Uint64s) Less(y Uint64s) Mask64s {
  2266  	var res Mask64s
  2267  	for i := 0; i < 2; i++ {
  2268  		if x.get(i) < y.get(i) {
  2269  			res.set(i, true)
  2270  		}
  2271  	}
  2272  	return res
  2273  }
  2274  
  2275  // LessEqual returns a mask indicating where x is less than or equal to y.
  2276  func (x Uint64s) LessEqual(y Uint64s) Mask64s {
  2277  	var res Mask64s
  2278  	for i := 0; i < 2; i++ {
  2279  		if x.get(i) <= y.get(i) {
  2280  			res.set(i, true)
  2281  		}
  2282  	}
  2283  	return res
  2284  }
  2285  
  2286  // NotEqual returns a mask indicating where x and y are not equal.
  2287  func (x Uint64s) NotEqual(y Uint64s) Mask64s {
  2288  	var res Mask64s
  2289  	if x.a != y.a {
  2290  		res.a = ^uint64(0)
  2291  	}
  2292  	if x.b != y.b {
  2293  		res.b = ^uint64(0)
  2294  	}
  2295  	return res
  2296  }
  2297  
  2298  // Len returns the number of elements in the vector.
  2299  func (x Uint64s) Len() int {
  2300  	return 2
  2301  }
  2302  
  2303  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  2304  func (x Uint64s) Masked(mask Mask64s) Uint64s {
  2305  	return Uint64s{a: x.a & mask.a, b: x.b & mask.b}
  2306  }
  2307  
  2308  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  2309  func (x Uint64s) IfElse(mask Mask64s, y Uint64s) Uint64s {
  2310  	return Uint64s{
  2311  		a: (x.a & mask.a) | (y.a &^ mask.a),
  2312  		b: (x.b & mask.b) | (y.b &^ mask.b),
  2313  	}
  2314  }
  2315  
  2316  // Not returns the bitwise NOT of x.
  2317  func (x Uint64s) Not() Uint64s {
  2318  	return Uint64s{a: ^x.a, b: ^x.b}
  2319  }
  2320  
  2321  // Or returns the bitwise OR of x and y.
  2322  func (x Uint64s) Or(y Uint64s) Uint64s {
  2323  	return Uint64s{a: x.a | y.a, b: x.b | y.b}
  2324  }
  2325  
  2326  // ShiftAllLeft shifts all elements left by y bits.
  2327  func (x Uint64s) ShiftAllLeft(y uint8) Uint64s {
  2328  	return Uint64s{a: x.a << y, b: x.b << y}
  2329  }
  2330  
  2331  // ShiftAllRight shifts all elements right by y bits.
  2332  func (x Uint64s) ShiftAllRight(y uint8) Uint64s {
  2333  	return Uint64s{a: x.a >> y, b: x.b >> y}
  2334  }
  2335  
  2336  // RotateAllLeft rotates all elements left by dist bits.
  2337  func (x Uint64s) RotateAllLeft(dist uint64) Uint64s {
  2338  	d := dist & 63
  2339  	return Uint64s{
  2340  		a: (x.a << d) | (x.a >> ((64 - d) & 63)),
  2341  		b: (x.b << d) | (x.b >> ((64 - d) & 63)),
  2342  	}
  2343  }
  2344  
  2345  // RotateAllRight rotates all elements right by dist bits.
  2346  func (x Uint64s) RotateAllRight(dist uint64) Uint64s {
  2347  	d := dist & 63
  2348  	return Uint64s{
  2349  		a: (x.a >> d) | (x.a << ((64 - d) & 63)),
  2350  		b: (x.b >> d) | (x.b << ((64 - d) & 63)),
  2351  	}
  2352  }
  2353  
  2354  // Store stores the vector elements into the slice s.
  2355  func (x Uint64s) Store(s []uint64) {
  2356  	if len(s) > 0 {
  2357  		s[0] = x.a
  2358  	}
  2359  	if len(s) > 1 {
  2360  		s[1] = x.b
  2361  	}
  2362  }
  2363  
  2364  // StorePart stores a partial vector into the slice s.
  2365  func (x Uint64s) StorePart(s []uint64) int {
  2366  	x.Store(s)
  2367  	return min(len(s), x.Len())
  2368  }
  2369  
  2370  // String returns a string representation of the vector.
  2371  func (x Uint64s) String() string {
  2372  	return fmt.Sprint([2]uint64{x.a, x.b})
  2373  }
  2374  
  2375  // Sub returns the element-wise difference of x and y.
  2376  func (x Uint64s) Sub(y Uint64s) Uint64s {
  2377  	return Uint64s{a: x.a - y.a, b: x.b - y.b}
  2378  }
  2379  
  2380  // Xor returns the bitwise XOR of x and y.
  2381  func (x Uint64s) Xor(y Uint64s) Uint64s {
  2382  	return Uint64s{a: x.a ^ y.a, b: x.b ^ y.b}
  2383  }
  2384  
  2385  // BitsToFloat64 reinterprets the vector bits as a Float64s vector.
  2386  func (x Uint64s) BitsToFloat64() Float64s {
  2387  	return Float64s{a: x.a, b: x.b}
  2388  }
  2389  
  2390  // BitsToInt64 reinterprets the vector bits as an Int64s vector.
  2391  func (x Uint64s) BitsToInt64() Int64s {
  2392  	return Int64s{a: x.a, b: x.b}
  2393  }
  2394  
  2395  // ConvertToInt64 converts the vector elements to int64.
  2396  func (x Uint64s) ConvertToInt64() Int64s {
  2397  	return Int64s{a: x.a, b: x.b}
  2398  }
  2399  
  2400  // ReshapeToUint16s reinterprets the vector bits as a Uint16s vector.
  2401  func (x Uint64s) ReshapeToUint16s() Uint16s {
  2402  	return Uint16s{a: x.a, b: x.b}
  2403  }
  2404  
  2405  // ReshapeToUint32s reinterprets the vector bits as a Uint32s vector.
  2406  func (x Uint64s) ReshapeToUint32s() Uint32s {
  2407  	return Uint32s{a: x.a, b: x.b}
  2408  }
  2409  
  2410  // ReshapeToUint8s reinterprets the vector bits as a Uint8s vector.
  2411  func (x Uint64s) ReshapeToUint8s() Uint8s {
  2412  	return Uint8s{a: x.a, b: x.b}
  2413  }
  2414  
  2415  // LoadFloat32s loads a slice of float32 into an Float32s vector.
  2416  func LoadFloat32s(s []float32) Float32s {
  2417  	var a, b uint64
  2418  	for i := 0; i < 4; i++ {
  2419  		val := uint64(math.Float32bits(s[i]))
  2420  		if i < 2 {
  2421  			a |= val << (32 * i)
  2422  		} else {
  2423  			b |= val << (32 * (i - 2))
  2424  		}
  2425  	}
  2426  	return Float32s{a: a, b: b}
  2427  }
  2428  
  2429  // LoadFloat32sPart loads a partial slice of float32 into an Float32s vector.
  2430  func LoadFloat32sPart(s []float32) (Float32s, int) {
  2431  	var a, b uint64
  2432  	n := len(s)
  2433  	if n > 4 {
  2434  		n = 4
  2435  	}
  2436  	for i := 0; i < n; i++ {
  2437  		val := uint64(math.Float32bits(s[i]))
  2438  		if i < 2 {
  2439  			a |= val << (32 * i)
  2440  		} else {
  2441  			b |= val << (32 * (i - 2))
  2442  		}
  2443  	}
  2444  	return Float32s{a: a, b: b}, n
  2445  }
  2446  
  2447  func (x Float32s) get(i int) float32 {
  2448  	if i < 2 {
  2449  		return math.Float32frombits(uint32(x.a >> (32 * i)))
  2450  	}
  2451  	return math.Float32frombits(uint32(x.b >> (32 * (i - 2))))
  2452  }
  2453  
  2454  func (x *Float32s) set(i int, v float32) {
  2455  	val := uint64(math.Float32bits(v))
  2456  	if i < 2 {
  2457  		mask := uint64(0xffffffff) << (32 * i)
  2458  		x.a = (x.a &^ mask) | (val << (32 * i))
  2459  	} else {
  2460  		mask := uint64(0xffffffff) << (32 * (i - 2))
  2461  		x.b = (x.b &^ mask) | (val << (32 * (i - 2)))
  2462  	}
  2463  }
  2464  
  2465  // Abs returns the element-wise absolute value of x.
  2466  func (x Float32s) Abs() Float32s {
  2467  	var res Float32s
  2468  	for i := 0; i < 4; i++ {
  2469  		v := x.get(i)
  2470  		if v < 0 {
  2471  			res.set(i, -v)
  2472  		} else {
  2473  			res.set(i, v)
  2474  		}
  2475  	}
  2476  	return res
  2477  }
  2478  
  2479  // Add returns the element-wise sum of x and y.
  2480  func (x Float32s) Add(y Float32s) Float32s {
  2481  	var res Float32s
  2482  	res.set(0, x.get(0)+y.get(0))
  2483  	res.set(1, x.get(1)+y.get(1))
  2484  	res.set(2, x.get(2)+y.get(2))
  2485  	res.set(3, x.get(3)+y.get(3))
  2486  	return res
  2487  }
  2488  
  2489  // ConvertToInt32 converts the vector elements to int32.
  2490  func (x Float32s) ConvertToInt32() Int32s {
  2491  	var res Int32s
  2492  	for i := 0; i < 4; i++ {
  2493  		res.set(i, int32(x.get(i)))
  2494  	}
  2495  	return res
  2496  }
  2497  
  2498  // Div returns the element-wise quotient of x and y.
  2499  func (x Float32s) Div(y Float32s) Float32s {
  2500  	var res Float32s
  2501  	for i := 0; i < 4; i++ {
  2502  		res.set(i, x.get(i)/y.get(i))
  2503  	}
  2504  	return res
  2505  }
  2506  
  2507  // Equal returns a mask indicating where x and y are equal.
  2508  func (x Float32s) Equal(y Float32s) Mask32s {
  2509  	var res Mask32s
  2510  	for i := 0; i < 4; i++ {
  2511  		if x.get(i) == y.get(i) {
  2512  			res.set(i, true)
  2513  		}
  2514  	}
  2515  	return res
  2516  }
  2517  
  2518  // Greater returns a mask indicating where x is greater than y.
  2519  func (x Float32s) Greater(y Float32s) Mask32s {
  2520  	var res Mask32s
  2521  	for i := 0; i < 4; i++ {
  2522  		if x.get(i) > y.get(i) {
  2523  			res.set(i, true)
  2524  		}
  2525  	}
  2526  	return res
  2527  }
  2528  
  2529  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
  2530  func (x Float32s) GreaterEqual(y Float32s) Mask32s {
  2531  	var res Mask32s
  2532  	for i := 0; i < 4; i++ {
  2533  		if x.get(i) >= y.get(i) {
  2534  			res.set(i, true)
  2535  		}
  2536  	}
  2537  	return res
  2538  }
  2539  
  2540  // Len returns the number of elements in the vector.
  2541  func (x Float32s) Len() int {
  2542  	return 4
  2543  }
  2544  
  2545  // Less returns a mask indicating where x is less than y.
  2546  func (x Float32s) Less(y Float32s) Mask32s {
  2547  	var res Mask32s
  2548  	for i := 0; i < 4; i++ {
  2549  		if x.get(i) < y.get(i) {
  2550  			res.set(i, true)
  2551  		}
  2552  	}
  2553  	return res
  2554  }
  2555  
  2556  // LessEqual returns a mask indicating where x is less than or equal to y.
  2557  func (x Float32s) LessEqual(y Float32s) Mask32s {
  2558  	var res Mask32s
  2559  	for i := 0; i < 4; i++ {
  2560  		if x.get(i) <= y.get(i) {
  2561  			res.set(i, true)
  2562  		}
  2563  	}
  2564  	return res
  2565  }
  2566  
  2567  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  2568  func (x Float32s) Masked(mask Mask32s) Float32s {
  2569  	return Float32s{a: x.a & mask.a, b: x.b & mask.b}
  2570  }
  2571  
  2572  // Max returns the element-wise maximum of x and y.
  2573  func (x Float32s) Max(y Float32s) Float32s {
  2574  	var res Float32s
  2575  	for i := 0; i < 4; i++ {
  2576  		vx := x.get(i)
  2577  		vy := y.get(i)
  2578  		if vx > vy {
  2579  			res.set(i, vx)
  2580  		} else {
  2581  			res.set(i, vy)
  2582  		}
  2583  	}
  2584  	return res
  2585  }
  2586  
  2587  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  2588  func (x Float32s) IfElse(mask Mask32s, y Float32s) Float32s {
  2589  	return Float32s{
  2590  		a: (x.a & mask.a) | (y.a &^ mask.a),
  2591  		b: (x.b & mask.b) | (y.b &^ mask.b),
  2592  	}
  2593  }
  2594  
  2595  // Min returns the element-wise minimum of x and y.
  2596  func (x Float32s) Min(y Float32s) Float32s {
  2597  	var res Float32s
  2598  	for i := 0; i < 4; i++ {
  2599  		vx := x.get(i)
  2600  		vy := y.get(i)
  2601  		if vx < vy {
  2602  			res.set(i, vx)
  2603  		} else {
  2604  			res.set(i, vy)
  2605  		}
  2606  	}
  2607  	return res
  2608  }
  2609  
  2610  // Mul returns the element-wise product of x and y.
  2611  func (x Float32s) Mul(y Float32s) Float32s {
  2612  	var res Float32s
  2613  	res.set(0, x.get(0)*y.get(0))
  2614  	res.set(1, x.get(1)*y.get(1))
  2615  	res.set(2, x.get(2)*y.get(2))
  2616  	res.set(3, x.get(3)*y.get(3))
  2617  
  2618  	return res
  2619  }
  2620  
  2621  // MulAdd returns x * y + z element-wise.
  2622  func (x Float32s) MulAdd(y, z Float32s) Float32s {
  2623  	var res Float32s
  2624  
  2625  	res.set(0, x.get(0)*y.get(0)+z.get(0))
  2626  	res.set(1, x.get(1)*y.get(1)+z.get(1))
  2627  	res.set(2, x.get(2)*y.get(2)+z.get(2))
  2628  	res.set(3, x.get(3)*y.get(3)+z.get(3))
  2629  	return res
  2630  }
  2631  
  2632  // Neg returns the element-wise negation of x.
  2633  func (x Float32s) Neg() Float32s {
  2634  	var res Float32s
  2635  	for i := 0; i < 4; i++ {
  2636  		res.set(i, -(x.get(i)))
  2637  	}
  2638  	return res
  2639  }
  2640  
  2641  // NotEqual returns a mask indicating where x and y are not equal.
  2642  func (x Float32s) NotEqual(y Float32s) Mask32s {
  2643  	var res Mask32s
  2644  	for i := 0; i < 4; i++ {
  2645  		if x.get(i) != y.get(i) {
  2646  			res.set(i, true)
  2647  		}
  2648  	}
  2649  	return res
  2650  }
  2651  
  2652  // Sqrt returns the element-wise square root of x.
  2653  func (x Float32s) Sqrt() Float32s {
  2654  	var res Float32s
  2655  	for i := 0; i < 4; i++ {
  2656  		res.set(i, float32(math.Sqrt(float64(x.get(i)))))
  2657  	}
  2658  	return res
  2659  }
  2660  
  2661  // Store stores the vector elements into the slice s.
  2662  func (x Float32s) Store(s []float32) {
  2663  	for i := 0; i < 4 && i < len(s); i++ {
  2664  		s[i] = x.get(i)
  2665  	}
  2666  }
  2667  
  2668  // StorePart stores a partial vector into the slice s.
  2669  func (x Float32s) StorePart(s []float32) int {
  2670  	x.Store(s)
  2671  	return min(len(s), x.Len())
  2672  }
  2673  
  2674  // String returns a string representation of the vector.
  2675  func (x Float32s) String() string {
  2676  	var parts [4]float32
  2677  	for i := 0; i < 4; i++ {
  2678  		parts[i] = x.get(i)
  2679  	}
  2680  	return fmt.Sprint(parts)
  2681  }
  2682  
  2683  // Sub returns the element-wise difference of x and y.
  2684  func (x Float32s) Sub(y Float32s) Float32s {
  2685  	var res Float32s
  2686  	for i := 0; i < 4; i++ {
  2687  		res.set(i, x.get(i)-y.get(i))
  2688  	}
  2689  	return res
  2690  }
  2691  
  2692  // ToBits reinterprets the vector bits as a Uint32s vector.
  2693  func (x Float32s) ToBits() Uint32s {
  2694  	return Uint32s{a: x.a, b: x.b}
  2695  }
  2696  
  2697  // LoadFloat64s loads a slice of float64 into an Float64s vector.
  2698  func LoadFloat64s(s []float64) Float64s {
  2699  	var a, b uint64
  2700  	a = math.Float64bits(s[0])
  2701  	b = math.Float64bits(s[1])
  2702  	return Float64s{a: a, b: b}
  2703  }
  2704  
  2705  // LoadFloat64sPart loads a partial slice of float64 into an Float64s vector.
  2706  func LoadFloat64sPart(s []float64) (Float64s, int) {
  2707  	n := len(s)
  2708  	var a, b uint64
  2709  	if n > 0 {
  2710  		a = math.Float64bits(s[0])
  2711  	}
  2712  	if n > 1 {
  2713  		b = math.Float64bits(s[1])
  2714  	}
  2715  	return Float64s{a: a, b: b}, n
  2716  }
  2717  
  2718  func (x Float64s) get(i int) float64 {
  2719  	if i == 0 {
  2720  		return math.Float64frombits(x.a)
  2721  	}
  2722  	return math.Float64frombits(x.b)
  2723  }
  2724  
  2725  func (x *Float64s) set(i int, v float64) {
  2726  	if i == 0 {
  2727  		x.a = math.Float64bits(v)
  2728  	} else {
  2729  		x.b = math.Float64bits(v)
  2730  	}
  2731  }
  2732  
  2733  // Abs returns the element-wise absolute value of x.
  2734  func (x Float64s) Abs() Float64s {
  2735  	var res Float64s
  2736  	for i := 0; i < 4; i++ {
  2737  		v := x.get(i)
  2738  		if v < 0 {
  2739  			res.set(i, -v)
  2740  		} else {
  2741  			res.set(i, v)
  2742  		}
  2743  	}
  2744  	return res
  2745  }
  2746  
  2747  // Add returns the element-wise sum of x and y.
  2748  func (x Float64s) Add(y Float64s) Float64s {
  2749  	var res Float64s
  2750  	res.set(0, x.get(0)+y.get(0))
  2751  	res.set(1, x.get(1)+y.get(1))
  2752  	return res
  2753  }
  2754  
  2755  // Div returns the element-wise quotient of x and y.
  2756  func (x Float64s) Div(y Float64s) Float64s {
  2757  	var res Float64s
  2758  	res.set(0, x.get(0)/y.get(0))
  2759  	res.set(1, x.get(1)/y.get(1))
  2760  	return res
  2761  }
  2762  
  2763  // Equal returns a mask indicating where x and y are equal.
  2764  func (x Float64s) Equal(y Float64s) Mask64s {
  2765  	var res Mask64s
  2766  	if x.get(0) == y.get(0) {
  2767  		res.a = ^uint64(0)
  2768  	}
  2769  	if x.get(1) == y.get(1) {
  2770  		res.b = ^uint64(0)
  2771  	}
  2772  	return res
  2773  }
  2774  
  2775  // Greater returns a mask indicating where x is greater than y.
  2776  func (x Float64s) Greater(y Float64s) Mask64s {
  2777  	var res Mask64s
  2778  	if x.get(0) > y.get(0) {
  2779  		res.a = ^uint64(0)
  2780  	}
  2781  	if x.get(1) > y.get(1) {
  2782  		res.b = ^uint64(0)
  2783  	}
  2784  	return res
  2785  }
  2786  
  2787  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
  2788  func (x Float64s) GreaterEqual(y Float64s) Mask64s {
  2789  	var res Mask64s
  2790  	if x.get(0) >= y.get(0) {
  2791  		res.a = ^uint64(0)
  2792  	}
  2793  	if x.get(1) >= y.get(1) {
  2794  		res.b = ^uint64(0)
  2795  	}
  2796  	return res
  2797  }
  2798  
  2799  // Len returns the number of elements in the vector.
  2800  func (x Float64s) Len() int {
  2801  	return 2
  2802  }
  2803  
  2804  // Less returns a mask indicating where x is less than y.
  2805  func (x Float64s) Less(y Float64s) Mask64s {
  2806  	var res Mask64s
  2807  	if x.get(0) < y.get(0) {
  2808  		res.a = ^uint64(0)
  2809  	}
  2810  	if x.get(1) < y.get(1) {
  2811  		res.b = ^uint64(0)
  2812  	}
  2813  	return res
  2814  }
  2815  
  2816  // LessEqual returns a mask indicating where x is less than or equal to y.
  2817  func (x Float64s) LessEqual(y Float64s) Mask64s {
  2818  	var res Mask64s
  2819  	if x.get(0) <= y.get(0) {
  2820  		res.a = ^uint64(0)
  2821  	}
  2822  	if x.get(1) <= y.get(1) {
  2823  		res.b = ^uint64(0)
  2824  	}
  2825  	return res
  2826  }
  2827  
  2828  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  2829  func (x Float64s) Masked(mask Mask64s) Float64s {
  2830  	return Float64s{a: x.a & mask.a, b: x.b & mask.b}
  2831  }
  2832  
  2833  // Max returns the element-wise maximum of x and y.
  2834  func (x Float64s) Max(y Float64s) Float64s {
  2835  	var res Float64s
  2836  	vx := x.get(0)
  2837  	vy := y.get(0)
  2838  	if vx > vy {
  2839  		res.set(0, vx)
  2840  	} else {
  2841  		res.set(0, vy)
  2842  	}
  2843  	vx = x.get(1)
  2844  	vy = y.get(1)
  2845  	if vx > vy {
  2846  		res.set(1, vx)
  2847  	} else {
  2848  		res.set(1, vy)
  2849  	}
  2850  	return res
  2851  }
  2852  
  2853  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  2854  func (x Float64s) IfElse(mask Mask64s, y Float64s) Float64s {
  2855  	return Float64s{
  2856  		a: (x.a & mask.a) | (y.a &^ mask.a),
  2857  		b: (x.b & mask.b) | (y.b &^ mask.b),
  2858  	}
  2859  }
  2860  
  2861  // Min returns the element-wise minimum of x and y.
  2862  func (x Float64s) Min(y Float64s) Float64s {
  2863  	var res Float64s
  2864  	vx := x.get(0)
  2865  	vy := y.get(0)
  2866  	if vx < vy {
  2867  		res.set(0, vx)
  2868  	} else {
  2869  		res.set(0, vy)
  2870  	}
  2871  	vx = x.get(1)
  2872  	vy = y.get(1)
  2873  	if vx < vy {
  2874  		res.set(1, vx)
  2875  	} else {
  2876  		res.set(1, vy)
  2877  	}
  2878  	return res
  2879  }
  2880  
  2881  // Mul returns the element-wise product of x and y.
  2882  func (x Float64s) Mul(y Float64s) Float64s {
  2883  	var res Float64s
  2884  	res.set(0, x.get(0)*y.get(0))
  2885  	res.set(1, x.get(1)*y.get(1))
  2886  	return res
  2887  }
  2888  
  2889  // MulAdd returns x * y + z element-wise.
  2890  func (x Float64s) MulAdd(y, z Float64s) Float64s {
  2891  	var res Float64s
  2892  	res.set(0, x.get(0)*y.get(0)+z.get(0))
  2893  	res.set(1, x.get(1)*y.get(1)+z.get(1))
  2894  	return res
  2895  }
  2896  
  2897  // Neg returns the element-wise negation of x.
  2898  func (x Float64s) Neg() Float64s {
  2899  	var res Float64s
  2900  	for i := 0; i < 4; i++ {
  2901  		res.set(i, -(x.get(i)))
  2902  	}
  2903  	return res
  2904  }
  2905  
  2906  // NotEqual returns a mask indicating where x and y are not equal.
  2907  func (x Float64s) NotEqual(y Float64s) Mask64s {
  2908  	var res Mask64s
  2909  	if x.get(0) != y.get(0) {
  2910  		res.a = ^uint64(0)
  2911  	}
  2912  	if x.get(1) != y.get(1) {
  2913  		res.b = ^uint64(0)
  2914  	}
  2915  	return res
  2916  }
  2917  
  2918  // Sqrt returns the element-wise square root of x.
  2919  func (x Float64s) Sqrt() Float64s {
  2920  	var res Float64s
  2921  	res.set(0, math.Sqrt(x.get(0)))
  2922  	res.set(1, math.Sqrt(x.get(1)))
  2923  	return res
  2924  }
  2925  
  2926  // Store stores the vector elements into the slice s.
  2927  func (x Float64s) Store(s []float64) {
  2928  	if len(s) > 0 {
  2929  		s[0] = x.get(0)
  2930  	}
  2931  	if len(s) > 1 {
  2932  		s[1] = x.get(1)
  2933  	}
  2934  }
  2935  
  2936  // StorePart stores a partial vector into the slice s.
  2937  func (x Float64s) StorePart(s []float64) int {
  2938  	x.Store(s)
  2939  	return min(len(s), x.Len())
  2940  }
  2941  
  2942  // String returns a string representation of the vector.
  2943  func (x Float64s) String() string {
  2944  	return fmt.Sprint([2]float64{x.get(0), x.get(1)})
  2945  }
  2946  
  2947  // Sub returns the element-wise difference of x and y.
  2948  func (x Float64s) Sub(y Float64s) Float64s {
  2949  	var res Float64s
  2950  	res.set(0, x.get(0)-y.get(0))
  2951  	res.set(1, x.get(1)-y.get(1))
  2952  	return res
  2953  }
  2954  
  2955  // ToBits reinterprets the vector bits as a Uint64s vector.
  2956  func (x Float64s) ToBits() Uint64s {
  2957  	return Uint64s{a: x.a, b: x.b}
  2958  }
  2959  
  2960  func (x *Mask8s) set(i int, v bool) {
  2961  	if v {
  2962  		if i < 8 {
  2963  			mask := uint64(0xff) << (8 * i)
  2964  			x.a |= mask
  2965  		} else {
  2966  			mask := uint64(0xff) << (8 * (i - 8))
  2967  			x.b |= mask
  2968  		}
  2969  	}
  2970  }
  2971  
  2972  // And returns the bitwise AND of x and y.
  2973  func (x Mask8s) And(y Mask8s) Mask8s {
  2974  	return Mask8s{a: x.a & y.a, b: x.b & y.b}
  2975  }
  2976  
  2977  // Or returns the bitwise OR of x and y.
  2978  func (x Mask8s) Or(y Mask8s) Mask8s {
  2979  	return Mask8s{a: x.a | y.a, b: x.b | y.b}
  2980  }
  2981  
  2982  // String returns a string representation of the vector.
  2983  func (x Mask8s) String() string {
  2984  	return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
  2985  }
  2986  
  2987  // ToInt8s converts the mask to an Int8s vector.
  2988  func (x Mask8s) ToInt8s() Int8s {
  2989  	return Int8s{a: x.a, b: x.b}
  2990  }
  2991  
  2992  func (x *Mask16s) set(i int, v bool) {
  2993  	if v {
  2994  		if i < 4 {
  2995  			mask := uint64(0xffff) << (16 * i)
  2996  			x.a |= mask
  2997  		} else {
  2998  			mask := uint64(0xffff) << (16 * (i - 4))
  2999  			x.b |= mask
  3000  		}
  3001  	}
  3002  }
  3003  
  3004  // And returns the bitwise AND of x and y.
  3005  func (x Mask16s) And(y Mask16s) Mask16s {
  3006  	return Mask16s{a: x.a & y.a, b: x.b & y.b}
  3007  }
  3008  
  3009  // Or returns the bitwise OR of x and y.
  3010  func (x Mask16s) Or(y Mask16s) Mask16s {
  3011  	return Mask16s{a: x.a | y.a, b: x.b | y.b}
  3012  }
  3013  
  3014  // String returns a string representation of the vector.
  3015  func (x Mask16s) String() string {
  3016  	return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
  3017  }
  3018  
  3019  // ToInt16s converts the mask to an Int16s vector.
  3020  func (x Mask16s) ToInt16s() Int16s {
  3021  	return Int16s{a: x.a, b: x.b}
  3022  }
  3023  
  3024  func (x *Mask32s) set(i int, v bool) {
  3025  	if v {
  3026  		if i < 2 {
  3027  			mask := uint64(0xffffffff) << (32 * i)
  3028  			x.a |= mask
  3029  		} else {
  3030  			mask := uint64(0xffffffff) << (32 * (i - 2))
  3031  			x.b |= mask
  3032  		}
  3033  	}
  3034  }
  3035  
  3036  // And returns the bitwise AND of x and y.
  3037  func (x Mask32s) And(y Mask32s) Mask32s {
  3038  	return Mask32s{a: x.a & y.a, b: x.b & y.b}
  3039  }
  3040  
  3041  // Or returns the bitwise OR of x and y.
  3042  func (x Mask32s) Or(y Mask32s) Mask32s {
  3043  	return Mask32s{a: x.a | y.a, b: x.b | y.b}
  3044  }
  3045  
  3046  // String returns a string representation of the vector.
  3047  func (x Mask32s) String() string {
  3048  	return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
  3049  }
  3050  
  3051  // ToInt32s converts the mask to an Int32s vector.
  3052  func (x Mask32s) ToInt32s() Int32s {
  3053  	return Int32s{a: x.a, b: x.b}
  3054  }
  3055  
  3056  func (x *Mask64s) set(i int, v bool) {
  3057  	if v {
  3058  		if i == 0 {
  3059  			x.a = ^uint64(0)
  3060  		} else {
  3061  			x.b = ^uint64(0)
  3062  		}
  3063  	}
  3064  }
  3065  
  3066  // And returns the bitwise AND of x and y.
  3067  func (x Mask64s) And(y Mask64s) Mask64s {
  3068  	return Mask64s{a: x.a & y.a, b: x.b & y.b}
  3069  }
  3070  
  3071  // Or returns the bitwise OR of x and y.
  3072  func (x Mask64s) Or(y Mask64s) Mask64s {
  3073  	return Mask64s{a: x.a | y.a, b: x.b | y.b}
  3074  }
  3075  
  3076  // String returns a string representation of the vector.
  3077  func (x Mask64s) String() string {
  3078  	return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
  3079  }
  3080  
  3081  // ToInt64s converts the mask to an Int64s vector.
  3082  func (x Mask64s) ToInt64s() Int64s {
  3083  	return Int64s{a: x.a, b: x.b}
  3084  }
  3085  
  3086  func newT(lo, hi uint64) Uint64s {
  3087  	return Uint64s{a: lo, b: hi}
  3088  }
  3089  
  3090  // mwl returns the 128-bit product of the lower halves of x and y
  3091  func (x Uint64s) mwl(y Uint64s) Uint64s {
  3092  	hi, lo := bits.Mul64(x.a, y.a)
  3093  	return Uint64s{a: lo, b: hi}
  3094  }
  3095  
  3096  var (
  3097  	// For mK, bits J such that J mod 5 == K are set
  3098  	m0 = newT(0x1084210842108421, 0x2108421084210842)
  3099  	m1 = newT(0x2108421084210842, 0x4210842108421084)
  3100  	m2 = newT(0x4210842108421084, 0x8421084210842108)
  3101  	m3 = newT(0x8421084210842108, 0x0842108421084210)
  3102  	m4 = newT(0x0842108421084210, 0x1084210842108421)
  3103  )
  3104  
// clmul returns the 128-bit carryless (XOR-based) product of x.a and
// y.a, with the low 64 bits in the a field and the high 64 bits in b.
// The b fields of x and y do not affect the result because mwl reads
// only the a fields.
//
// Each operand is split by the masks m0..m4 into five pieces whose
// possible set bits are five positions apart, and the pieces are
// multiplied with the ordinary integer multiply in mwl. In a product
// xI*yJ every partial-product bit lands on a position congruent to
// I+J mod 5. At most 13 of them can share one position, so the carries
// of the integer additions stay inside the four gap bits above it and
// never reach the next such position. The bit at each such position is
// therefore the parity (XOR) of the contributions, which is the
// carryless product bit; the gap bits hold carry residue and are
// removed by the mask.
func (x Uint64s) clmul(y Uint64s) Uint64s {
	// Split x into its five residue classes of bit positions.
	x0 := x.And(m0)
	x1 := x.And(m1)
	x2 := x.And(m2)
	x3 := x.And(m3)
	x4 := x.And(m4)

	// Split y the same way.
	y0 := y.And(m0)
	y1 := y.And(m1)
	y2 := y.And(m2)
	y3 := y.And(m3)
	y4 := y.And(m4)

	// sum of x, y indices == K mod 5; mask index = K
	// Each line XORs the five piece products that feed positions
	// congruent to K, keeps only those positions, and merges them into z.
	z := (x0.mwl(y0)).Xor(x1.mwl(y4)).Xor(x4.mwl(y1)).Xor(x2.mwl(y3)).Xor(x3.mwl(y2)).And(m0)
	z = (x3.mwl(y3)).Xor(x2.mwl(y4)).Xor(x4.mwl(y2)).Xor(x0.mwl(y1)).Xor(x1.mwl(y0)).And(m1).Or(z)
	z = (x1.mwl(y1)).Xor(x3.mwl(y4)).Xor(x4.mwl(y3)).Xor(x0.mwl(y2)).Xor(x2.mwl(y0)).And(m2).Or(z)
	z = (x4.mwl(y4)).Xor(x0.mwl(y3)).Xor(x3.mwl(y0)).Xor(x1.mwl(y2)).Xor(x2.mwl(y1)).And(m3).Or(z)
	z = (x2.mwl(y2)).Xor(x0.mwl(y4)).Xor(x4.mwl(y0)).Xor(x1.mwl(y3)).Xor(x3.mwl(y1)).And(m4).Or(z)

	return z
}
  3127  
  3128  // CarrylessMultiplyEven computes the carryless
  3129  // multiplications of selected even halves of the elements of x and y.
  3130  // The result fills the 128 bits of each even-odd pair.
  3131  //
  3132  // A carryless multiplication uses bitwise XOR instead of
  3133  // add-with-carry, for example (in base two):
  3134  //
  3135  //	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
  3136  //
  3137  // This also models multiplication of polynomials with coefficients
  3138  // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
  3139  // x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
  3140  // polynomial terms, but coefficients "add" with XOR.)
  3141  func (x Uint64s) CarrylessMultiplyEven(y Uint64s) Uint64s {
  3142  	return x.clmul(y)
  3143  }
  3144  
  3145  // CarrylessMultiplyOdd computes the carryless
  3146  // multiplications of selected odd halves of the elements of x and y.
  3147  // The result fills the 128 bits of each even-odd pair.
  3148  //
  3149  // A carryless multiplication uses bitwise XOR instead of
  3150  // add-with-carry, for example (in base two):
  3151  //
  3152  //	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
  3153  //
  3154  // This also models multiplication of polynomials with coefficients
  3155  // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
  3156  // x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
  3157  // polynomial terms, but coefficients "add" with XOR.)
  3158  func (x Uint64s) CarrylessMultiplyOdd(y Uint64s) Uint64s {
  3159  	x.a = x.b
  3160  	y.a = y.b
  3161  	return x.clmul(y)
  3162  }
  3163  
  3164  const (
  3165  	by8  = 0x0101010101010101
  3166  	by16 = 0x0001000100010001
  3167  )
  3168  
  3169  // BroadcastInt8 fills the elements of a slice with its argument value.
  3170  func BroadcastInt8s(x int8) Int8s {
  3171  	v := (255 & uint64(x)) * by8
  3172  	return Int8s{a: v, b: v}
  3173  }
  3174  
  3175  // BroadcastInt16 fills the elements of a slice with its argument value.
  3176  func BroadcastInt16s(x int16) Int16s {
  3177  	v := (65535 & uint64(x)) * by16
  3178  	return Int16s{a: v, b: v}
  3179  }
  3180  
  3181  // BroadcastInt32 fills the elements of a slice with its argument value.
  3182  func BroadcastInt32s(x int32) Int32s {
  3183  	v := uint64(x) & 0xffffffff
  3184  	v = v<<32 | v
  3185  	return Int32s{a: v, b: v}
  3186  }
  3187  
  3188  // BroadcastInt64 fills the elements of a slice with its argument value.
  3189  func BroadcastInt64s(x int64) Int64s {
  3190  	v := uint64(x)
  3191  	return Int64s{a: v, b: v}
  3192  }
  3193  
  3194  // BroadcastUint8 fills the elements of a slice with its argument value.
  3195  func BroadcastUint8s(x uint8) Uint8s {
  3196  	v := uint64(x) * by8
  3197  	return Uint8s{a: v, b: v}
  3198  
  3199  }
  3200  
  3201  // BroadcastUint16 fills the elements of a slice with its argument value.
  3202  func BroadcastUint16s(x uint16) Uint16s {
  3203  	v := uint64(x) * by16
  3204  	return Uint16s{a: v, b: v}
  3205  
  3206  }
  3207  
  3208  // BroadcastUint32 fills the elements of a slice with its argument value.
  3209  func BroadcastUint32s(x uint32) Uint32s {
  3210  	v := uint64(x)
  3211  	v = v<<32 | v
  3212  	return Uint32s{a: v, b: v}
  3213  }
  3214  
  3215  // BroadcastUint64 fills the elements of a slice with its argument value.
  3216  func BroadcastUint64s(x uint64) Uint64s {
  3217  	return Uint64s{a: x, b: x}
  3218  }
  3219  
  3220  // BroadcastFloat32 fills the elements of a slice with its argument value.
  3221  func BroadcastFloat32s(x float32) Float32s {
  3222  	v := uint64(math.Float32bits(x))
  3223  	v = v<<32 | v
  3224  	return Float32s{a: v, b: v}
  3225  }
  3226  
  3227  // BroadcastFloat64 fills the elements of a slice with its argument value.
  3228  func BroadcastFloat64s(x float64) Float64s {
  3229  	v := math.Float64bits(x)
  3230  	return Float64s{a: v, b: v}
  3231  }
  3232  

View as plain text