Text file src/math/big/arith_riscv64.s

     1  // Copyright 2020 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !math_big_pure_go && riscv64
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  TEXT ·addVV(SB),NOSPLIT,$0	// z[i] = x[i] + y[i] + carry for z_len words; the final carry is returned in c
    13  	MOV	x+24(FP), X5	// X5 = pointer to the current x word
    14  	MOV	y+48(FP), X6	// X6 = pointer to the current y word
    15  	MOV	z+0(FP), X7	// X7 = pointer to the current z word
    16  	MOV	z_len+8(FP), X30	// X30 = number of words left to process
    17  
    18  	MOV	$4, X28	// X28 = 4, the number of words handled per loop4 iteration
    19  	MOV	$0, X29		// c = 0
    20  
    21  	BEQZ	X30, done	// nothing to do: return c = 0
    22  	BLTU	X30, X28, loop1	// fewer than 4 words: use the one-word loop only
    23  
    24  loop4:	// 4 words per iteration; X29 carries c from one word to the next
    25  	MOV	0(X5), X8	// x[0]
    26  	MOV	0(X6), X9	// y[0]
    27  	MOV	8(X5), X11	// x[1]
    28  	MOV	8(X6), X12	// y[1]
    29  	MOV	16(X5), X14	// x[2]
    30  	MOV	16(X6), X15	// y[2]
    31  	MOV	24(X5), X17	// x[3]
    32  	MOV	24(X6), X18	// y[3]
    33  
    34  	ADD	X8, X9, X21	// z[0] = x[0] + y[0]
    35  	SLTU	X8, X21, X22	// X22 = 1 if x[0] + y[0] wrapped (sum < x[0])
    36  	ADD	X21, X29, X10	// z[0] = x[0] + y[0] + c
    37  	SLTU	X21, X10, X23	// X23 = 1 if adding c wrapped
    38  	ADD	X22, X23, X29	// next c (at most one of the two carries is set)
    39  
    40  	ADD	X11, X12, X24	// z[1] = x[1] + y[1]
    41  	SLTU	X11, X24, X25	// carry out of x[1] + y[1]
    42  	ADD	X24, X29, X13	// z[1] = x[1] + y[1] + c
    43  	SLTU	X24, X13, X26	// carry out of adding c
    44  	ADD	X25, X26, X29	// next c
    45  
    46  	ADD	X14, X15, X21	// z[2] = x[2] + y[2]
    47  	SLTU	X14, X21, X22	// carry out of x[2] + y[2]
    48  	ADD	X21, X29, X16	// z[2] = x[2] + y[2] + c
    49  	SLTU	X21, X16, X23	// carry out of adding c
    50  	ADD	X22, X23, X29	// next c
    51  
    52  	ADD	X17, X18, X21	// z[3] = x[3] + y[3]
    53  	SLTU	X17, X21, X22	// carry out of x[3] + y[3]
    54  	ADD	X21, X29, X19	// z[3] = x[3] + y[3] + c
    55  	SLTU	X21, X19, X23	// carry out of adding c
    56  	ADD	X22, X23, X29	// next c
    57  
    58  	MOV	X10, 0(X7)	// z[0]
    59  	MOV	X13, 8(X7)	// z[1]
    60  	MOV	X16, 16(X7)	// z[2]
    61  	MOV	X19, 24(X7)	// z[3]
    62  
    63  	ADD	$32, X5	// advance x by 4 words
    64  	ADD	$32, X6	// advance y by 4 words
    65  	ADD	$32, X7	// advance z by 4 words
    66  	SUB	$4, X30	// 4 fewer words remain
    67  
    68  	BGEU	X30, X28, loop4	// at least 4 words remain: run another unrolled iteration
    69  	BEQZ	X30, done	// length was a multiple of 4: no tail to process
    70  
    71  loop1:	// one word per iteration; handles the remaining 1-3 words
    72  	MOV	0(X5), X10	// x
    73  	MOV	0(X6), X11	// y
    74  
    75  	ADD	X10, X11, X12	// z = x + y
    76  	SLTU	X10, X12, X14	// carry out of x + y
    77  	ADD	X12, X29, X13	// z = x + y + c
    78  	SLTU	X12, X13, X15	// carry out of adding c
    79  	ADD	X14, X15, X29	// next c
    80  
    81  	MOV	X13, 0(X7)	// z
    82  
    83  	ADD	$8, X5	// advance x by 1 word
    84  	ADD	$8, X6	// advance y by 1 word
    85  	ADD	$8, X7	// advance z by 1 word
    86  	SUB	$1, X30	// 1 fewer word remains
    87  
    88  	BNEZ	X30, loop1	// repeat until no words remain
    89  
    90  done:
    91  	MOV	X29, c+72(FP)	// return c
    92  	RET
    93  
    94  TEXT ·subVV(SB),NOSPLIT,$0	// z[i] = x[i] - y[i] - borrow for z_len words; the final borrow is returned in c
    95  	MOV	x+24(FP), X5	// X5 = pointer to the current x word
    96  	MOV	y+48(FP), X6	// X6 = pointer to the current y word
    97  	MOV	z+0(FP), X7	// X7 = pointer to the current z word
    98  	MOV	z_len+8(FP), X30	// X30 = number of words left to process
    99  
   100  	MOV	$4, X28	// X28 = 4, the number of words handled per loop4 iteration
   101  	MOV	$0, X29		// b = 0
   102  
   103  	BEQZ	X30, done	// nothing to do: return b = 0
   104  	BLTU	X30, X28, loop1	// fewer than 4 words: use the one-word loop only
   105  
   106  loop4:	// 4 words per iteration; X29 carries b from one word to the next
   107  	MOV	0(X5), X8	// x[0]
   108  	MOV	0(X6), X9	// y[0]
   109  	MOV	8(X5), X11	// x[1]
   110  	MOV	8(X6), X12	// y[1]
   111  	MOV	16(X5), X14	// x[2]
   112  	MOV	16(X6), X15	// y[2]
   113  	MOV	24(X5), X17	// x[3]
   114  	MOV	24(X6), X18	// y[3]
   115  
   116  	SUB	X9, X8, X21	// z[0] = x[0] - y[0]
   117  	SLTU	X21, X8, X22	// X22 = 1 if x[0] - y[0] wrapped (x[0] < difference)
   118  	SUB	X29, X21, X10	// z[0] = x[0] - y[0] - b
   119  	SLTU	X10, X21, X23	// X23 = 1 if subtracting b wrapped
   120  	ADD	X22, X23, X29	// next b (at most one of the two borrows is set)
   121  
   122  	SUB	X12, X11, X24	// z[1] = x[1] - y[1]
   123  	SLTU	X24, X11, X25	// borrow out of x[1] - y[1]
   124  	SUB	X29, X24, X13	// z[1] = x[1] - y[1] - b
   125  	SLTU	X13, X24, X26	// borrow out of subtracting b
   126  	ADD	X25, X26, X29	// next b
   127  
   128  	SUB	X15, X14, X21	// z[2] = x[2] - y[2]
   129  	SLTU	X21, X14, X22	// borrow out of x[2] - y[2]
   130  	SUB	X29, X21, X16	// z[2] = x[2] - y[2] - b
   131  	SLTU	X16, X21, X23	// borrow out of subtracting b
   132  	ADD	X22, X23, X29	// next b
   133  
   134  	SUB	X18, X17, X21	// z[3] = x[3] - y[3]
   135  	SLTU	X21, X17, X22	// borrow out of x[3] - y[3]
   136  	SUB	X29, X21, X19	// z[3] = x[3] - y[3] - b
   137  	SLTU	X19, X21, X23	// borrow out of subtracting b
   138  	ADD	X22, X23, X29	// next b
   139  
   140  	MOV	X10, 0(X7)	// z[0]
   141  	MOV	X13, 8(X7)	// z[1]
   142  	MOV	X16, 16(X7)	// z[2]
   143  	MOV	X19, 24(X7)	// z[3]
   144  
   145  	ADD	$32, X5	// advance x by 4 words
   146  	ADD	$32, X6	// advance y by 4 words
   147  	ADD	$32, X7	// advance z by 4 words
   148  	SUB	$4, X30	// 4 fewer words remain
   149  
   150  	BGEU	X30, X28, loop4	// at least 4 words remain: run another unrolled iteration
   151  	BEQZ	X30, done	// length was a multiple of 4: no tail to process
   152  
   153  loop1:	// one word per iteration; handles the remaining 1-3 words
   154  	MOV	0(X5), X10	// x
   155  	MOV	0(X6), X11	// y
   156  
   157  	SUB	X11, X10, X12	// z = x - y
   158  	SLTU	X12, X10, X14	// borrow out of x - y
   159  	SUB	X29, X12, X13	// z = x - y - b
   160  	SLTU	X13, X12, X15	// borrow out of subtracting b
   161  	ADD	X14, X15, X29	// next b
   162  
   163  	MOV	X13, 0(X7)	// z
   164  
   165  	ADD	$8, X5	// advance x by 1 word
   166  	ADD	$8, X6	// advance y by 1 word
   167  	ADD	$8, X7	// advance z by 1 word
   168  	SUB	$1, X30	// 1 fewer word remains
   169  
   170  	BNEZ	X30, loop1	// repeat until no words remain
   171  
   172  done:
   173  	MOV	X29, c+72(FP)	// return b
   174  	RET
   175  
   176  TEXT ·addVW(SB),NOSPLIT,$0	// z = x + y for a z_len-word vector x and a single word y; the final carry is returned in c
   177  	MOV	x+24(FP), X5	// X5 = pointer to the current x word
   178  	MOV	y+48(FP), X6	// X6 = y, the single-word addend
   179  	MOV	z+0(FP), X7	// X7 = pointer to the current z word
   180  	MOV	z_len+8(FP), X30	// X30 = number of words left to process
   181  
   182  	MOV	$4, X28	// X28 = 4, the number of words handled per loop4 iteration
   183  	MOV	X6, X29		// c = y (y is fed in as the carry into word 0)
   184  
   185  	BEQZ	X30, done	// nothing to do: return c = y
   186  	BLTU	X30, X28, loop1	// fewer than 4 words: use the one-word loop only
   187  
   188  loop4:	// 4 words per iteration; X29 carries c from one word to the next
   189  	MOV	0(X5), X8	// x[0]
   190  	MOV	8(X5), X11	// x[1]
   191  	MOV	16(X5), X14	// x[2]
   192  	MOV	24(X5), X17	// x[3]
   193  
   194  	ADD	X8, X29, X10	// z[0] = x[0] + c
   195  	SLTU	X8, X10, X29	// next c: 1 if the addition wrapped (z[0] < x[0])
   196  
   197  	ADD	X11, X29, X13	// z[1] = x[1] + c
   198  	SLTU	X11, X13, X29	// next c
   199  
   200  	ADD	X14, X29, X16	// z[2] = x[2] + c
   201  	SLTU	X14, X16, X29	// next c
   202  
   203  	ADD	X17, X29, X19	// z[3] = x[3] + c
   204  	SLTU	X17, X19, X29	// next c
   205  
   206  	MOV	X10, 0(X7)	// z[0]
   207  	MOV	X13, 8(X7)	// z[1]
   208  	MOV	X16, 16(X7)	// z[2]
   209  	MOV	X19, 24(X7)	// z[3]
   210  
   211  	ADD	$32, X5	// advance x by 4 words
   212  	ADD	$32, X7	// advance z by 4 words
   213  	SUB	$4, X30	// 4 fewer words remain
   214  
   215  	BGEU	X30, X28, loop4	// at least 4 words remain: run another unrolled iteration
   216  	BEQZ	X30, done	// length was a multiple of 4: no tail to process
   217  
   218  loop1:	// one word per iteration; handles the remaining 1-3 words
   219  	MOV	0(X5), X10	// x
   220  
   221  	ADD	X10, X29, X12	// z = x + c
   222  	SLTU	X10, X12, X29	// next c: 1 if the addition wrapped (z < x)
   223  
   224  	MOV	X12, 0(X7)	// z
   225  
   226  	ADD	$8, X5	// advance x by 1 word
   227  	ADD	$8, X7	// advance z by 1 word
   228  	SUB	$1, X30	// 1 fewer word remains
   229  
   230  	BNEZ	X30, loop1	// repeat until no words remain
   231  
   232  done:
   233  	MOV	X29, c+56(FP)	// return c
   234  	RET
   235  
   236  TEXT ·subVW(SB),NOSPLIT,$0	// z = x - y for a z_len-word vector x and a single word y; the final borrow is returned in c
   237  	MOV	x+24(FP), X5	// X5 = pointer to the current x word
   238  	MOV	y+48(FP), X6	// X6 = y, the single-word subtrahend
   239  	MOV	z+0(FP), X7	// X7 = pointer to the current z word
   240  	MOV	z_len+8(FP), X30	// X30 = number of words left to process
   241  
   242  	MOV	$4, X28	// X28 = 4, the number of words handled per loop4 iteration
   243  	MOV	X6, X29		// b = y (y is fed in as the borrow into word 0)
   244  
   245  	BEQZ	X30, done	// nothing to do: return b = y
   246  	BLTU	X30, X28, loop1	// fewer than 4 words: use the one-word loop only
   247  
   248  loop4:	// 4 words per iteration; X29 carries b from one word to the next
   249  	MOV	0(X5), X8	// x[0]
   250  	MOV	8(X5), X11	// x[1]
   251  	MOV	16(X5), X14	// x[2]
   252  	MOV	24(X5), X17	// x[3]
   253  
   254  	SUB	X29, X8, X10	// z[0] = x[0] - b
   255  	SLTU	X10, X8, X29	// next b: 1 if the subtraction wrapped (x[0] < z[0])
   256  
   257  	SUB	X29, X11, X13	// z[1] = x[1] - b
   258  	SLTU	X13, X11, X29	// next b
   259  
   260  	SUB	X29, X14, X16	// z[2] = x[2] - b
   261  	SLTU	X16, X14, X29	// next b
   262  
   263  	SUB	X29, X17, X19	// z[3] = x[3] - b
   264  	SLTU	X19, X17, X29	// next b
   265  
   266  	MOV	X10, 0(X7)	// z[0]
   267  	MOV	X13, 8(X7)	// z[1]
   268  	MOV	X16, 16(X7)	// z[2]
   269  	MOV	X19, 24(X7)	// z[3]
   270  
   271  	ADD	$32, X5	// advance x by 4 words
   272  	ADD	$32, X7	// advance z by 4 words
   273  	SUB	$4, X30	// 4 fewer words remain
   274  
   275  	BGEU	X30, X28, loop4	// at least 4 words remain: run another unrolled iteration
   276  	BEQZ	X30, done	// length was a multiple of 4: no tail to process
   277  
   278  loop1:	// one word per iteration; handles the remaining 1-3 words
   279  	MOV	0(X5), X10	// x
   280  
   281  	SUB	X29, X10, X12	// z = x - b
   282  	SLTU	X12, X10, X29	// next b: 1 if the subtraction wrapped (x < z)
   283  
   284  	MOV	X12, 0(X7)	// z
   285  
   286  	ADD	$8, X5	// advance x by 1 word
   287  	ADD	$8, X7	// advance z by 1 word
   288  	SUB	$1, X30	// 1 fewer word remains
   289  
   290  	BNEZ	X30, loop1	// repeat until no words remain
   291  
   292  done:
   293  	MOV	X29, c+56(FP)	// return b
   294  	RET
   295  
   296  TEXT ·shlVU(SB),NOSPLIT,$0	// no riscv64-specific code: this symbol only forwards to shlVU_g
   297  	JMP ·shlVU_g(SB)	// tail jump; shlVU_g is defined outside this file (presumably the generic Go version in arith.go - confirm)
   298  
   299  TEXT ·shrVU(SB),NOSPLIT,$0	// no riscv64-specific code: this symbol only forwards to shrVU_g
   300  	JMP ·shrVU_g(SB)	// tail jump; shrVU_g is defined outside this file (presumably the generic Go version in arith.go - confirm)
   301  
   302  TEXT ·mulAddVWW(SB),NOSPLIT,$0	// z = x * y + r for a z_len-word vector x and single words y, r; the final carry word is returned in c
   303  	MOV	x+24(FP), X5	// X5 = pointer to the current x word
   304  	MOV	y+48(FP), X6	// X6 = y, the single-word multiplier
   305  	MOV	z+0(FP), X7	// X7 = pointer to the current z word
   306  	MOV	z_len+8(FP), X30	// X30 = number of words left to process
   307  	MOV	r+56(FP), X29	// c = r (r is fed in as the carry into word 0)
   308  
   309  	MOV	$4, X28	// X28 = 4, the number of words handled per loop4 iteration
   310  
   311  	BEQZ	X30, done	// nothing to do: return c = r
   312  	BLTU	X30, X28, loop1	// fewer than 4 words: use the one-word loop only
   313  
   314  loop4:	// 4 words per iteration; X29 carries the full-word c from one word to the next
   315  	MOV	0(X5), X8	// x[0]
   316  	MOV	8(X5), X11	// x[1]
   317  	MOV	16(X5), X14	// x[2]
   318  	MOV	24(X5), X17	// x[3]
   319  
   320  	MULHU	X8, X6, X9	// z_hi[0] = x[0] * y
   321  	MUL	X8, X6, X8	// z_lo[0] = x[0] * y
   322  	ADD	X8, X29, X10	// z[0] = z_lo[0] + c
   323  	SLTU	X8, X10, X23	// X23 = 1 if z_lo[0] + c wrapped (sum < z_lo[0])
   324  	ADD	X23, X9, X29	// next c = z_hi[0] + that carry (x*y + c always fits in two words)
   325  
   326  	MULHU	X11, X6, X12	// z_hi[1] = x[1] * y
   327  	MUL	X11, X6, X11	// z_lo[1] = x[1] * y
   328  	ADD	X11, X29, X13	// z[1] = z_lo[1] + c
   329  	SLTU	X11, X13, X23	// carry out of z_lo[1] + c
   330  	ADD	X23, X12, X29	// next c
   331  
   332  	MULHU	X14, X6, X15	// z_hi[2] = x[2] * y
   333  	MUL	X14, X6, X14	// z_lo[2] = x[2] * y
   334  	ADD	X14, X29, X16	// z[2] = z_lo[2] + c
   335  	SLTU	X14, X16, X23	// carry out of z_lo[2] + c
   336  	ADD	X23, X15, X29	// next c
   337  
   338  	MULHU	X17, X6, X18	// z_hi[3] = x[3] * y
   339  	MUL	X17, X6, X17	// z_lo[3] = x[3] * y
   340  	ADD	X17, X29, X19	// z[3] = z_lo[3] + c
   341  	SLTU	X17, X19, X23	// carry out of z_lo[3] + c
   342  	ADD	X23, X18, X29	// next c
   343  
   344  	MOV	X10, 0(X7)	// z[0]
   345  	MOV	X13, 8(X7)	// z[1]
   346  	MOV	X16, 16(X7)	// z[2]
   347  	MOV	X19, 24(X7)	// z[3]
   348  
   349  	ADD	$32, X5	// advance x by 4 words
   350  	ADD	$32, X7	// advance z by 4 words
   351  	SUB	$4, X30	// 4 fewer words remain
   352  
   353  	BGEU	X30, X28, loop4	// at least 4 words remain: run another unrolled iteration
   354  	BEQZ	X30, done	// length was a multiple of 4: no tail to process
   355  
   356  loop1:	// one word per iteration; handles the remaining 1-3 words
   357  	MOV	0(X5), X10	// x
   358  
   359  	MULHU	X10, X6, X12	// z_hi = x * y
   360  	MUL	X10, X6, X10	// z_lo = x * y
   361  	ADD	X10, X29, X13	// z_lo + c
   362  	SLTU	X10, X13, X15	// carry out of z_lo + c
   363  	ADD	X12, X15, X29	// next c
   364  
   365  	MOV	X13, 0(X7)	// z
   366  
   367  	ADD	$8, X5	// advance x by 1 word
   368  	ADD	$8, X7	// advance z by 1 word
   369  	SUB	$1, X30	// 1 fewer word remains
   370  
   371  	BNEZ	X30, loop1	// repeat until no words remain
   372  
   373  done:
   374  	MOV	X29, c+64(FP)	// return c
   375  	RET
   376  
   377  TEXT ·addMulVVW(SB),NOSPLIT,$0	// z += x * y for z_len-word vectors z, x and a single word y; the final carry word is returned in c
   378  	MOV	x+24(FP), X5	// X5 = pointer to the current x word
   379  	MOV	y+48(FP), X6	// X6 = y, the single-word multiplier
   380  	MOV	z+0(FP), X7	// X7 = pointer to the current z word (read and written)
   381  	MOV	z_len+8(FP), X30	// X30 = number of words left to process
   382  
   383  	MOV	$4, X28	// X28 = 4, the number of words handled per loop4 iteration
   384  	MOV	$0, X29		// c = 0
   385  
   386  	BEQZ	X30, done	// nothing to do: return c = 0
   387  	BLTU	X30, X28, loop1	// fewer than 4 words: use the one-word loop only
   388  
   389  loop4:	// 4 words per iteration; X29 carries the full-word c from one word to the next
   390  	MOV	0(X5), X8	// x[0]
   391  	MOV	0(X7), X10	// z[0]
   392  	MOV	8(X5), X11	// x[1]
   393  	MOV	8(X7), X13	// z[1]
   394  	MOV	16(X5), X14	// x[2]
   395  	MOV	16(X7), X16	// z[2]
   396  	MOV	24(X5), X17	// x[3]
   397  	MOV	24(X7), X19	// z[3]
   398  
   399  	MULHU	X8, X6, X9	// z_hi[0] = x[0] * y
   400  	MUL	X8, X6, X8	// z_lo[0] = x[0] * y
   401  	ADD	X8, X10, X21	// z_lo[0] = x[0] * y + z[0]
   402  	SLTU	X8, X21, X22	// X22 = 1 if adding z[0] wrapped the low word
   403  	ADD	X9, X22, X9	// z_hi[0] = x[0] * y + z[0]
   404  	ADD	X21, X29, X10	// z[0] = x[0] * y + z[0] + c
   405  	SLTU	X21, X10, X22	// X22 = 1 if adding c wrapped the low word
   406  	ADD	X9, X22, X29	// next c
   407  
   408  	MULHU	X11, X6, X12	// z_hi[1] = x[1] * y
   409  	MUL	X11, X6, X11	// z_lo[1] = x[1] * y
   410  	ADD	X11, X13, X21	// z_lo[1] = x[1] * y + z[1]
   411  	SLTU	X11, X21, X22	// carry out of adding z[1]
   412  	ADD	X12, X22, X12	// z_hi[1] = x[1] * y + z[1]
   413  	ADD	X21, X29, X13	// z[1] = x[1] * y + z[1] + c
   414  	SLTU	X21, X13, X22	// carry out of adding c
   415  	ADD	X12, X22, X29	// next c
   416  
   417  	MULHU	X14, X6, X15	// z_hi[2] = x[2] * y
   418  	MUL	X14, X6, X14	// z_lo[2] = x[2] * y
   419  	ADD	X14, X16, X21	// z_lo[2] = x[2] * y + z[2]
   420  	SLTU	X14, X21, X22	// carry out of adding z[2]
   421  	ADD	X15, X22, X15	// z_hi[2] = x[2] * y + z[2]
   422  	ADD	X21, X29, X16	// z[2] = x[2] * y + z[2] + c
   423  	SLTU	X21, X16, X22	// carry out of adding c
   424  	ADD	X15, X22, X29	// next c
   425  
   426  	MULHU	X17, X6, X18	// z_hi[3] = x[3] * y
   427  	MUL	X17, X6, X17	// z_lo[3] = x[3] * y
   428  	ADD	X17, X19, X21	// z_lo[3] = x[3] * y + z[3]
   429  	SLTU	X17, X21, X22	// carry out of adding z[3]
   430  	ADD	X18, X22, X18	// z_hi[3] = x[3] * y + z[3]
   431  	ADD	X21, X29, X19	// z[3] = x[3] * y + z[3] + c
   432  	SLTU	X21, X19, X22	// carry out of adding c
   433  	ADD	X18, X22, X29	// next c
   434  
   435  	MOV	X10, 0(X7)	// z[0]
   436  	MOV	X13, 8(X7)	// z[1]
   437  	MOV	X16, 16(X7)	// z[2]
   438  	MOV	X19, 24(X7)	// z[3]
   439  
   440  	ADD	$32, X5	// advance x by 4 words
   441  	ADD	$32, X7	// advance z by 4 words
   442  	SUB	$4, X30	// 4 fewer words remain
   443  
   444  	BGEU	X30, X28, loop4	// at least 4 words remain: run another unrolled iteration
   445  	BEQZ	X30, done	// length was a multiple of 4: no tail to process
   446  
   447  loop1:	// one word per iteration; handles the remaining 1-3 words
   448  	MOV	0(X5), X10	// x
   449  	MOV	0(X7), X11	// z
   450  
   451  	MULHU	X10, X6, X12	// z_hi = x * y
   452  	MUL	X10, X6, X10	// z_lo = x * y
   453  	ADD	X10, X11, X13	// z_lo = x * y + z
   454  	SLTU	X10, X13, X15	// carry out of adding z
   455  	ADD	X12, X15, X12	// z_hi = x * y + z
   456  	ADD	X13, X29, X10	// z = x * y + z + c
   457  	SLTU	X13, X10, X15	// carry out of adding c
   458  	ADD	X12, X15, X29	// next c
   459  
   460  	MOV	X10, 0(X7)	// z
   461  
   462  	ADD	$8, X5	// advance x by 1 word
   463  	ADD	$8, X7	// advance z by 1 word
   464  	SUB	$1, X30	// 1 fewer word remains
   465  
   466  	BNEZ	X30, loop1	// repeat until no words remain
   467  
   468  done:
   469  	MOV	X29, c+56(FP)	// return c
   470  	RET
   471  

View as plain text