Text file src/math/big/arith_ppc64x.s

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !math_big_pure_go && (ppc64 || ppc64le)
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // func addVV(z, x, y []Word) (c Word)
    13  // z[i] = x[i] + y[i] for all i, carrying
        // c is the carry out of the last addition (0 when len(z) == 0).
        // Only len(z) is read. x and y are indexed up to len(z)-1.
    14  TEXT ·addVV(SB), NOSPLIT, $0
    15  	MOVD  z_len+8(FP), R7   // R7 = z_len
    16  	MOVD  x+24(FP), R8      // R8 = x[]
    17  	MOVD  y+48(FP), R9      // R9 = y[]
    18  	MOVD  z+0(FP), R10      // R10 = z[]
    19  
    20  	// If z_len = 0, we are done
    21  	CMP   R7, $0
    22  	MOVD  R0, R4            // R4 = c = 0
    23  	BEQ   done
    24  
    25  	// Process the first iteration out of the loop so we can
    26  	// use MOVDU and avoid 3 index registers updates.
    27  	MOVD  0(R8), R11      // R11 = x[i]
    28  	MOVD  0(R9), R12      // R12 = y[i]
    29  	ADD   $-1, R7         // R7 = z_len - 1
    30  	ADDC  R12, R11, R15   // R15 = x[i] + y[i], set CA
    31  	CMP   R7, $0
    32  	MOVD  R15, 0(R10)     // z[i]
    33  	BEQ   final          // If z_len was 1, we are done
    34  
    35  	SRD   $2, R7, R5      // R5 = (z_len-1)/4, element 0 is already done
    36  	CMP   R5, $0
    37  	MOVD  R5, CTR         // Set up loop counter
    38  	BEQ   tail            // If R5 = 0, we can't use the loop
    39  
    40  	// Process 4 elements per iteration. Unrolling this loop
    41  	// means a performance trade-off: we will lose performance
    42  	// for small values of z_len (0.90x in the worst case), but
    43  	// gain significant performance as z_len increases (up to
    44  	// 1.45x).
    45  
    46  	PCALIGN $16
    47  loop:
    48  	MOVD  8(R8), R11      // R11 = x[i]
    49  	MOVD  16(R8), R12     // R12 = x[i+1]
    50  	MOVD  24(R8), R14     // R14 = x[i+2]
    51  	MOVDU 32(R8), R15     // R15 = x[i+3], R8 advances by 32
    52  	MOVD  8(R9), R16      // R16 = y[i]
    53  	MOVD  16(R9), R17     // R17 = y[i+1]
    54  	MOVD  24(R9), R18     // R18 = y[i+2]
    55  	MOVDU 32(R9), R19     // R19 = y[i+3], R9 advances by 32
    56  	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    57  	ADDE  R12, R17, R21   // R21 = x[i+1] + y[i+1] + CA
    58  	ADDE  R14, R18, R22   // R22 = x[i+2] + y[i+2] + CA
    59  	ADDE  R15, R19, R23   // R23 = x[i+3] + y[i+3] + CA
    60  	MOVD  R20, 8(R10)     // z[i]
    61  	MOVD  R21, 16(R10)    // z[i+1]
    62  	MOVD  R22, 24(R10)    // z[i+2]
    63  	MOVDU R23, 32(R10)    // z[i+3], R10 advances by 32
    64  	ADD   $-4, R7         // R7 -= 4 (elements still to process)
    65  	BDNZ  loop
    66  
    67  	// We may have more elements to read
    68  	CMP   R7, $0
    69  	BEQ   final
    70  
    71  	// Process the remaining elements, one at a time
        	// (between 1 and 3 elements are left when we get here)
    72  tail:
    73  	MOVDU 8(R8), R11      // R11 = x[i]
    74  	MOVDU 8(R9), R16      // R16 = y[i]
    75  	ADD   $-1, R7         // R7 = z_len - 1
    76  	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    77  	CMP   R7, $0
    78  	MOVDU R20, 8(R10)     // z[i]
    79  	BEQ   final           // If R7 = 0, we are done
    80  
    81  	MOVDU 8(R8), R11      // second leftover element
    82  	MOVDU 8(R9), R16
    83  	ADD   $-1, R7
    84  	ADDE  R11, R16, R20
    85  	CMP   R7, $0
    86  	MOVDU R20, 8(R10)
    87  	BEQ   final
    88  
    89  	MOVD  8(R8), R11      // third and last leftover element, no count check needed
    90  	MOVD  8(R9), R16
    91  	ADDE  R11, R16, R20
    92  	MOVD  R20, 8(R10)
    93  
    94  final:
    95  	ADDZE R4              // Capture CA: R4 = 0 + CA
    96  
    97  done:
    98  	MOVD  R4, c+72(FP)
    99  	RET
   100  
   101  // func subVV(z, x, y []Word) (c Word)
   102  // z[i] = x[i] - y[i] for all i, carrying
        // c is the borrow out of the last subtraction (0 when len(z) == 0).
        // The code keeps the running state in CA and returns c = CA ^ 1.
   103  TEXT ·subVV(SB), NOSPLIT, $0
   104  	MOVD  z_len+8(FP), R7 // R7 = z_len
   105  	MOVD  x+24(FP), R8    // R8 = x[]
   106  	MOVD  y+48(FP), R9    // R9 = y[]
   107  	MOVD  z+0(FP), R10    // R10 = z[]
   108  
   109  	// If z_len = 0, we are done
   110  	CMP   R7, $0
   111  	MOVD  R0, R4          // R4 = c = 0
   112  	BEQ   done
   113  
   114  	// Process the first iteration out of the loop so we can
   115  	// use MOVDU and avoid 3 index registers updates.
   116  	MOVD  0(R8), R11      // R11 = x[i]
   117  	MOVD  0(R9), R12      // R12 = y[i]
   118  	ADD   $-1, R7         // R7 = z_len - 1
   119  	SUBC  R12, R11, R15   // R15 = x[i] - y[i], set CA
   120  	CMP   R7, $0
   121  	MOVD  R15, 0(R10)     // z[i]
   122  	BEQ   final           // If z_len was 1, we are done
   123  
   124  	SRD   $2, R7, R5      // R5 = (z_len-1)/4, element 0 is already done
   125  	CMP   R5, $0
   126  	MOVD  R5, CTR         // Set up loop counter
   127  	BEQ   tail            // If R5 = 0, we can't use the loop
   128  
   129  	// Process 4 elements per iteration. Unrolling this loop
   130  	// means a performance trade-off: we will lose performance
   131  	// for small values of z_len (0.92x in the worst case), but
   132  	// gain significant performance as z_len increases (up to
   133  	// 1.45x).
   134  
   135  	PCALIGN $16
   136  loop:
   137  	MOVD  8(R8), R11      // R11 = x[i]
   138  	MOVD  16(R8), R12     // R12 = x[i+1]
   139  	MOVD  24(R8), R14     // R14 = x[i+2]
   140  	MOVDU 32(R8), R15     // R15 = x[i+3], R8 advances by 32
   141  	MOVD  8(R9), R16      // R16 = y[i]
   142  	MOVD  16(R9), R17     // R17 = y[i+1]
   143  	MOVD  24(R9), R18     // R18 = y[i+2]
   144  	MOVDU 32(R9), R19     // R19 = y[i+3], R9 advances by 32
   145  	SUBE  R16, R11, R20   // R20 = x[i] - y[i] + CA
   146  	SUBE  R17, R12, R21   // R21 = x[i+1] - y[i+1] + CA
   147  	SUBE  R18, R14, R22   // R22 = x[i+2] - y[i+2] + CA
   148  	SUBE  R19, R15, R23   // R23 = x[i+3] - y[i+3] + CA
   149  	MOVD  R20, 8(R10)     // z[i]
   150  	MOVD  R21, 16(R10)    // z[i+1]
   151  	MOVD  R22, 24(R10)    // z[i+2]
   152  	MOVDU R23, 32(R10)    // z[i+3], R10 advances by 32
   153  	ADD   $-4, R7         // R7 -= 4 (elements still to process)
   154  	BDNZ  loop
   155  
   156  	// We may have more elements to read
   157  	CMP   R7, $0
   158  	BEQ   final
   159  
   160  	// Process the remaining elements, one at a time
        	// (between 1 and 3 elements are left when we get here)
   161  tail:
   162  	MOVDU 8(R8), R11      // R11 = x[i]
   163  	MOVDU 8(R9), R16      // R16 = y[i]
   164  	ADD   $-1, R7         // R7 = z_len - 1
   165  	SUBE  R16, R11, R20   // R20 = x[i] - y[i] + CA
   166  	CMP   R7, $0
   167  	MOVDU R20, 8(R10)     // z[i]
   168  	BEQ   final           // If R7 = 0, we are done
   169  
   170  	MOVDU 8(R8), R11      // second leftover element
   171  	MOVDU 8(R9), R16
   172  	ADD   $-1, R7
   173  	SUBE  R16, R11, R20
   174  	CMP   R7, $0
   175  	MOVDU R20, 8(R10)
   176  	BEQ   final
   177  
   178  	MOVD  8(R8), R11      // third and last leftover element, no count check needed
   179  	MOVD  8(R9), R16
   180  	SUBE  R16, R11, R20
   181  	MOVD  R20, 8(R10)
   182  
   183  final:
   184  	ADDZE R4              // R4 = 0 + CA
   185  	XOR   $1, R4          // c = CA ^ 1, the returned borrow is the inverse of CA
   186  
   187  done:
   188  	MOVD  R4, c+72(FP)
   189  	RET
   190  
   191  // func addVW(z, x []Word, y Word) (c Word)
        // z = x + y, where y is a single word added into x[0] and the carry is
        // propagated through the remaining words. c is the final carry.
        // When len(z) == 0 nothing is written and c = y is returned.
   192  TEXT ·addVW(SB), NOSPLIT, $0
   193  	MOVD z+0(FP), R10	// R10 = z[]
   194  	MOVD x+24(FP), R8	// R8 = x[]
   195  	MOVD y+48(FP), R4	// R4 = y = c
   196  	MOVD z_len+8(FP), R11	// R11 = z_len
   197  
   198  	CMP   R11, $0		// If z_len is zero, return
   199  	BEQ   done
   200  
   201  	// We will process the first iteration out of the loop so we capture
   202  	// the value of c. In the subsequent iterations, we will rely on the
   203  	// value of CA set here.
   204  	MOVD  0(R8), R20	// R20 = x[i]
   205  	ADD   $-1, R11		// R11 = z_len - 1
   206  	ADDC  R20, R4, R6	// R6 = x[i] + c
   207  	CMP   R11, $0		// If z_len was 1, we are done
   208  	MOVD  R6, 0(R10)	// z[i]
   209  	BEQ   final
   210  
   211  	// We will read 4 elements per iteration
   212  	SRDCC $2, R11, R9	// R9 = (z_len-1)/4, also sets CR0 for the BEQ below
   213  	DCBT  (R8)		// data cache block touch on x
   214  	MOVD  R9, CTR		// Set up the loop counter
   215  	BEQ   tail		// If R9 = 0, we can't use the loop
   216  	PCALIGN $16
   217  
   218  loop:
   219  	MOVD  8(R8), R20	// R20 = x[i]
   220  	MOVD  16(R8), R21	// R21 = x[i+1]
   221  	MOVD  24(R8), R22	// R22 = x[i+2]
   222  	MOVDU 32(R8), R23	// R23 = x[i+3]
   223  	ADDZE R20, R24		// R24 = x[i] + CA
   224  	ADDZE R21, R25		// R25 = x[i+1] + CA
   225  	ADDZE R22, R26		// R26 = x[i+2] + CA
   226  	ADDZE R23, R27		// R27 = x[i+3] + CA
   227  	MOVD  R24, 8(R10)	// z[i]
   228  	MOVD  R25, 16(R10)	// z[i+1]
   229  	MOVD  R26, 24(R10)	// z[i+2]
   230  	MOVDU R27, 32(R10)	// z[i+3]
   231  	ADD   $-4, R11		// R11 = z_len - 4
   232  	BDNZ  loop
   233  
   234  	// We may have some elements to read
   235  	CMP R11, $0
   236  	BEQ final
   237  
        	// Between 1 and 3 elements are left when we get here.
   238  tail:
   239  	MOVDU 8(R8), R20	// R20 = x[i]
   240  	ADDZE R20, R24		// R24 = x[i] + CA
   241  	ADD $-1, R11
   242  	MOVDU R24, 8(R10)	// z[i]
   243  	CMP R11, $0
   244  	BEQ final
   245  
   246  	MOVDU 8(R8), R20	// second leftover element
   247  	ADDZE R20, R24
   248  	ADD $-1, R11
   249  	MOVDU R24, 8(R10)
   250  	CMP R11, $0
   251  	BEQ final
   252  
   253  	MOVD 8(R8), R20		// third and last leftover element
   254  	ADDZE R20, R24
   255  	MOVD R24, 8(R10)
   256  
   257  final:
   258  	ADDZE R0, R4		// c = CA
   259  done:
   260  	MOVD  R4, c+56(FP)
   261  	RET
   262  
   263  // func subVW(z, x []Word, y Word) (c Word)
        // z = x - y, where y is a single word subtracted from x[0] and the borrow
        // is propagated through the remaining words. c is the final borrow.
        // When len(z) == 0 nothing is written and c = y is returned.
   264  TEXT ·subVW(SB), NOSPLIT, $0
   265  	MOVD  z+0(FP), R10	// R10 = z[]
   266  	MOVD  x+24(FP), R8	// R8 = x[]
   267  	MOVD  y+48(FP), R4	// R4 = y = c
   268  	MOVD  z_len+8(FP), R11	// R11 = z_len
   269  
   270  	CMP   R11, $0		// If z_len is zero, return
   271  	BEQ   done
   272  
   273  	// We will process the first iteration out of the loop so we capture
   274  	// the value of c. In the subsequent iterations, we will rely on the
   275  	// value of CA set here.
   276  	MOVD  0(R8), R20	// R20 = x[i]
   277  	ADD   $-1, R11		// R11 = z_len - 1
   278  	SUBC  R4, R20, R6	// R6 = x[i] - c
   279  	CMP   R11, $0		// If z_len was 1, we are done
   280  	MOVD  R6, 0(R10)	// z[i]
   281  	BEQ   final
   282  
   283  	// We will read 4 elements per iteration
   284  	SRDCC $2, R11, R9	// R9 = (z_len-1)/4, also sets CR0 for the BEQ below
   285  	DCBT  (R8)		// data cache block touch on x
   286  	MOVD  R9, CTR		// Set up the loop counter
   287  	BEQ   tail		// If R9 = 0, we can't use the loop
   288  
   289  	// The loop here is almost the same as the one used in s390x, but
   290  	// we don't need to capture CA every iteration because we've already
   291  	// done that above.
   292  
   293  	PCALIGN $16
   294  loop:
   295  	MOVD  8(R8), R20	// R20 = x[i]
   296  	MOVD  16(R8), R21	// R21 = x[i+1]
   297  	MOVD  24(R8), R22	// R22 = x[i+2]
   298  	MOVDU 32(R8), R23	// R23 = x[i+3], R8 advances by 32
   299  	SUBE  R0, R20		// R20 = x[i] - 0 - 1 + CA, i.e. x[i] minus the incoming borrow
   300  	SUBE  R0, R21
   301  	SUBE  R0, R22
   302  	SUBE  R0, R23
   303  	MOVD  R20, 8(R10)	// z[i]
   304  	MOVD  R21, 16(R10)	// z[i+1]
   305  	MOVD  R22, 24(R10)	// z[i+2]
   306  	MOVDU R23, 32(R10)	// z[i+3], R10 advances by 32
   307  	ADD   $-4, R11		// R11 -= 4 (elements still to process)
   308  	BDNZ  loop
   309  
   310  	// We may have some elements to read
   311  	CMP   R11, $0
   312  	BEQ   final
   313  
        	// Between 1 and 3 elements are left when we get here.
   314  tail:
   315  	MOVDU 8(R8), R20	// R20 = x[i]
   316  	SUBE  R0, R20		// subtract the incoming borrow
   317  	ADD   $-1, R11
   318  	MOVDU R20, 8(R10)	// z[i]
   319  	CMP   R11, $0
   320  	BEQ   final
   321  
   322  	MOVDU 8(R8), R20	// second leftover element
   323  	SUBE  R0, R20
   324  	ADD   $-1, R11
   325  	MOVDU R20, 8(R10)
   326  	CMP   R11, $0
   327  	BEQ   final
   328  
   329  	MOVD  8(R8), R20	// third and last leftover element
   330  	SUBE  R0, R20
   331  	MOVD  R20, 8(R10)
   332  
   333  final:
   334  	// Capture CA
   335  	SUBE  R4, R4		// R4 = R4 - R4 - 1 + CA = CA - 1, which is 0 or -1
   336  	NEG   R4, R4		// c = 1 - CA
   337  
   338  done:
   339  	MOVD  R4, c+56(FP)
   340  	RET
   341  
   342  //func shlVU(z, x []Word, s uint) (c Word)
        // Shifts x left by s bits into z, working from the most significant word
        // down to word 0, and returns c = x[len(z)-1]>>(64-s).
        // When s == 0 it copies min(len(x), len(z)) words from x to z and returns 0.
        // When len(z) == 0 (and s != 0) it returns 0 without touching memory.
   343  TEXT ·shlVU(SB), NOSPLIT, $0
   344  	MOVD    z+0(FP), R3     // R3 = z[]
   345  	MOVD    x+24(FP), R6    // R6 = x[]
   346  	MOVD    s+48(FP), R9    // R9 = s
   347  	MOVD    z_len+8(FP), R4 // R4 = len(z)
   348  	MOVD    x_len+32(FP), R7 // R7 = len(x), only used on the zeroshift path
   349  	CMP     R9, $0          // s==0 copy(z,x)
   350  	BEQ     zeroshift
   351  	CMP     R4, $0          // len(z)==0 return
   352  	BEQ     done
   353  
   354  	ADD     $-1, R4, R5     // len(z)-1
   355  	SUBC    R9, $64, R4     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
   356  	SLD     $3, R5, R7      // R7 = byte offset of the last word, (len(z)-1)*8
   357  	ADD     R6, R7, R15     // save starting address &x[len(z)-1]
   358  	ADD     R3, R7, R16     // save starting address &z[len(z)-1]
   359  	MOVD    (R6)(R7), R14   // R14 = x[len(z)-1]
   360  	SRD     R4, R14, R7     // compute x[len(z)-1]>>ŝ into R7
   361  	CMP     R5, $0          // iterate from i=len(z)-1 to 0
   362  	BEQ     loopexit        // Already at end?
   363  	MOVD	0(R15),R10	// x[i]
   364  	PCALIGN $16
   365  shloop:
   366  	SLD     R9, R10, R10    // x[i]<<s
   367  	MOVDU   -8(R15), R14    // R14 = x[i-1], R15 steps down one word
   368  	SRD     R4, R14, R11    // x[i-1]>>ŝ
   369  	OR      R11, R10, R10
   370  	MOVD    R10, 0(R16)     // z[i]=x[i]<<s | x[i-1]>>ŝ
   371  	MOVD	R14, R10	// reuse x[i-1] for next iteration
   372  	ADD     $-8, R16        // i--
   373  	CMP     R15, R6         // &x[i-1]>&x[0]?
   374  	BGT     shloop
   375  loopexit:
   376  	MOVD    0(R6), R4       // R4 = x[0] (ŝ is no longer needed)
   377  	SLD     R9, R4, R4
   378  	MOVD    R4, 0(R3)       // z[0]=x[0]<<s
   379  	MOVD    R7, c+56(FP)    // store pre-computed x[len(z)-1]>>ŝ into c
   380  	RET
   381  
   382  zeroshift:
   383  	CMP     R6, $0          // x is null, nothing to copy
   384  	BEQ     done
   385  	CMP     R6, R3          // if x is same as z, nothing to copy
   386  	BEQ     done
   387  	CMP     R7, R4
   388  	ISEL    $0, R7, R4, R7  // Take the lower bound of lengths of x,z
   389  	SLD     $3, R7, R7      // R7 = number of bytes to copy
   390  	SUB     R6, R3, R11     // dest - src
   391  	CMPU    R11, R7, CR2    // < len?
   392  	BLT     CR2, backward   // there is overlap, copy backwards
   393  	MOVD    $0, R14         // R14 = byte offset into x and z
   394  	// shlVU processes backwards, but added a forward copy option
   395  	// since it's faster on POWER
        	// NOTE(review): the loop body runs before its length test, so one word is
        	// copied even when R7 == 0 (an empty x or z with a non-nil x that differs
        	// from z). Presumably callers never do that with s == 0, TODO confirm.
   396  repeat:
   397  	MOVD    (R6)(R14), R15  // Copy 8 bytes at a time
   398  	MOVD    R15, (R3)(R14)
   399  	ADD     $8, R14
   400  	CMP     R14, R7         // More 8 bytes left?
   401  	BLT     repeat
   402  	BR      done
   403  backward:
   404  	ADD     $-8,R7, R14     // R14 = byte offset of the last word to copy
   405  repeatback:
   406  	MOVD    (R6)(R14), R15  // copy x into z backwards
   407  	MOVD    R15, (R3)(R14)  // copy 8 bytes at a time
   408  	SUB     $8, R14
   409  	CMP     R14, $-8        // More 8 bytes left?
   410  	BGT     repeatback
   411  
   412  done:
   413  	MOVD    R0, c+56(FP)    // c=0
   414  	RET
   415  
   416  //func shrVU(z, x []Word, s uint) (c Word)
        // Shifts x right by s bits into z, working from word 0 upwards, and
        // returns c = x[0]<<(64-s).
        // When s == 0 it copies min(len(x), len(z)) words from x to z (forwards
        // only) and returns 0.
        // When len(z) == 0 (and s != 0) it returns 0 without touching memory.
   417  TEXT ·shrVU(SB), NOSPLIT, $0
   418  	MOVD    z+0(FP), R3     // R3 = z[]
   419  	MOVD    x+24(FP), R6    // R6 = x[]
   420  	MOVD    s+48(FP), R9    // R9 = s
   421  	MOVD    z_len+8(FP), R4 // R4 = len(z)
   422  	MOVD    x_len+32(FP), R7 // R7 = len(x), only used on the zeroshift path
   423  
   424  	CMP     R9, $0          // s==0, copy(z,x)
   425  	BEQ     zeroshift
   426  	CMP     R4, $0          // len(z)==0 return
   427  	BEQ     done
   428  	SUBC    R9, $64, R5     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
   429  
   430  	MOVD    0(R6), R7
   431  	SLD     R5, R7, R7      // compute x[0]<<ŝ
   432  	MOVD    $1, R8          // iterate from i=1 to i<len(z)
   433  	CMP     R8, R4
   434  	BGE     loopexit        // Already at end?
   435  
   436  	// vectorize if len(z) is >=3, else jump to scalar loop
        	// VS32+n names the same register as Vn, so VS38/VS39 below are V6/V7
        	// and VS32/VS33/VS37 in the loop are V0/V1/V5.
   437  	CMP     R4, $3
   438  	BLT     scalar
   439  	MTVSRD  R9, VS38        // s
   440  	VSPLTB  $7, V6, V4      // V4 = low byte of s replicated into every byte
   441  	MTVSRD  R5, VS39        // ŝ
   442  	VSPLTB  $7, V7, V2      // V2 = low byte of ŝ replicated into every byte
   443  	ADD     $-2, R4, R16    // R16 = len(z)-2, last i with a full pair left
   444  	PCALIGN $16
   445  loopback:
   446  	ADD     $-1, R8, R10
   447  	SLD     $3, R10         // R10 = byte offset of x[i-1]
   448  	LXVD2X  (R6)(R10), VS32 // load x[i-1], x[i]
   449  	SLD     $3, R8, R12     // R12 = byte offset of x[i]
   450  	LXVD2X  (R6)(R12), VS33 // load x[i], x[i+1]
   451  
   452  	VSRD    V0, V4, V3      // x[i-1]>>s, x[i]>>s
   453  	VSLD    V1, V2, V5      // x[i]<<ŝ, x[i+1]<<ŝ
   454  	VOR     V3, V5, V5      // Or(|) the two registers together
   455  	STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
   456  	ADD     $2, R8          // Done processing 2 entries, i and i+1
   457  	CMP     R8, R16         // Are there at least a couple of more entries left?
   458  	BLE     loopback
   459  	CMP     R8, R4          // Are we at the last element?
   460  	BEQ     loopexit
   461  scalar:
   462  	ADD     $-1, R8, R10
   463  	SLD     $3, R10
   464  	MOVD    (R6)(R10),R11
   465  	SRD     R9, R11, R11    // x[len(z)-2] >> s
   466  	SLD     $3, R8, R12
   467  	MOVD    (R6)(R12), R12
   468  	SLD     R5, R12, R12    // x[len(z)-1]<<ŝ
   469  	OR      R12, R11, R11   // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
   470  	MOVD    R11, (R3)(R10)  // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
   471  loopexit:
   472  	ADD     $-1, R4
   473  	SLD     $3, R4          // R4 = byte offset of the last word
   474  	MOVD    (R6)(R4), R5
   475  	SRD     R9, R5, R5      // x[len(z)-1]>>s
   476  	MOVD    R5, (R3)(R4)    // z[len(z)-1]=x[len(z)-1]>>s
   477  	MOVD    R7, c+56(FP)    // store pre-computed x[0]<<ŝ into c
   478  	RET
   479  
   480  zeroshift:
   481  	CMP     R6, $0          // x is null, nothing to copy
   482  	BEQ     done
   483  	CMP     R6, R3          // if x is same as z, nothing to copy
   484  	BEQ     done
   485  	CMP     R7, R4
   486  	ISEL    $0, R7, R4, R7  // Take the lower bounds of lengths of x, z
   487  	SLD     $3, R7, R7      // R7 = number of bytes to copy
   488  	MOVD    $0, R14         // R14 = byte offset into x and z
        	// NOTE(review): the loop body runs before its length test, so one word is
        	// copied even when R7 == 0 (an empty x or z with a non-nil x that differs
        	// from z). Presumably callers never do that with s == 0, TODO confirm.
   489  repeat:
   490  	MOVD    (R6)(R14), R15  // copy 8 bytes at a time
   491  	MOVD    R15, (R3)(R14)  // shrVU processes bytes only forwards
   492  	ADD     $8, R14
   493  	CMP     R14, R7         // More 8 bytes left?
   494  	BLT     repeat
   495  done:
   496  	MOVD    R0, c+56(FP)    // c=0
   497  	RET
   498  
   499  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        // z = x*y + r, one word of x at a time: each step stores the low word of
        // x[i]*y + c into z[i] and keeps the high word (plus carry) as the next c,
        // with c starting at r. The final c is returned.
        // When len(z) == 0 nothing is written and c = r is returned.
   500  TEXT ·mulAddVWW(SB), NOSPLIT, $0
   501  	MOVD    z+0(FP), R10      // R10 = z[]
   502  	MOVD    x+24(FP), R8      // R8 = x[]
   503  	MOVD    y+48(FP), R9      // R9 = y
   504  	MOVD    r+56(FP), R4      // R4 = r = c
   505  	MOVD    z_len+8(FP), R11  // R11 = z_len
   506  
   507  	CMP     R11, $0
   508  	BEQ     done
   509  
        	// First element is handled outside the loop so the loop and tail
        	// can use update-form loads/stores at offset 8.
   510  	MOVD    0(R8), R20
   511  	ADD     $-1, R11
   512  	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
   513  	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
   514  	ADDC    R4, R6            // R6 = z0 + r
   515  	ADDZE   R7, R4            // R4 = z1 + CA
   516  	CMP     R11, $0
   517  	MOVD    R6, 0(R10)        // z[i]
   518  	BEQ     done
   519  
   520  	// We will read 4 elements per iteration
   521  	SRDCC   $2, R11, R14      // R14 = (z_len-1)/4, also sets CR0 for the BEQ below
   522  	DCBT    (R8)
   523  	MOVD    R14, CTR          // Set up the loop counter
   524  	BEQ     tail              // If R14 = 0, we can't use the loop
   525  	PCALIGN $16
   526  
   527  loop:
   528  	MOVD    8(R8), R20        // R20 = x[i]
   529  	MOVD    16(R8), R21       // R21 = x[i+1]
   530  	MOVD    24(R8), R22       // R22 = x[i+2]
   531  	MOVDU   32(R8), R23       // R23 = x[i+3]
   532  	MULLD   R9, R20, R24      // R24 = z0[i]
   533  	MULHDU  R9, R20, R20      // R20 = z1[i]
   534  	ADDC    R4, R24           // R24 = z0[i] + c
   535  	MULLD   R9, R21, R25      // R25 = z0[i+1]
   536  	MULHDU  R9, R21, R21      // R21 = z1[i+1]
   537  	ADDE    R20, R25          // R25 = z0[i+1] + z1[i] + CA
   538  	MULLD   R9, R22, R26      // R26 = z0[i+2]
   539  	MULHDU  R9, R22, R22      // R22 = z1[i+2]
   540  	MULLD   R9, R23, R27      // R27 = z0[i+3]
   541  	MULHDU  R9, R23, R23      // R23 = z1[i+3]
   542  	ADDE    R21, R26          // R26 = z0[i+2] + z1[i+1] + CA
   543  	MOVD    R24, 8(R10)       // z[i]
   544  	MOVD    R25, 16(R10)      // z[i+1]
   545  	ADDE    R22, R27          // R27 = z0[i+3] + z1[i+2] + CA
   546  	ADDZE   R23,R4		  // update carry
   547  	MOVD    R26, 24(R10)      // z[i+2]
   548  	MOVDU   R27, 32(R10)      // z[i+3]
   549  	ADD     $-4, R11          // R11 = z_len - 4
   550  	BDNZ    loop
   551  
   552  	// We may have some elements to read
   553  	CMP   R11, $0
   554  	BEQ   done
   555  
   556  	// Process the remaining elements, one at a time
        	// (between 1 and 3 elements are left when we get here)
   557  tail:
   558  	MOVDU   8(R8), R20        // R20 = x[i]
   559  	MULLD   R9, R20, R24      // R24 = z0[i]
   560  	MULHDU  R9, R20, R25      // R25 = z1[i]
   561  	ADD     $-1, R11          // R11 = z_len - 1
   562  	ADDC    R4, R24           // R24 = z0[i] + c
   563  	ADDZE   R25, R4           // c = z1[i] + CA
   564  	MOVDU   R24, 8(R10)       // z[i]
   565  	CMP     R11, $0
   566  	BEQ     done              // If R11 = 0, we are done
   567  
   568  	MOVDU   8(R8), R20        // second leftover element
   569  	MULLD   R9, R20, R24
   570  	MULHDU  R9, R20, R25
   571  	ADD     $-1, R11
   572  	ADDC    R4, R24
   573  	ADDZE   R25, R4
   574  	MOVDU   R24, 8(R10)
   575  	CMP     R11, $0
   576  	BEQ     done
   577  
   578  	MOVD    8(R8), R20        // third and last leftover element
   579  	MULLD   R9, R20, R24
   580  	MULHDU  R9, R20, R25
   581  	ADD     $-1, R11          // R11 is not read again after this point
   582  	ADDC    R4, R24
   583  	ADDZE   R25,R4
   584  	MOVD    R24, 8(R10)
   585  
   586  done:
   587  	MOVD    R4, c+64(FP)
   588  	RET
   589  
   590  // func addMulVVW(z, x []Word, y Word) (c Word)
        // z[i] += x[i]*y for all i, carrying. Each step stores the low word of
        // x[i]*y + z[i] + c into z[i] and keeps the high word (plus carries) as the
        // next c, with c starting at 0. The final c is returned.
   591  TEXT ·addMulVVW(SB), NOSPLIT, $0
   592  	MOVD	z+0(FP), R3	// R3 = z[]
   593  	MOVD	x+24(FP), R4	// R4 = x[]
   594  	MOVD	y+48(FP), R5	// R5 = y
   595  	MOVD	z_len+8(FP), R6	// R6 = z_len
   596  
   597  	CMP	R6, $4
   598  	MOVD	R0, R9		// R9 = c = 0
   599  	BLT	tail		// fewer than 4 elements: skip the unrolled loop
   600  	SRD	$2, R6, R7	// R7 = z_len/4
   601  	MOVD	R7, CTR		// Initialize loop counter
   602  	PCALIGN	$16
   603  
   604  loop:
   605  	MOVD	0(R4), R14	// x[i]
   606  	MOVD	8(R4), R16	// x[i+1]
   607  	MOVD	16(R4), R18	// x[i+2]
   608  	MOVD	24(R4), R20	// x[i+3]
   609  	MOVD	0(R3), R15	// z[i]
   610  	MOVD	8(R3), R17	// z[i+1]
   611  	MOVD	16(R3), R19	// z[i+2]
   612  	MOVD	24(R3), R21	// z[i+3]
   613  	MULLD	R5, R14, R10	// low x[i]*y
   614  	MULHDU	R5, R14, R11	// high x[i]*y
   615  	ADDC	R15, R10	// low += z[i], set CA
   616  	ADDZE	R11		// high += CA
   617  	ADDC	R9, R10		// low += c, set CA
   618  	ADDZE	R11, R9		// c = high + CA
   619  	MULLD	R5, R16, R14	// low x[i+1]*y
   620  	MULHDU	R5, R16, R15	// high x[i+1]*y
   621  	ADDC	R17, R14	// same four-step carry sequence for i+1
   622  	ADDZE	R15
   623  	ADDC	R9, R14
   624  	ADDZE	R15, R9
   625  	MULLD	R5, R18, R16    // low x[i+2]*y
   626  	MULHDU	R5, R18, R17    // high x[i+2]*y
   627  	ADDC	R19, R16	// same four-step carry sequence for i+2
   628  	ADDZE	R17
   629  	ADDC	R9, R16
   630  	ADDZE	R17, R9
   631  	MULLD	R5, R20, R18    // low x[i+3]*y
   632  	MULHDU	R5, R20, R19    // high x[i+3]*y
   633  	ADDC	R21, R18	// same four-step carry sequence for i+3
   634  	ADDZE	R19
   635  	ADDC	R9, R18
   636  	ADDZE	R19, R9
   637  	MOVD	R10, 0(R3)	// z[i]
   638  	MOVD	R14, 8(R3)	// z[i+1]
   639  	MOVD	R16, 16(R3)	// z[i+2]
   640  	MOVD	R18, 24(R3)	// z[i+3]
   641  	ADD	$32, R3		// advance z by 4 words
   642  	ADD	$32, R4		// advance x by 4 words
   643  	BDNZ	loop
   644  
   645  	ANDCC	$3, R6		// R6 = z_len % 4, elements left for the tail
   646  tail:
   647  	CMP	R6, $0
   648  	BEQ	done
   649  	MOVD	R6, CTR		// one tailloop iteration per remaining element
   650  	PCALIGN $16
   651  tailloop:
   652  	MOVD	0(R4), R14	// x[i]
   653  	MOVD	0(R3), R15	// z[i]
   654  	MULLD	R5, R14, R10	// low x[i]*y
   655  	MULHDU	R5, R14, R11	// high x[i]*y
   656  	ADDC	R15, R10	// low += z[i], set CA
   657  	ADDZE	R11		// high += CA
   658  	ADDC	R9, R10		// low += c, set CA
   659  	ADDZE	R11, R9		// c = high + CA
   660  	MOVD	R10, 0(R3)	// z[i]
   661  	ADD	$8, R3
   662  	ADD	$8, R4
   663  	BDNZ	tailloop
   664  
   665  done:
   666  	MOVD	R9, c+56(FP)
   667  	RET
   668  
   669  

View as plain text