Text file src/math/big/arith_s390x.s

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !math_big_pure_go
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // amd64-to-s390x register mapping (amd64 = s390x): DI = R3, CX = R4, SI = R10, R8 = R8, R9 = R9, R10 = R2, R11 = R5, R12 = R6, R13 = R7, R14 = R1 (R0 is kept at zero); R11 is also used as scratch
    13  // func addVV(z, x, y []Word) (c Word)
    14  
    15  TEXT ·addVV(SB), NOSPLIT, $0
    16  	MOVD addvectorfacility+0x00(SB), R1
    17  	BR   (R1)
    18  
    19  TEXT ·addVV_check(SB), NOSPLIT, $0
    20  	MOVB   ·hasVX(SB), R1
    21  	CMPBEQ R1, $1, vectorimpl              // hasVX == 1, vector facility available
    22  	MOVD   $addvectorfacility+0x00(SB), R1
    23  	MOVD   $·addVV_novec(SB), R2
    24  	MOVD   R2, 0(R1)
    25  
    26  	// MOVD	$·addVV_novec(SB), 0(R1)
    27  	BR ·addVV_novec(SB)
    28  
    29  vectorimpl:
    30  	MOVD $addvectorfacility+0x00(SB), R1
    31  	MOVD $·addVV_vec(SB), R2
    32  	MOVD R2, 0(R1)
    33  
    34  	// MOVD	$·addVV_vec(SB), 0(R1)
    35  	BR ·addVV_vec(SB)
    36  
    37  GLOBL addvectorfacility+0x00(SB), NOPTR, $8
    38  DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)
    39  
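        // addVV dispatches through the addvectorfacility word above: the word starts
        // out pointing at addVV_check, which tests ·hasVX once, overwrites the word
        // with either addVV_vec or addVV_novec, and tail-calls the chosen routine, so
        // every later call branches straight to the right implementation. A rough
        // Go-level sketch of the same idea (hypothetical names, for illustration only):
        //
        //	var addVVImpl func(z, x, y []Word) Word = addVVCheck
        //
        //	func addVVCheck(z, x, y []Word) Word {
        //		if hasVX {
        //			addVVImpl = addVVVec
        //		} else {
        //			addVVImpl = addVVNovec
        //		}
        //		return addVVImpl(z, x, y)
        //	}
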
    40  TEXT ·addVV_vec(SB), NOSPLIT, $0
    41  	MOVD z_len+8(FP), R3
    42  	MOVD x+24(FP), R8
    43  	MOVD y+48(FP), R9
    44  	MOVD z+0(FP), R2
    45  
    46  	MOVD $0, R4  // c = 0
    47  	MOVD $0, R0  // make sure it's zero
    48  	MOVD $0, R10 // i = 0
    49  
    50  	// Change the BLT below to BR to disable the unrolled loop.
    51  	SUB $4, R3  // n -= 4
    52  	BLT v1      // if n < 0 goto v1
    53  	SUB $12, R3 // n -= 16
    54  	BLT A1      // if n < 0 goto A1
    55  
    56  	MOVD R8, R5
    57  	MOVD R9, R6
    58  	MOVD R2, R7
    59  
    60  	// n >= 0
    61  	// regular loop body unrolled 16x
    62  	VZERO V0 // c = 0
    63  
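        // In the 16x-unrolled loop below, VACQ adds two quadwords (128-bit lanes) plus
        // the incoming carry held in V0, and VACCCQ produces the carry-out that feeds
        // the next pair, so the carry chains through V25..V31 and back into V0 for the
        // next iteration. The VPDI $0x4 shuffles swap the two doublewords of each
        // vector register, flipping between the Word order in memory and the
        // big-endian quadword layout the vector add instructions operate on.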
    64  UU1:
    65  	VLM  0(R5), V1, V4    // load 64 bytes of x into V1..V4
    66  	ADD  $64, R5
    67  	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
    68  	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
    69  
    70  	VLM  0(R6), V9, V12      // load 64 bytes of y into V9..V12
    71  	ADD  $64, R6
    72  	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
    73  	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
    74  
    75  	VACCCQ V1, V9, V0, V25
    76  	VACQ   V1, V9, V0, V17
    77  	VACCCQ V2, V10, V25, V26
    78  	VACQ   V2, V10, V25, V18
    79  
    80  	VLM 0(R5), V5, V6   // load 32 bytes of x into V5..V6
    81  	VLM 0(R6), V13, V14 // load 32 bytes of y into V13..V14
    82  	ADD $32, R5
    83  	ADD $32, R6
    84  
    85  	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
    86  	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
    87  	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
    88  	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
    89  
    90  	VACCCQ V3, V11, V26, V27
    91  	VACQ   V3, V11, V26, V19
    92  	VACCCQ V4, V12, V27, V28
    93  	VACQ   V4, V12, V27, V20
    94  
    95  	VLM 0(R5), V7, V8   // load 32 bytes of x into V7..V8
    96  	VLM 0(R6), V15, V16 // load 32 bytes of y into V15..V16
    97  	ADD $32, R5
    98  	ADD $32, R6
    99  
   100  	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
   101  	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
   102  	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
   103  	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
   104  
   105  	VACCCQ V5, V13, V28, V29
   106  	VACQ   V5, V13, V28, V21
   107  	VACCCQ V6, V14, V29, V30
   108  	VACQ   V6, V14, V29, V22
   109  
   110  	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
   111  	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
   112  	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
   113  	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
   114  
   115  	VACCCQ V7, V15, V30, V31
   116  	VACQ   V7, V15, V30, V23
   117  	VACCCQ V8, V16, V31, V0  // V0 has carry-over
   118  	VACQ   V8, V16, V31, V24
   119  
   120  	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
   121  	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
   122  	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
   123  	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
   124  	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
   125  	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
   126  	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
   127  	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
   128  	VSTM  V17, V24, 0(R7)     // 128-bytes into z
   129  	ADD   $128, R7
   130  	ADD   $128, R10           // i += 16
   131  	SUB   $16, R3             // n -= 16
   132  	BGE   UU1                 // if n >= 0 goto UU1
   133  	VLGVG $1, V0, R4          // put cf into R4
   134  	NEG   R4, R4              // save cf
   135  
   136  A1:
   137  	ADD $12, R3 // n += 12 (undo the SUB $12 above; n stays biased by -4 for the 4x loop)
   138  
   139  	// Change the BLT below to BR to disable the unrolled loop.
   140  	BLT v1 // if n < 0 goto v1
   141  
   142  U1:  // n >= 0
   143  	// regular loop body unrolled 4x
   144  	MOVD 0(R8)(R10*1), R5
   145  	MOVD 8(R8)(R10*1), R6
   146  	MOVD 16(R8)(R10*1), R7
   147  	MOVD 24(R8)(R10*1), R1
   148  	ADDC R4, R4             // restore CF
   149  	MOVD 0(R9)(R10*1), R11
   150  	ADDE R11, R5
   151  	MOVD 8(R9)(R10*1), R11
   152  	ADDE R11, R6
   153  	MOVD 16(R9)(R10*1), R11
   154  	ADDE R11, R7
   155  	MOVD 24(R9)(R10*1), R11
   156  	ADDE R11, R1
   157  	MOVD R0, R4
   158  	ADDE R4, R4             // save CF
   159  	NEG  R4, R4
   160  	MOVD R5, 0(R2)(R10*1)
   161  	MOVD R6, 8(R2)(R10*1)
   162  	MOVD R7, 16(R2)(R10*1)
   163  	MOVD R1, 24(R2)(R10*1)
   164  
   165  	ADD $32, R10 // i += 4
   166  	SUB $4, R3   // n -= 4
   167  	BGE U1       // if n >= 0 goto U1
   168  
   169  v1:
   170  	ADD $4, R3 // n += 4
   171  	BLE E1     // if n <= 0 goto E1
   172  
   173  L1:  // n > 0
   174  	ADDC R4, R4            // restore CF
   175  	MOVD 0(R8)(R10*1), R5
   176  	MOVD 0(R9)(R10*1), R11
   177  	ADDE R11, R5
   178  	MOVD R5, 0(R2)(R10*1)
   179  	MOVD R0, R4
   180  	ADDE R4, R4            // save CF
   181  	NEG  R4, R4
   182  
   183  	ADD $8, R10 // i++
   184  	SUB $1, R3  // n--
   185  	BGT L1      // if n > 0 goto L1
   186  
   187  E1:
   188  	NEG  R4, R4
   189  	MOVD R4, c+72(FP) // return c
   190  	RET
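
        // Both addVV implementations compute the same word-by-word sum with carry
        // propagation. Between scalar iterations the carry is parked in R4 as 0 or -1:
        // ADDE R4, R4 with R4 = 0 captures the carry flag, NEG turns it into 0/-1, and
        // ADDC R4, R4 regenerates the flag on the next pass; the final NEG before the
        // return converts it back to 0 or 1. A minimal Go sketch of the computed result
        // (hypothetical helper, assuming Word is the unsigned word type from arith.go):
        //
        //	import "math/bits"
        //
        //	func addVVSketch(z, x, y []Word) (c Word) {
        //		for i := range z {
        //			sum, carry := bits.Add(uint(x[i]), uint(y[i]), uint(c))
        //			z[i] = Word(sum)
        //			c = Word(carry)
        //		}
        //		return
        //	}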
   191  
   192  TEXT ·addVV_novec(SB), NOSPLIT, $0
   193  novec:
   194  	MOVD z_len+8(FP), R3
   195  	MOVD x+24(FP), R8
   196  	MOVD y+48(FP), R9
   197  	MOVD z+0(FP), R2
   198  
   199  	MOVD $0, R4  // c = 0
   200  	MOVD $0, R0  // make sure it's zero
   201  	MOVD $0, R10 // i = 0
   202  
   203  	// Change the BLT below to BR to disable the unrolled loop.
   204  	SUB $4, R3 // n -= 4
   205  	BLT v1n    // if n < 0 goto v1n
   206  
   207  U1n:  // n >= 0
   208  	// regular loop body unrolled 4x
   209  	MOVD 0(R8)(R10*1), R5
   210  	MOVD 8(R8)(R10*1), R6
   211  	MOVD 16(R8)(R10*1), R7
   212  	MOVD 24(R8)(R10*1), R1
   213  	ADDC R4, R4             // restore CF
   214  	MOVD 0(R9)(R10*1), R11
   215  	ADDE R11, R5
   216  	MOVD 8(R9)(R10*1), R11
   217  	ADDE R11, R6
   218  	MOVD 16(R9)(R10*1), R11
   219  	ADDE R11, R7
   220  	MOVD 24(R9)(R10*1), R11
   221  	ADDE R11, R1
   222  	MOVD R0, R4
   223  	ADDE R4, R4             // save CF
   224  	NEG  R4, R4
   225  	MOVD R5, 0(R2)(R10*1)
   226  	MOVD R6, 8(R2)(R10*1)
   227  	MOVD R7, 16(R2)(R10*1)
   228  	MOVD R1, 24(R2)(R10*1)
   229  
   230  	ADD $32, R10 // i += 4
   231  	SUB $4, R3   // n -= 4
   232  	BGE U1n      // if n >= 0 goto U1n
   233  
   234  v1n:
   235  	ADD $4, R3 // n += 4
   236  	BLE E1n    // if n <= 0 goto E1n
   237  
   238  L1n:  // n > 0
   239  	ADDC R4, R4            // restore CF
   240  	MOVD 0(R8)(R10*1), R5
   241  	MOVD 0(R9)(R10*1), R11
   242  	ADDE R11, R5
   243  	MOVD R5, 0(R2)(R10*1)
   244  	MOVD R0, R4
   245  	ADDE R4, R4            // save CF
   246  	NEG  R4, R4
   247  
   248  	ADD $8, R10 // i++
   249  	SUB $1, R3  // n--
   250  	BGT L1n     // if n > 0 goto L1n
   251  
   252  E1n:
   253  	NEG  R4, R4
   254  	MOVD R4, c+72(FP) // return c
   255  	RET
   256  
   257  TEXT ·subVV(SB), NOSPLIT, $0
   258  	MOVD subvectorfacility+0x00(SB), R1
   259  	BR   (R1)
   260  
   261  TEXT ·subVV_check(SB), NOSPLIT, $0
   262  	MOVB   ·hasVX(SB), R1
   263  	CMPBEQ R1, $1, vectorimpl              // hasVX == 1, vector facility available
   264  	MOVD   $subvectorfacility+0x00(SB), R1
   265  	MOVD   $·subVV_novec(SB), R2
   266  	MOVD   R2, 0(R1)
   267  
   268  	// MOVD	$·subVV_novec(SB), 0(R1)
   269  	BR ·subVV_novec(SB)
   270  
   271  vectorimpl:
   272  	MOVD $subvectorfacility+0x00(SB), R1
   273  	MOVD $·subVV_vec(SB), R2
   274  	MOVD R2, 0(R1)
   275  
   276  	// MOVD	$·subVV_vec(SB), 0(R1)
   277  	BR ·subVV_vec(SB)
   278  
   279  GLOBL subvectorfacility+0x00(SB), NOPTR, $8
   280  DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)
   281  
   282  // amd64-to-s390x register mapping (amd64 = s390x): DI = R3, CX = R4, SI = R10, R8 = R8, R9 = R9, R10 = R2, R11 = R5, R12 = R6, R13 = R7, R14 = R1 (R0 is kept at zero); R11 is also used as scratch
   283  // func subVV(z, x, y []Word) (c Word)
   284  // (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
   285  TEXT ·subVV_vec(SB), NOSPLIT, $0
   286  	MOVD z_len+8(FP), R3
   287  	MOVD x+24(FP), R8
   288  	MOVD y+48(FP), R9
   289  	MOVD z+0(FP), R2
   290  	MOVD $0, R4          // c = 0
   291  	MOVD $0, R0          // make sure it's zero
   292  	MOVD $0, R10         // i = 0
   293  
   294  	// Change the BLT below to BR to disable the unrolled loop.
   295  	SUB $4, R3  // n -= 4
   296  	BLT v1      // if n < 0 goto v1
   297  	SUB $12, R3 // n -= 16
   298  	BLT A1      // if n < 0 goto A1
   299  
   300  	MOVD R8, R5
   301  	MOVD R9, R6
   302  	MOVD R2, R7
   303  
   304  	// n >= 0
   305  	// regular loop body unrolled 16x
   306  	VZERO V0         // cf = 0
   307  	MOVD  $1, R4     // on s390x the subtraction carry starts as 1 (no borrow)
   308  	VLVGG $1, R4, V0 // put carry into V0
   309  
   310  UU1:
   311  	VLM  0(R5), V1, V4    // load 64 bytes of x into V1..V4
   312  	ADD  $64, R5
   313  	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
   314  	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
   315  
   316  	VLM  0(R6), V9, V12      // load 64 bytes of y into V9..V12
   317  	ADD  $64, R6
   318  	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
   319  	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
   320  
   321  	VSBCBIQ V1, V9, V0, V25
   322  	VSBIQ   V1, V9, V0, V17
   323  	VSBCBIQ V2, V10, V25, V26
   324  	VSBIQ   V2, V10, V25, V18
   325  
   326  	VLM 0(R5), V5, V6   // load 32 bytes of x into V5..V6
   327  	VLM 0(R6), V13, V14 // load 32 bytes of y into V13..V14
   328  	ADD $32, R5
   329  	ADD $32, R6
   330  
   331  	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
   332  	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
   333  	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
   334  	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
   335  
   336  	VSBCBIQ V3, V11, V26, V27
   337  	VSBIQ   V3, V11, V26, V19
   338  	VSBCBIQ V4, V12, V27, V28
   339  	VSBIQ   V4, V12, V27, V20
   340  
   341  	VLM 0(R5), V7, V8   // load 32 bytes of x into V7..V8
   342  	VLM 0(R6), V15, V16 // load 32 bytes of y into V15..V16
   343  	ADD $32, R5
   344  	ADD $32, R6
   345  
   346  	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
   347  	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
   348  	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
   349  	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
   350  
   351  	VSBCBIQ V5, V13, V28, V29
   352  	VSBIQ   V5, V13, V28, V21
   353  	VSBCBIQ V6, V14, V29, V30
   354  	VSBIQ   V6, V14, V29, V22
   355  
   356  	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
   357  	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
   358  	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
   359  	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
   360  
   361  	VSBCBIQ V7, V15, V30, V31
   362  	VSBIQ   V7, V15, V30, V23
   363  	VSBCBIQ V8, V16, V31, V0  // V0 has carry-over
   364  	VSBIQ   V8, V16, V31, V24
   365  
   366  	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
   367  	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
   368  	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
   369  	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
   370  	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
   371  	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
   372  	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
   373  	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
   374  	VSTM  V17, V24, 0(R7)     // 128-bytes into z
   375  	ADD   $128, R7
   376  	ADD   $128, R10           // i += 16
   377  	SUB   $16, R3             // n -= 16
   378  	BGE   UU1                 // if n >= 0 goto UU1
   379  	VLGVG $1, V0, R4          // put cf into R4
   380  	SUB   $1, R4              // save cf
   381  
   382  A1:
   383  	ADD $12, R3 // n += 12 (undo the SUB $12 above; n stays biased by -4 for the 4x loop)
   384  	BLT v1      // if n < 0 goto v1
   385  
   386  U1:  // n >= 0
   387  	// regular loop body unrolled 4x
   388  	MOVD 0(R8)(R10*1), R5
   389  	MOVD 8(R8)(R10*1), R6
   390  	MOVD 16(R8)(R10*1), R7
   391  	MOVD 24(R8)(R10*1), R1
   392  	MOVD R0, R11
   393  	SUBC R4, R11            // restore CF
   394  	MOVD 0(R9)(R10*1), R11
   395  	SUBE R11, R5
   396  	MOVD 8(R9)(R10*1), R11
   397  	SUBE R11, R6
   398  	MOVD 16(R9)(R10*1), R11
   399  	SUBE R11, R7
   400  	MOVD 24(R9)(R10*1), R11
   401  	SUBE R11, R1
   402  	MOVD R0, R4
   403  	SUBE R4, R4             // save CF
   404  	MOVD R5, 0(R2)(R10*1)
   405  	MOVD R6, 8(R2)(R10*1)
   406  	MOVD R7, 16(R2)(R10*1)
   407  	MOVD R1, 24(R2)(R10*1)
   408  
   409  	ADD $32, R10 // i += 4
   410  	SUB $4, R3   // n -= 4
   411  	BGE U1       // if n >= 0 goto U1
   412  
   413  v1:
   414  	ADD $4, R3 // n += 4
   415  	BLE E1     // if n <= 0 goto E1
   416  
   417  L1:  // n > 0
   418  	MOVD R0, R11
   419  	SUBC R4, R11           // restore CF
   420  	MOVD 0(R8)(R10*1), R5
   421  	MOVD 0(R9)(R10*1), R11
   422  	SUBE R11, R5
   423  	MOVD R5, 0(R2)(R10*1)
   424  	MOVD R0, R4
   425  	SUBE R4, R4            // save CF
   426  
   427  	ADD $8, R10 // i++
   428  	SUB $1, R3  // n--
   429  	BGT L1      // if n > 0 goto L1
   430  
   431  E1:
   432  	NEG  R4, R4
   433  	MOVD R4, c+72(FP) // return c
   434  	RET
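
        // subVV mirrors addVV, but on s390x a subtraction reports "no borrow" as
        // carry = 1, so the vector loop seeds V0 with 1 and the scalar code uses
        // SUBC/SUBE in place of ADDC/ADDE. The computed result is the plain
        // word-by-word difference; a minimal Go sketch (hypothetical helper, same
        // Word assumption as the addVV sketch above):
        //
        //	import "math/bits"
        //
        //	func subVVSketch(z, x, y []Word) (c Word) {
        //		for i := range z {
        //			diff, borrow := bits.Sub(uint(x[i]), uint(y[i]), uint(c))
        //			z[i] = Word(diff)
        //			c = Word(borrow)
        //		}
        //		return
        //	}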
   435  
   436  // amd64-to-s390x register mapping (amd64 = s390x): DI = R3, CX = R4, SI = R10, R8 = R8, R9 = R9, R10 = R2, R11 = R5, R12 = R6, R13 = R7, R14 = R1 (R0 is kept at zero); R11 is also used as scratch
   437  // func subVV(z, x, y []Word) (c Word)
   438  // (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
   439  TEXT ·subVV_novec(SB), NOSPLIT, $0
   440  	MOVD z_len+8(FP), R3
   441  	MOVD x+24(FP), R8
   442  	MOVD y+48(FP), R9
   443  	MOVD z+0(FP), R2
   444  
   445  	MOVD $0, R4  // c = 0
   446  	MOVD $0, R0  // make sure it's zero
   447  	MOVD $0, R10 // i = 0
   448  
   449  	// Change the BLT below to BR to disable the unrolled loop.
   450  	SUB $4, R3 // n -= 4
   451  	BLT v1     // if n < 0 goto v1
   452  
   453  U1:  // n >= 0
   454  	// regular loop body unrolled 4x
   455  	MOVD 0(R8)(R10*1), R5
   456  	MOVD 8(R8)(R10*1), R6
   457  	MOVD 16(R8)(R10*1), R7
   458  	MOVD 24(R8)(R10*1), R1
   459  	MOVD R0, R11
   460  	SUBC R4, R11            // restore CF
   461  	MOVD 0(R9)(R10*1), R11
   462  	SUBE R11, R5
   463  	MOVD 8(R9)(R10*1), R11
   464  	SUBE R11, R6
   465  	MOVD 16(R9)(R10*1), R11
   466  	SUBE R11, R7
   467  	MOVD 24(R9)(R10*1), R11
   468  	SUBE R11, R1
   469  	MOVD R0, R4
   470  	SUBE R4, R4             // save CF
   471  	MOVD R5, 0(R2)(R10*1)
   472  	MOVD R6, 8(R2)(R10*1)
   473  	MOVD R7, 16(R2)(R10*1)
   474  	MOVD R1, 24(R2)(R10*1)
   475  
   476  	ADD $32, R10 // i += 4
   477  	SUB $4, R3   // n -= 4
   478  	BGE U1       // if n >= 0 goto U1
   479  
   480  v1:
   481  	ADD $4, R3 // n += 4
   482  	BLE E1     // if n <= 0 goto E1
   483  
   484  L1:  // n > 0
   485  	MOVD R0, R11
   486  	SUBC R4, R11           // restore CF
   487  	MOVD 0(R8)(R10*1), R5
   488  	MOVD 0(R9)(R10*1), R11
   489  	SUBE R11, R5
   490  	MOVD R5, 0(R2)(R10*1)
   491  	MOVD R0, R4
   492  	SUBE R4, R4            // save CF
   493  
   494  	ADD $8, R10 // i++
   495  	SUB $1, R3  // n--
   496  	BGT L1      // if n > 0 goto L1
   497  
   498  E1:
   499  	NEG  R4, R4
   500  	MOVD R4, c+72(FP) // return c
   501  	RET
   502  
   503  TEXT ·addVW(SB), NOSPLIT, $0
   504  	MOVD z_len+8(FP), R5 // length of z
   505  	MOVD x+24(FP), R6
   506  	MOVD y+48(FP), R7    // c = y
   507  	MOVD z+0(FP), R8
   508  
   509  	CMPBEQ R5, $0, returnC // if len(z) == 0, return early
   510  
   511  	// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
   512  	ADDC   0(R6), R7
   513  	MOVD   R7, 0(R8)
   514  	CMPBEQ R5, $1, returnResult // len(z) == 1
   515  	MOVD   $0, R9
   516  	ADDE   8(R6), R9
   517  	MOVD   R9, 8(R8)
   518  	CMPBEQ R5, $2, returnResult // len(z) == 2
   519  
   520  	// Update the counters
   521  	MOVD $16, R12    // i = 2
   522  	MOVD $-2(R5), R5 // n = n - 2
   523  
   524  loopOverEachWord:
   525  	BRC  $12, copySetup // carry = 0, copy the rest
   526  	MOVD $1, R9
   527  
   528  	// Originally the carry flag produced by the previous iteration was consumed
   529  	// here (i.e. ADDE could be used for the addition). However, since we already
   530  	// know the carry is 1 (otherwise we would have branched to the copy section),
   531  	// we can use ADDC, so the current iteration does not depend on the carry flag
   532  	// of the previous one. Breaking that dependency helps when the branch above is predicted correctly.
   533  	ADDC 0(R6)(R12*1), R9
   534  	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c
   535  
   536  	MOVD  $8(R12), R12         // i++
   537  	BRCTG R5, loopOverEachWord // n--
   538  
   539  // Return the current carry value
   540  returnResult:
   541  	MOVD $0, R0
   542  	ADDE R0, R0
   543  	MOVD R0, c+56(FP)
   544  	RET
   545  
   546  // Advance the positions of x (R6) and z (R8) by the current counter value and copy the rest.
   547  // Assuming that x and z either do not overlap at all or point to exactly the same memory,
   548  // the copy can be done with a plain MVC loop. The code below uses three copy loops,
   549  // moving 1 word, 4 words, and 32 words at a time respectively; benchmarking shows this
   550  // to be faster than calling runtime·memmove.
   551  copySetup:
   552  	ADD R12, R6
   553  	ADD R12, R8
   554  
   555  	CMPBGE R5, $4, mediumLoop
   556  
   557  smallLoop:  // unrolled copy of the remaining words when n < 4
   558  	CMPBEQ R5, $0, returnZero
   559  	MVC    $8, 0(R6), 0(R8)
   560  	CMPBEQ R5, $1, returnZero
   561  	MVC    $8, 8(R6), 8(R8)
   562  	CMPBEQ R5, $2, returnZero
   563  	MVC    $8, 16(R6), 16(R8)
   564  
   565  returnZero:
   566  	MOVD $0, c+56(FP) // return 0 as carry
   567  	RET
   568  
   569  mediumLoop:
   570  	CMPBLT R5, $4, smallLoop
   571  	CMPBLT R5, $32, mediumLoopBody
   572  
   573  largeLoop:  // Copying 256 bytes at a time.
   574  	MVC    $256, 0(R6), 0(R8)
   575  	MOVD   $256(R6), R6
   576  	MOVD   $256(R8), R8
   577  	MOVD   $-32(R5), R5
   578  	CMPBGE R5, $32, largeLoop
   579  	BR     mediumLoop
   580  
   581  mediumLoopBody:  // Copying 32 bytes at a time
   582  	MVC    $32, 0(R6), 0(R8)
   583  	MOVD   $32(R6), R6
   584  	MOVD   $32(R8), R8
   585  	MOVD   $-4(R5), R5
   586  	CMPBGE R5, $4, mediumLoopBody
   587  	BR     smallLoop
   588  
   589  returnC:
   590  	MOVD R7, c+56(FP)
   591  	RET
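
        // addVW only needs real additions while the carry is still non-zero: as soon
        // as one word produces no carry, the remaining words of x are copied to z
        // unchanged, which is exactly the fast path the MVC copy loops above exploit.
        // A minimal Go sketch of the behaviour (hypothetical helper, same Word
        // assumption as above):
        //
        //	import "math/bits"
        //
        //	func addVWSketch(z, x []Word, y Word) (c Word) {
        //		c = y
        //		for i := range x {
        //			if c == 0 {
        //				copy(z[i:], x[i:]) // nothing left to add
        //				return 0
        //			}
        //			sum, carry := bits.Add(uint(x[i]), uint(c), 0)
        //			z[i] = Word(sum)
        //			c = Word(carry)
        //		}
        //		return
        //	}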
   592  
   593  TEXT ·subVW(SB), NOSPLIT, $0
   594  	MOVD z_len+8(FP), R5
   595  	MOVD x+24(FP), R6
   596  	MOVD y+48(FP), R7    // c = y, the word to subtract (acts as the initial borrow)
   597  	MOVD z+0(FP), R8
   598  	MOVD $0, R0          // ensure R0 is zero; it is used as the constant 0 below
   599  
   600  	CMPBEQ R5, $0, returnC // if len(z) == 0, return early
   601  
   602  	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
   603  	MOVD   0(R6), R9
   604  	SUBC   R7, R9
   605  	MOVD   R9, 0(R8)
   606  	CMPBEQ R5, $1, returnResult
   607  	MOVD   8(R6), R9
   608  	SUBE   R0, R9
   609  	MOVD   R9, 8(R8)
   610  	CMPBEQ R5, $2, returnResult
   611  
   612  	// Update the counters
   613  	MOVD $16, R12    // i = 2
   614  	MOVD $-2(R5), R5 // n = n - 2
   615  
   616  loopOverEachWord:
   617  	BRC  $3, copySetup    // no borrow, copy the rest
   618  	MOVD 0(R6)(R12*1), R9
   619  
   620  	// Originally the borrow flag produced by the previous iteration was consumed
   621  	// here (i.e. SUBE could be used for the subtraction). However, since we already
   622  	// know the borrow is 1 (otherwise we would have branched to the copy section),
   623  	// we can use SUBC, so the current iteration does not depend on the borrow flag
   624  	// of the previous one. Breaking that dependency helps when the branch above is predicted correctly.
   625  	SUBC $1, R9
   626  	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1
   627  
   628  	MOVD  $8(R12), R12         // i++
   629  	BRCTG R5, loopOverEachWord // n--
   630  
   631  // return the current borrow value
   632  returnResult:
   633  	SUBE R0, R0
   634  	NEG  R0, R0
   635  	MOVD R0, c+56(FP)
   636  	RET
   637  
   638  // Advance the positions of x (R6) and z (R8) by the current counter value and copy the rest.
   639  // Assuming that x and z either do not overlap at all or point to exactly the same memory,
   640  // the copy can be done with a plain MVC loop. The code below uses three copy loops,
   641  // moving 1 word, 4 words, and 32 words at a time respectively; benchmarking shows this
   642  // to be faster than calling runtime·memmove.
   643  copySetup:
   644  	ADD R12, R6
   645  	ADD R12, R8
   646  
   647  	CMPBGE R5, $4, mediumLoop
   648  
   649  smallLoop:  // unrolled copy of the remaining words when n < 4
   650  	CMPBEQ R5, $0, returnZero
   651  	MVC    $8, 0(R6), 0(R8)
   652  	CMPBEQ R5, $1, returnZero
   653  	MVC    $8, 8(R6), 8(R8)
   654  	CMPBEQ R5, $2, returnZero
   655  	MVC    $8, 16(R6), 16(R8)
   656  
   657  returnZero:
   658  	MOVD $0, c+56(FP) // return 0 as borrow
   659  	RET
   660  
   661  mediumLoop:
   662  	CMPBLT R5, $4, smallLoop
   663  	CMPBLT R5, $32, mediumLoopBody
   664  
   665  largeLoop:  // Copying 256 bytes at a time
   666  	MVC    $256, 0(R6), 0(R8)
   667  	MOVD   $256(R6), R6
   668  	MOVD   $256(R8), R8
   669  	MOVD   $-32(R5), R5
   670  	CMPBGE R5, $32, largeLoop
   671  	BR     mediumLoop
   672  
   673  mediumLoopBody:  // Copying 32 bytes at a time
   674  	MVC    $32, 0(R6), 0(R8)
   675  	MOVD   $32(R6), R6
   676  	MOVD   $32(R8), R8
   677  	MOVD   $-4(R5), R5
   678  	CMPBGE R5, $4, mediumLoopBody
   679  	BR     smallLoop
   680  
   681  returnC:
   682  	MOVD R7, c+56(FP)
   683  	RET
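
        // subVW is the mirror image: it subtracts y from x[0], propagates the borrow,
        // and switches to the MVC copy loops once the borrow dies out. A minimal Go
        // sketch under the same assumptions as the addVW sketch above:
        //
        //	func subVWSketch(z, x []Word, y Word) (c Word) {
        //		c = y
        //		for i := range x {
        //			if c == 0 {
        //				copy(z[i:], x[i:]) // nothing left to subtract
        //				return 0
        //			}
        //			diff, borrow := bits.Sub(uint(x[i]), uint(c), 0)
        //			z[i] = Word(diff)
        //			c = Word(borrow)
        //		}
        //		return
        //	}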
   684  
   685  // func shlVU(z, x []Word, s uint) (c Word)
   686  TEXT ·shlVU(SB), NOSPLIT, $0
   687  	BR ·shlVU_g(SB)
   688  
   689  // func shrVU(z, x []Word, s uint) (c Word)
   690  TEXT ·shrVU(SB), NOSPLIT, $0
   691  	BR ·shrVU_g(SB)
   692  
   693  // amd64-to-s390x register mapping (amd64 = s390x): CX = R4, R8 = R8, R9 = R9, R10 = R2, R11 = R5, DX = R3, AX = R6, BX = R1 (R0 is kept at zero); R11 is also used as scratch and R7 holds i
   694  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
   695  TEXT ·mulAddVWW(SB), NOSPLIT, $0
   696  	MOVD z+0(FP), R2
   697  	MOVD x+24(FP), R8
   698  	MOVD y+48(FP), R9
   699  	MOVD r+56(FP), R4    // c = r
   700  	MOVD z_len+8(FP), R5
   701  	MOVD $0, R1          // i*8 = 0
   702  	MOVD $0, R7          // i = 0
   703  	MOVD $0, R0          // make sure it's zero
   704  	BR   E5
   705  
   706  L5:
   707  	MOVD   (R8)(R1*1), R6
   708  	MULHDU R9, R6
   709  	ADDC   R4, R11         // add to low order bits
   710  	ADDE   R0, R6
   711  	MOVD   R11, (R2)(R1*1)
   712  	MOVD   R6, R4
   713  	ADD    $8, R1          // i*8 + 8
   714  	ADD    $1, R7          // i++
   715  
   716  E5:
   717  	CMPBLT R7, R5, L5 // i < n
   718  
   719  	MOVD R4, c+64(FP)
   720  	RET
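
        // mulAddVWW computes z = x*y + r one word at a time: MULHDU leaves the 128-bit
        // product of x[i] and y in a high/low register pair, the running carry is added
        // to the low half, and any overflow is folded into the high half, which becomes
        // the carry for the next word. A minimal Go sketch of the recurrence
        // (hypothetical helper, same Word assumption as above):
        //
        //	import "math/bits"
        //
        //	func mulAddVWWSketch(z, x []Word, y, r Word) (c Word) {
        //		c = r
        //		for i := range z {
        //			hi, lo := bits.Mul(uint(x[i]), uint(y))
        //			lo, carry := bits.Add(lo, uint(c), 0)
        //			z[i] = Word(lo)
        //			c = Word(hi + carry)
        //		}
        //		return
        //	}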
   721  
   722  // func addMulVVW(z, x []Word, y Word) (c Word)
   723  // amd64-to-s390x register mapping (amd64 = s390x): CX = R4, R8 = R8, R9 = R9, R10 = R2, R11 = R5, AX = R11, DX = R6, R12 = R12, BX = R1 (R0 is kept at zero); R11 is also used as scratch and R7 holds i
   724  TEXT ·addMulVVW(SB), NOSPLIT, $0
   725  	MOVD z+0(FP), R2
   726  	MOVD x+24(FP), R8
   727  	MOVD y+48(FP), R9
   728  	MOVD z_len+8(FP), R5
   729  
   730  	MOVD $0, R1 // i*8 = 0
   731  	MOVD $0, R7 // i = 0
   732  	MOVD $0, R0 // make sure it's zero
   733  	MOVD $0, R4 // c = 0
   734  
   735  	MOVD   R5, R12
   736  	AND    $-2, R12
   737  	CMPBGE R5, $2, A6
   738  	BR     E6
   739  
   740  A6:
   741  	MOVD   (R8)(R1*1), R6
   742  	MULHDU R9, R6
   743  	MOVD   (R2)(R1*1), R10
   744  	ADDC   R10, R11        // add to low order bits
   745  	ADDE   R0, R6
   746  	ADDC   R4, R11
   747  	ADDE   R0, R6
   748  	MOVD   R6, R4
   749  	MOVD   R11, (R2)(R1*1)
   750  
   751  	MOVD   (8)(R8)(R1*1), R6
   752  	MULHDU R9, R6
   753  	MOVD   (8)(R2)(R1*1), R10
   754  	ADDC   R10, R11           // add to low order bits
   755  	ADDE   R0, R6
   756  	ADDC   R4, R11
   757  	ADDE   R0, R6
   758  	MOVD   R6, R4
   759  	MOVD   R11, (8)(R2)(R1*1)
   760  
   761  	ADD $16, R1 // i*8 += 16
   762  	ADD $2, R7  // i += 2
   763  
   764  	CMPBLT R7, R12, A6
   765  	BR     E6
   766  
   767  L6:
   768  	MOVD   (R8)(R1*1), R6
   769  	MULHDU R9, R6
   770  	MOVD   (R2)(R1*1), R10
   771  	ADDC   R10, R11        // add to low order bits
   772  	ADDE   R0, R6
   773  	ADDC   R4, R11
   774  	ADDE   R0, R6
   775  	MOVD   R6, R4
   776  	MOVD   R11, (R2)(R1*1)
   777  
   778  	ADD $8, R1 // i*8 + 8
   779  	ADD $1, R7 // i++
   780  
   781  E6:
   782  	CMPBLT R7, R5, L6 // i < n
   783  
   784  	MOVD R4, c+56(FP)
   785  	RET
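
        // addMulVVW is the multiply-accumulate variant, z[i] += x[i]*y + c, with the
        // main loop above additionally unrolled 2x while at least two words remain.
        // A minimal Go sketch (hypothetical helper, same Word assumption as above):
        //
        //	import "math/bits"
        //
        //	func addMulVVWSketch(z, x []Word, y Word) (c Word) {
        //		for i := range z {
        //			hi, lo := bits.Mul(uint(x[i]), uint(y))
        //			lo, cc := bits.Add(lo, uint(z[i]), 0)
        //			hi += cc
        //			lo, cc = bits.Add(lo, uint(c), 0)
        //			hi += cc
        //			z[i] = Word(lo)
        //			c = Word(hi)
        //		}
        //		return
        //	}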
   786  
   787  
