Text file src/crypto/aes/gcm_ppc64x.s

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build (ppc64 || ppc64le) && !purego
     6  
     7  // Portions based on CRYPTOGAMS code with the following comment:
     8  // # ====================================================================
     9  // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
    10  // # project. The module is, however, dual licensed under OpenSSL and
    11  // # CRYPTOGAMS licenses depending on where you obtain it. For further
    12  // # details see http://www.openssl.org/~appro/cryptogams/.
    13  // # ====================================================================
    14  
    15  // The implementations for gcmHash, gcmInit and gcmMul are based on the generated asm
    16  // from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
    17  // from commit d47afb3c.
    18  
    19  // Changes were made to account for differences in the ABI and in register usage,
    20  // and some arguments were changed to match the way the Go code passes them.
    21  
    22  // Portions that use the stitched AES-GCM approach in counterCryptASM
    23  // are based on code found in
    24  // https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s
    25  
    26  #include "textflag.h"
    27  
    28  #define XIP    R3
    29  #define HTBL   R4
    30  #define INP    R5
    31  #define LEN    R6
    32  
    33  #define XL     V0
    34  #define XM     V1
    35  #define XH     V2
    36  #define IN     V3
    37  #define ZERO   V4
    38  #define T0     V5
    39  #define T1     V6
    40  #define T2     V7
    41  #define XC2    V8
    42  #define H      V9
    43  #define HH     V10
    44  #define HL     V11
    45  #define LEMASK V12
    46  #define XL1    V13
    47  #define XM1    V14
    48  #define XH1    V15
    49  #define IN1    V16
    50  #define H2     V17
    51  #define H2H    V18
    52  #define H2L    V19
    53  #define XL3    V20
    54  #define XM2    V21
    55  #define IN2    V22
    56  #define H3L    V23
    57  #define H3     V24
    58  #define H3H    V25
    59  #define XH3    V26
    60  #define XM3    V27
    61  #define IN3    V28
    62  #define H4L    V29
    63  #define H4     V30
    64  #define H4H    V31
    65  
    66  #define IN0    IN
    67  #define H21L   HL
    68  #define H21H   HH
    69  #define LOPERM H2L
    70  #define HIPERM H2H
    71  
    72  #define VXL    VS32
    73  #define VIN    VS35
    74  #define VXC2   VS40
    75  #define VH     VS41
    76  #define VHH    VS42
    77  #define VHL    VS43
    78  #define VIN1   VS48
    79  #define VH2    VS49
    80  #define VH2H   VS50
    81  #define VH2L   VS51
    82  
    83  #define VIN2   VS54
    84  #define VH3L   VS55
    85  #define VH3    VS56
    86  #define VH3H   VS57
    87  #define VIN3   VS60
    88  #define VH4L   VS61
    89  #define VH4    VS62
    90  #define VH4H   VS63
    91  
    92  #define VIN0   VIN
    93  
    94  #define ESPERM V10
    95  #define TMP2 V11
    96  
    97  // The following macros provide big-endian load/store
    98  // implementations appropriate for the target endianness
    99  // as well as for the power8 and power9 ISA levels.
   100  #ifdef GOARCH_ppc64le
   101  #  ifdef GOPPC64_power9
   102  #define P8_LXVB16X(RA,RB,VT)   LXVB16X (RA)(RB), VT
   103  #define P8_STXVB16X(VS,RA,RB)  STXVB16X VS, (RA)(RB)
   104  #  else
   105  #define NEEDS_ESPERM
   106  #define P8_LXVB16X(RA,RB,VT) \
   107  	LXVD2X  (RA+RB), VT \
   108  	VPERM	VT, VT, ESPERM, VT
   109  
   110  #define P8_STXVB16X(VS,RA,RB) \
   111  	VPERM	VS, VS, ESPERM, TMP2; \
   112  	STXVD2X TMP2, (RA+RB)
   113  
   114  #  endif
   115  #else
   116  #define P8_LXVB16X(RA,RB,VT) \
   117  	LXVD2X  (RA+RB), VT
   118  
   119  #define P8_STXVB16X(VS,RA,RB) \
   120  	STXVD2X VS, (RA+RB)
   121  
   122  #endif
   123  
   124  #define MASK_PTR   R8
   125  
   126  #define MASKV   V0
   127  #define INV     V1
   128  
   129  // The following macros are used for
   130  // the stitched implementation within
   131  // counterCryptASM.
   132  
   133  // Load the initial GCM counter value
   134  // in V30 and set up the counter increment
   135  // in V31
   136  #define SETUP_COUNTER \
   137  	P8_LXVB16X(COUNTER, R0, V30); \
   138  	VSPLTISB $1, V28; \
   139  	VXOR V31, V31, V31; \
   140  	VSLDOI $1, V31, V28, V31
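        // Note: VSPLTISB fills V28 with 0x01 bytes and the VSLDOI shifts a single
        // 0x01 byte into the low end of V31, so V31 is {0, 0, 0, 1} when viewed as
        // 32-bit words. VADDUWM with V31 therefore adds 1 to the rightmost word of
        // the big-endian counter block, i.e. GCM's 32-bit counter increment.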
   141  
   142  // These macros set up the initial value
   143  // for a single encryption, or 4 or 8
   144  // stitched encryptions implemented
   145  // with interleaving vciphers.
   146  //
   147  // The input value for each encryption
   148  // is generated by XORing the counter
   149  // from V30 with the first key in VS0
   150  // and incrementing the counter.
   151  //
   152  // Single encryption in V15
   153  #define GEN_VCIPHER_INPUT \
   154  	XXLOR VS0, VS0, V29 \
   155  	VXOR V30, V29, V15; \
   156  	VADDUWM V30, V31, V30
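        // In effect: in = ctr ^ roundkey[0]; ctr = inc32(ctr). Folding the first
        // round key into the input generation means the vcipher/vcipherlast
        // sequences below only have to cover the remaining rounds.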
   157  
   158  // 4 encryptions in V15 - V18
   159  #define GEN_VCIPHER_4_INPUTS \
   160  	XXLOR VS0, VS0, V29; \
   161  	VXOR V30, V29, V15; \
   162  	VADDUWM V30, V31, V30; \
   163  	VXOR V30, V29, V16; \
   164  	VADDUWM V30, V31, V30; \
   165  	VXOR V30, V29, V17; \
   166  	VADDUWM V30, V31, V30; \
   167  	VXOR V30, V29, V18; \
   168  	VADDUWM V30, V31, V30
   169  
   170  // 8 encryptions in V15 - V22
   171  #define GEN_VCIPHER_8_INPUTS \
   172  	XXLOR VS0, VS0, V29; \
   173  	VXOR V30, V29, V15; \
   174  	VADDUWM V30, V31, V30; \
   175  	VXOR V30, V29, V16; \
   176  	VADDUWM V30, V31, V30; \
   177  	VXOR V30, V29, V17; \
   178  	VADDUWM V30, V31, V30; \
   179  	VXOR V30, V29, V18; \
   180  	VADDUWM V30, V31, V30; \
   181  	VXOR V30, V29, V19; \
   182  	VADDUWM V30, V31, V30; \
   183  	VXOR V30, V29, V20; \
   184  	VADDUWM V30, V31, V30; \
   185  	VXOR V30, V29, V21; \
   186  	VADDUWM V30, V31, V30; \
   187  	VXOR V30, V29, V22; \
   188  	VADDUWM V30, V31, V30
   189  
   190  // Load the round keys to be used for
   191  // encryption into VS0 - VS10/VS12/VS14,
   192  // depending on key_len (the AES round
   193  // count: 10, 12 or 14). Valid key sizes
   194  // are verified here; an unsupported key_len
   195  // faults via a store to address 0. CR2 is set
   196  // by comparing key_len with 12 and is used throughout to check key_len.
   197  #define LOAD_KEYS(blk_key, key_len) \
   198  	MOVD	$16, R16; \
   199  	MOVD	$32, R17; \
   200  	MOVD	$48, R18; \
   201  	MOVD	$64, R19; \
   202  	LXVD2X (blk_key)(R0), VS0; \
   203  	LXVD2X (blk_key)(R16), VS1; \
   204  	LXVD2X (blk_key)(R17), VS2; \
   205  	LXVD2X (blk_key)(R18), VS3; \
   206  	LXVD2X (blk_key)(R19), VS4; \
   207  	ADD $64, R16; \
   208  	ADD $64, R17; \
   209  	ADD $64, R18; \
   210  	ADD $64, R19; \
   211  	LXVD2X (blk_key)(R16), VS5; \
   212  	LXVD2X (blk_key)(R17), VS6; \
   213  	LXVD2X (blk_key)(R18), VS7; \
   214  	LXVD2X (blk_key)(R19), VS8; \
   215  	ADD $64, R16; \
   216  	ADD $64, R17; \
   217  	ADD $64, R18; \
   218  	ADD $64, R19; \
   219  	LXVD2X (blk_key)(R16), VS9; \
   220  	LXVD2X (blk_key)(R17), VS10; \
   221  	CMP key_len, $12, CR2; \
   222  	CMP key_len, $10; \
   223  	BEQ keysLoaded; \
   224  	LXVD2X (blk_key)(R18), VS11; \
   225  	LXVD2X (blk_key)(R19), VS12; \
   226  	BEQ CR2, keysLoaded; \
   227  	ADD $64, R16; \
   228  	ADD $64, R17; \
   229  	LXVD2X (blk_key)(R16), VS13; \
   230  	LXVD2X (blk_key)(R17), VS14; \
   231  	CMP key_len, $14; \
   232  	BEQ keysLoaded; \
   233  	MOVD R0,0(R0); \
   234  keysLoaded:
   235  
   236  // Encrypt 1 (vin) with first 9
   237  // keys from VS1 - VS9.
   238  #define VCIPHER_1X9_KEYS(vin) \
   239  	XXLOR VS1, VS1, V23; \
   240  	XXLOR VS2, VS2, V24; \
   241  	XXLOR VS3, VS3, V25; \
   242  	XXLOR VS4, VS4, V26; \
   243  	XXLOR VS5, VS5, V27; \
   244  	VCIPHER vin, V23, vin; \
   245  	VCIPHER vin, V24, vin; \
   246  	VCIPHER vin, V25, vin; \
   247  	VCIPHER vin, V26, vin; \
   248  	VCIPHER vin, V27, vin; \
   249  	XXLOR VS6, VS6, V23; \
   250  	XXLOR VS7, VS7, V24; \
   251  	XXLOR VS8, VS8, V25; \
   252  	XXLOR VS9, VS9, V26; \
   253  	VCIPHER vin, V23, vin; \
   254  	VCIPHER vin, V24, vin; \
   255  	VCIPHER vin, V25, vin; \
   256  	VCIPHER	vin, V26, vin
   257  
   258  // Encrypt 1 value (vin) with
   259  // 2 specified keys
   260  #define VCIPHER_1X2_KEYS(vin, key1, key2) \
   261  	XXLOR key1, key1, V25; \
   262  	XXLOR key2, key2, V26; \
   263  	VCIPHER vin, V25, vin; \
   264  	VCIPHER vin, V26, vin
   265  
   266  // Encrypt 4 values in V15 - V18
   267  // with the specified key from
   268  // VS1 - VS9.
   269  #define VCIPHER_4X1_KEY(key) \
   270  	XXLOR key, key, V23; \
   271  	VCIPHER V15, V23, V15; \
   272  	VCIPHER V16, V23, V16; \
   273  	VCIPHER V17, V23, V17; \
   274  	VCIPHER V18, V23, V18
   275  
   276  // Encrypt 8 values in V15 - V22
   277  // with the specified key,
   278  // assuming it is a VSreg
   279  #define VCIPHER_8X1_KEY(key) \
   280  	XXLOR key, key, V23; \
   281  	VCIPHER V15, V23, V15; \
   282  	VCIPHER V16, V23, V16; \
   283  	VCIPHER V17, V23, V17; \
   284  	VCIPHER V18, V23, V18; \
   285  	VCIPHER V19, V23, V19; \
   286  	VCIPHER V20, V23, V20; \
   287  	VCIPHER V21, V23, V21; \
   288  	VCIPHER V22, V23, V22
   289  
   290  // Load input block into V1-V4
   291  // in big endian order and
   292  // update blk_inp by 64.
   293  #define LOAD_INPUT_BLOCK64(blk_inp) \
   294  	MOVD $16, R16; \
   295  	MOVD $32, R17; \
   296  	MOVD $48, R18; \
   297  	P8_LXVB16X(blk_inp,R0,V1); \
   298  	P8_LXVB16X(blk_inp,R16,V2); \
   299  	P8_LXVB16X(blk_inp,R17,V3); \
   300  	P8_LXVB16X(blk_inp,R18,V4); \
   301  	ADD $64, blk_inp
   302  
   303  // Load input block into V1-V8
   304  // in big endian order and
   305  // Update blk_inp by 128
   306  #define LOAD_INPUT_BLOCK128(blk_inp) \
   307  	MOVD $16, R16; \
   308  	MOVD $32, R17; \
   309  	MOVD $48, R18; \
   310  	MOVD $64, R19; \
   311  	MOVD $80, R20; \
   312  	MOVD $96, R21; \
   313  	MOVD $112, R22; \
   314  	P8_LXVB16X(blk_inp,R0,V1); \
   315  	P8_LXVB16X(blk_inp,R16,V2); \
   316  	P8_LXVB16X(blk_inp,R17,V3); \
   317  	P8_LXVB16X(blk_inp,R18,V4); \
   318  	P8_LXVB16X(blk_inp,R19,V5); \
   319  	P8_LXVB16X(blk_inp,R20,V6); \
   320  	P8_LXVB16X(blk_inp,R21,V7); \
   321  	P8_LXVB16X(blk_inp,R22,V8); \
   322  	ADD $128, blk_inp
   323  
   324  // Finish encryption on 8 streams and
   325  // XOR with input block
   326  #define VCIPHERLAST8_XOR_INPUT \
   327  	VCIPHERLAST     V15, V23, V15; \
   328  	VCIPHERLAST     V16, V23, V16; \
   329  	VCIPHERLAST     V17, V23, V17; \
   330  	VCIPHERLAST     V18, V23, V18; \
   331  	VCIPHERLAST     V19, V23, V19; \
   332  	VCIPHERLAST     V20, V23, V20; \
   333  	VCIPHERLAST     V21, V23, V21; \
   334  	VCIPHERLAST     V22, V23, V22; \
   335  	XXLXOR          V1, V15, V1; \
   336  	XXLXOR          V2, V16, V2; \
   337  	XXLXOR          V3, V17, V3; \
   338  	XXLXOR          V4, V18, V4; \
   339  	XXLXOR          V5, V19, V5; \
   340  	XXLXOR          V6, V20, V6; \
   341  	XXLXOR          V7, V21, V7; \
   342  	XXLXOR          V8, V22, V8
   343  
   344  // Finish encryption on 4 streams and
   345  // XOR with input block
   346  #define VCIPHERLAST4_XOR_INPUT \
   347  	VCIPHERLAST     V15, V23, V15; \
   348  	VCIPHERLAST     V16, V23, V16; \
   349  	VCIPHERLAST     V17, V23, V17; \
   350  	VCIPHERLAST     V18, V23, V18; \
   351  	XXLXOR          V1, V15, V1; \
   352  	XXLXOR          V2, V16, V2; \
   353  	XXLXOR          V3, V17, V3; \
   354  	XXLXOR          V4, V18, V4
   355  
   356  // Store output block from V1-V8
   357  // in big endian order and
   358  // Update blk_out by 128
   359  #define STORE_OUTPUT_BLOCK128(blk_out) \
   360  	P8_STXVB16X(V1,blk_out,R0); \
   361  	P8_STXVB16X(V2,blk_out,R16); \
   362  	P8_STXVB16X(V3,blk_out,R17); \
   363  	P8_STXVB16X(V4,blk_out,R18); \
   364  	P8_STXVB16X(V5,blk_out,R19); \
   365  	P8_STXVB16X(V6,blk_out,R20); \
   366  	P8_STXVB16X(V7,blk_out,R21); \
   367  	P8_STXVB16X(V8,blk_out,R22); \
   368  	ADD $128, blk_out
   369  
   370  // Store output block from V1-V4
   371  // in big endian order and
   372  // Update blk_out by 64
   373  #define STORE_OUTPUT_BLOCK64(blk_out) \
   374  	P8_STXVB16X(V1,blk_out,R0); \
   375  	P8_STXVB16X(V2,blk_out,R16); \
   376  	P8_STXVB16X(V3,blk_out,R17); \
   377  	P8_STXVB16X(V4,blk_out,R18); \
   378  	ADD $64, blk_out
   379  
   380  // func gcmInit(productTable *[256]byte, h []byte)
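        //
        // gcmInit derives the "twisted" hash key from h and precomputes H, H^2, H^3
        // and H^4, each stored as three vectors (low, middle, high). These, together
        // with the 0xc2... reduction constant, are saved in productTable for later
        // use by gcmHash and gcmMul.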
   381  TEXT ·gcmInit(SB), NOSPLIT, $0-32
   382  	MOVD productTable+0(FP), XIP
   383  	MOVD h+8(FP), HTBL
   384  
   385  	MOVD   $0x10, R8
   386  	MOVD   $0x20, R9
   387  	MOVD   $0x30, R10
   388  	LXVD2X (HTBL)(R0), VH // Load H
   389  
   390  	VSPLTISB $-16, XC2           // 0xf0
   391  	VSPLTISB $1, T0              // one
   392  	VADDUBM  XC2, XC2, XC2       // 0xe0
   393  	VXOR     ZERO, ZERO, ZERO
   394  	VOR      XC2, T0, XC2        // 0xe1
   395  	VSLDOI   $15, XC2, ZERO, XC2 // 0xe1...
   396  	VSLDOI   $1, ZERO, T0, T1    // ...1
   397  	VADDUBM  XC2, XC2, XC2       // 0xc2...
   398  	VSPLTISB $7, T2
   399  	VOR      XC2, T1, XC2        // 0xc2....01
   400  	VSPLTB   $0, H, T1           // most significant byte
   401  	VSL      H, T0, H            // H<<=1
   402  	VSRAB    T1, T2, T1          // broadcast carry bit
   403  	VAND     T1, XC2, T1
   404  	VXOR     H, T1, IN           // twisted H
   405  
   406  	VSLDOI $8, IN, IN, H      // twist even more ...
   407  	VSLDOI $8, ZERO, XC2, XC2 // 0xc2.0
   408  	VSLDOI $8, ZERO, H, HL    // ... and split
   409  	VSLDOI $8, H, ZERO, HH
   410  
   411  	STXVD2X VXC2, (XIP+R0) // save pre-computed table
   412  	STXVD2X VHL, (XIP+R8)
   413  	MOVD    $0x40, R8
   414  	STXVD2X VH, (XIP+R9)
   415  	MOVD    $0x50, R9
   416  	STXVD2X VHH, (XIP+R10)
   417  	MOVD    $0x60, R10
   418  
   419  	VPMSUMD IN, HL, XL // H.lo·H.lo
   420  	VPMSUMD IN, H, XM  // H.hi·H.lo+H.lo·H.hi
   421  	VPMSUMD IN, HH, XH // H.hi·H.hi
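        // The three VPMSUMD results above are the low, middle and high parts of the
        // 256-bit carry-less product of the two 128-bit operands. The "reduction
        // phase" steps below fold that product back into 128 bits modulo the GHASH
        // polynomial x^128 + x^7 + x^2 + x + 1, using the 0xc2...01 constant in XC2.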
   422  
   423  	VPMSUMD XL, XC2, T2 // 1st reduction phase
   424  
   425  	VSLDOI $8, XM, ZERO, T0
   426  	VSLDOI $8, ZERO, XM, T1
   427  	VXOR   XL, T0, XL
   428  	VXOR   XH, T1, XH
   429  
   430  	VSLDOI $8, XL, XL, XL
   431  	VXOR   XL, T2, XL
   432  
   433  	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
   434  	VPMSUMD XL, XC2, XL
   435  	VXOR    T1, XH, T1
   436  	VXOR    XL, T1, IN1
   437  
   438  	VSLDOI $8, IN1, IN1, H2
   439  	VSLDOI $8, ZERO, H2, H2L
   440  	VSLDOI $8, H2, ZERO, H2H
   441  
   442  	STXVD2X VH2L, (XIP+R8)  // save H^2
   443  	MOVD    $0x70, R8
   444  	STXVD2X VH2, (XIP+R9)
   445  	MOVD    $0x80, R9
   446  	STXVD2X VH2H, (XIP+R10)
   447  	MOVD    $0x90, R10
   448  
   449  	VPMSUMD IN, H2L, XL   // H.lo·H^2.lo
   450  	VPMSUMD IN1, H2L, XL1 // H^2.lo·H^2.lo
   451  	VPMSUMD IN, H2, XM    // H.hi·H^2.lo+H.lo·H^2.hi
   452  	VPMSUMD IN1, H2, XM1  // H^2.hi·H^2.lo+H^2.lo·H^2.hi
   453  	VPMSUMD IN, H2H, XH   // H.hi·H^2.hi
   454  	VPMSUMD IN1, H2H, XH1 // H^2.hi·H^2.hi
   455  
   456  	VPMSUMD XL, XC2, T2  // 1st reduction phase
   457  	VPMSUMD XL1, XC2, HH // 1st reduction phase
   458  
   459  	VSLDOI $8, XM, ZERO, T0
   460  	VSLDOI $8, ZERO, XM, T1
   461  	VSLDOI $8, XM1, ZERO, HL
   462  	VSLDOI $8, ZERO, XM1, H
   463  	VXOR   XL, T0, XL
   464  	VXOR   XH, T1, XH
   465  	VXOR   XL1, HL, XL1
   466  	VXOR   XH1, H, XH1
   467  
   468  	VSLDOI $8, XL, XL, XL
   469  	VSLDOI $8, XL1, XL1, XL1
   470  	VXOR   XL, T2, XL
   471  	VXOR   XL1, HH, XL1
   472  
   473  	VSLDOI  $8, XL, XL, T1  // 2nd reduction phase
   474  	VSLDOI  $8, XL1, XL1, H // 2nd reduction phase
   475  	VPMSUMD XL, XC2, XL
   476  	VPMSUMD XL1, XC2, XL1
   477  	VXOR    T1, XH, T1
   478  	VXOR    H, XH1, H
   479  	VXOR    XL, T1, XL
   480  	VXOR    XL1, H, XL1
   481  
   482  	VSLDOI $8, XL, XL, H
   483  	VSLDOI $8, XL1, XL1, H2
   484  	VSLDOI $8, ZERO, H, HL
   485  	VSLDOI $8, H, ZERO, HH
   486  	VSLDOI $8, ZERO, H2, H2L
   487  	VSLDOI $8, H2, ZERO, H2H
   488  
   489  	STXVD2X VHL, (XIP+R8)   // save H^3
   490  	MOVD    $0xa0, R8
   491  	STXVD2X VH, (XIP+R9)
   492  	MOVD    $0xb0, R9
   493  	STXVD2X VHH, (XIP+R10)
   494  	MOVD    $0xc0, R10
   495  	STXVD2X VH2L, (XIP+R8)  // save H^4
   496  	STXVD2X VH2, (XIP+R9)
   497  	STXVD2X VH2H, (XIP+R10)
   498  
   499  	RET
   500  
   501  // func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
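        //
        // For inputs of 64 bytes or more, four blocks are folded into Xi per
        // iteration using the precomputed powers of H; roughly (illustrative only):
        //
        //	Xi = ((Xi xor X[0])·H^4) xor (X[1]·H^3) xor (X[2]·H^2) xor (X[3]·H)
        //
        // Shorter inputs are handled by the two-block and single-block paths below.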
   502  TEXT ·gcmHash(SB), NOSPLIT, $0-64
   503  	MOVD output+0(FP), XIP
   504  	MOVD productTable+24(FP), HTBL
   505  	MOVD inp+32(FP), INP
   506  	MOVD len+56(FP), LEN
   507  
   508  	MOVD   $0x10, R8
   509  	MOVD   $0x20, R9
   510  	MOVD   $0x30, R10
   511  	LXVD2X (XIP)(R0), VXL // load Xi
   512  
   513  	LXVD2X   (HTBL)(R8), VHL    // load pre-computed table
   514  	MOVD     $0x40, R8
   515  	LXVD2X   (HTBL)(R9), VH
   516  	MOVD     $0x50, R9
   517  	LXVD2X   (HTBL)(R10), VHH
   518  	MOVD     $0x60, R10
   519  	LXVD2X   (HTBL)(R0), VXC2
   520  #ifdef GOARCH_ppc64le
   521  	LVSL     (R0)(R0), LEMASK
   522  	VSPLTISB $0x07, T0
   523  	VXOR     LEMASK, T0, LEMASK
   524  	VPERM    XL, XL, LEMASK, XL
   525  #endif
   526  	VXOR     ZERO, ZERO, ZERO
   527  
   528  	CMPU LEN, $64
   529  	BGE  gcm_ghash_p8_4x
   530  
   531  	LXVD2X (INP)(R0), VIN
   532  	ADD    $16, INP, INP
   533  	SUBCCC $16, LEN, LEN
   534  #ifdef GOARCH_ppc64le
   535  	VPERM  IN, IN, LEMASK, IN
   536  #endif
   537  	VXOR   IN, XL, IN
   538  	BEQ    short
   539  
   540  	LXVD2X (HTBL)(R8), VH2L  // load H^2
   541  	MOVD   $16, R8
   542  	LXVD2X (HTBL)(R9), VH2
   543  	ADD    LEN, INP, R9      // end of input
   544  	LXVD2X (HTBL)(R10), VH2H
   545  
   546  loop_2x:
   547  	LXVD2X (INP)(R0), VIN1
   548  #ifdef GOARCH_ppc64le
   549  	VPERM  IN1, IN1, LEMASK, IN1
   550  #endif
   551  
   552  	SUBC    $32, LEN, LEN
   553  	VPMSUMD IN, H2L, XL   // H^2.lo·Xi.lo
   554  	VPMSUMD IN1, HL, XL1  // H.lo·Xi+1.lo
   555  	SUBE    R11, R11, R11 // borrow?-1:0
   556  	VPMSUMD IN, H2, XM    // H^2.hi·Xi.lo+H^2.lo·Xi.hi
   557  	VPMSUMD IN1, H, XM1   // H.hi·Xi+1.lo+H.lo·Xi+1.hi
   558  	AND     LEN, R11, R11
   559  	VPMSUMD IN, H2H, XH   // H^2.hi·Xi.hi
   560  	VPMSUMD IN1, HH, XH1  // H.hi·Xi+1.hi
   561  	ADD     R11, INP, INP
   562  
   563  	VXOR XL, XL1, XL
   564  	VXOR XM, XM1, XM
   565  
   566  	VPMSUMD XL, XC2, T2 // 1st reduction phase
   567  
   568  	VSLDOI $8, XM, ZERO, T0
   569  	VSLDOI $8, ZERO, XM, T1
   570  	VXOR   XH, XH1, XH
   571  	VXOR   XL, T0, XL
   572  	VXOR   XH, T1, XH
   573  
   574  	VSLDOI $8, XL, XL, XL
   575  	VXOR   XL, T2, XL
   576  	LXVD2X (INP)(R8), VIN
   577  	ADD    $32, INP, INP
   578  
   579  	VSLDOI  $8, XL, XL, T1     // 2nd reduction phase
   580  	VPMSUMD XL, XC2, XL
   581  #ifdef GOARCH_ppc64le
   582  	VPERM   IN, IN, LEMASK, IN
   583  #endif
   584  	VXOR    T1, XH, T1
   585  	VXOR    IN, T1, IN
   586  	VXOR    IN, XL, IN
   587  	CMP     R9, INP
   588  	BGT     loop_2x            // done yet?
   589  
   590  	CMPWU LEN, $0
   591  	BNE   even
   592  
   593  short:
   594  	VPMSUMD IN, HL, XL // H.lo·Xi.lo
   595  	VPMSUMD IN, H, XM  // H.hi·Xi.lo+H.lo·Xi.hi
   596  	VPMSUMD IN, HH, XH // H.hi·Xi.hi
   597  
   598  	VPMSUMD XL, XC2, T2 // 1st reduction phase
   599  
   600  	VSLDOI $8, XM, ZERO, T0
   601  	VSLDOI $8, ZERO, XM, T1
   602  	VXOR   XL, T0, XL
   603  	VXOR   XH, T1, XH
   604  
   605  	VSLDOI $8, XL, XL, XL
   606  	VXOR   XL, T2, XL
   607  
   608  	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
   609  	VPMSUMD XL, XC2, XL
   610  	VXOR    T1, XH, T1
   611  
   612  even:
   613  	VXOR    XL, T1, XL
   614  #ifdef GOARCH_ppc64le
   615  	VPERM   XL, XL, LEMASK, XL
   616  #endif
   617  	STXVD2X VXL, (XIP+R0)
   618  
   619  	OR R12, R12, R12 // write out Xi
   620  	RET
   621  
   622  gcm_ghash_p8_4x:
   623  	LVSL     (R8)(R0), T0      // 0x0001..0e0f
   624  	MOVD     $0x70, R8
   625  	LXVD2X   (HTBL)(R9), VH2
   626  	MOVD     $0x80, R9
   627  	VSPLTISB $8, T1            // 0x0808..0808
   628  	MOVD     $0x90, R10
   629  	LXVD2X   (HTBL)(R8), VH3L  // load H^3
   630  	MOVD     $0xa0, R8
   631  	LXVD2X   (HTBL)(R9), VH3
   632  	MOVD     $0xb0, R9
   633  	LXVD2X   (HTBL)(R10), VH3H
   634  	MOVD     $0xc0, R10
   635  	LXVD2X   (HTBL)(R8), VH4L  // load H^4
   636  	MOVD     $0x10, R8
   637  	LXVD2X   (HTBL)(R9), VH4
   638  	MOVD     $0x20, R9
   639  	LXVD2X   (HTBL)(R10), VH4H
   640  	MOVD     $0x30, R10
   641  
   642  	VSLDOI  $8, ZERO, T1, T2   // 0x0000..0808
   643  	VADDUBM T0, T2, HIPERM     // 0x0001..1617
   644  	VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f
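        // HIPERM/LOPERM are permute masks that pair up doublewords from two vectors.
        // They combine H and H^2 into H21L/H21H (and likewise pairs of input blocks)
        // so that a single VPMSUMD accumulates two block products, one by H and one
        // by H^2, at once.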
   645  
   646  	SRD $4, LEN, LEN // this allows the sign bit to be used as carry
   647  
   648  	LXVD2X (INP)(R0), VIN0       // load input
   649  	LXVD2X (INP)(R8), VIN1
   650  	SUBCCC $8, LEN, LEN
   651  	LXVD2X (INP)(R9), VIN2
   652  	LXVD2X (INP)(R10), VIN3
   653  	ADD    $0x40, INP, INP
   654  #ifdef GOARCH_ppc64le
   655  	VPERM  IN0, IN0, LEMASK, IN0
   656  	VPERM  IN1, IN1, LEMASK, IN1
   657  	VPERM  IN2, IN2, LEMASK, IN2
   658  	VPERM  IN3, IN3, LEMASK, IN3
   659  #endif
   660  
   661  	VXOR IN0, XL, XH
   662  
   663  	VPMSUMD IN1, H3L, XL1
   664  	VPMSUMD IN1, H3, XM1
   665  	VPMSUMD IN1, H3H, XH1
   666  
   667  	VPERM   H2, H, HIPERM, H21L
   668  	VPERM   IN2, IN3, LOPERM, T0
   669  	VPERM   H2, H, LOPERM, H21H
   670  	VPERM   IN2, IN3, HIPERM, T1
   671  	VPMSUMD IN2, H2, XM2         // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
   672  	VPMSUMD T0, H21L, XL3        // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
   673  	VPMSUMD IN3, H, XM3          // H.hi·Xi+3.lo  +H.lo·Xi+3.hi
   674  	VPMSUMD T1, H21H, XH3        // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
   675  
   676  	VXOR XM2, XM1, XM2
   677  	VXOR XL3, XL1, XL3
   678  	VXOR XM3, XM2, XM3
   679  	VXOR XH3, XH1, XH3
   680  
   681  	BLT tail_4x
   682  
   683  loop_4x:
   684  	LXVD2X (INP)(R0), VIN0
   685  	LXVD2X (INP)(R8), VIN1
   686  	SUBCCC $4, LEN, LEN
   687  	LXVD2X (INP)(R9), VIN2
   688  	LXVD2X (INP)(R10), VIN3
   689  	ADD    $0x40, INP, INP
   690  #ifdef GOARCH_ppc64le
   691  	VPERM  IN1, IN1, LEMASK, IN1
   692  	VPERM  IN2, IN2, LEMASK, IN2
   693  	VPERM  IN3, IN3, LEMASK, IN3
   694  	VPERM  IN0, IN0, LEMASK, IN0
   695  #endif
   696  
   697  	VPMSUMD XH, H4L, XL   // H^4.lo·Xi.lo
   698  	VPMSUMD XH, H4, XM    // H^4.hi·Xi.lo+H^4.lo·Xi.hi
   699  	VPMSUMD XH, H4H, XH   // H^4.hi·Xi.hi
   700  	VPMSUMD IN1, H3L, XL1
   701  	VPMSUMD IN1, H3, XM1
   702  	VPMSUMD IN1, H3H, XH1
   703  
   704  	VXOR  XL, XL3, XL
   705  	VXOR  XM, XM3, XM
   706  	VXOR  XH, XH3, XH
   707  	VPERM IN2, IN3, LOPERM, T0
   708  	VPERM IN2, IN3, HIPERM, T1
   709  
   710  	VPMSUMD XL, XC2, T2   // 1st reduction phase
   711  	VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
   712  	VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi
   713  
   714  	VSLDOI $8, XM, ZERO, T0
   715  	VSLDOI $8, ZERO, XM, T1
   716  	VXOR   XL, T0, XL
   717  	VXOR   XH, T1, XH
   718  
   719  	VSLDOI $8, XL, XL, XL
   720  	VXOR   XL, T2, XL
   721  
   722  	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
   723  	VPMSUMD IN2, H2, XM2   // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
   724  	VPMSUMD IN3, H, XM3    // H.hi·Xi+3.lo  +H.lo·Xi+3.hi
   725  	VPMSUMD XL, XC2, XL
   726  
   727  	VXOR XL3, XL1, XL3
   728  	VXOR XH3, XH1, XH3
   729  	VXOR XH, IN0, XH
   730  	VXOR XM2, XM1, XM2
   731  	VXOR XH, T1, XH
   732  	VXOR XM3, XM2, XM3
   733  	VXOR XH, XL, XH
   734  	BGE  loop_4x
   735  
   736  tail_4x:
   737  	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
   738  	VPMSUMD XH, H4, XM  // H^4.hi·Xi.lo+H^4.lo·Xi.hi
   739  	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
   740  
   741  	VXOR XL, XL3, XL
   742  	VXOR XM, XM3, XM
   743  
   744  	VPMSUMD XL, XC2, T2 // 1st reduction phase
   745  
   746  	VSLDOI $8, XM, ZERO, T0
   747  	VSLDOI $8, ZERO, XM, T1
   748  	VXOR   XH, XH3, XH
   749  	VXOR   XL, T0, XL
   750  	VXOR   XH, T1, XH
   751  
   752  	VSLDOI $8, XL, XL, XL
   753  	VXOR   XL, T2, XL
   754  
   755  	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
   756  	VPMSUMD XL, XC2, XL
   757  	VXOR    T1, XH, T1
   758  	VXOR    XL, T1, XL
   759  
   760  	ADDCCC $4, LEN, LEN
   761  	BEQ    done_4x
   762  
   763  	LXVD2X (INP)(R0), VIN0
   764  	CMPU   LEN, $2
   765  	MOVD   $-4, LEN
   766  	BLT    one
   767  	LXVD2X (INP)(R8), VIN1
   768  	BEQ    two
   769  
   770  three:
   771  	LXVD2X (INP)(R9), VIN2
   772  #ifdef GOARCH_ppc64le
   773  	VPERM  IN0, IN0, LEMASK, IN0
   774  	VPERM  IN1, IN1, LEMASK, IN1
   775  	VPERM  IN2, IN2, LEMASK, IN2
   776  #endif
   777  
   778  	VXOR IN0, XL, XH
   779  	VOR  H3L, H3L, H4L
   780  	VOR  H3, H3, H4
   781  	VOR  H3H, H3H, H4H
   782  
   783  	VPERM   IN1, IN2, LOPERM, T0
   784  	VPERM   IN1, IN2, HIPERM, T1
   785  	VPMSUMD IN1, H2, XM2         // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
   786  	VPMSUMD IN2, H, XM3          // H.hi·Xi+2.lo  +H.lo·Xi+2.hi
   787  	VPMSUMD T0, H21L, XL3        // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
   788  	VPMSUMD T1, H21H, XH3        // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
   789  
   790  	VXOR XM3, XM2, XM3
   791  	JMP  tail_4x
   792  
   793  two:
   794  #ifdef GOARCH_ppc64le
   795  	VPERM IN0, IN0, LEMASK, IN0
   796  	VPERM IN1, IN1, LEMASK, IN1
   797  #endif
   798  
   799  	VXOR  IN, XL, XH
   800  	VPERM ZERO, IN1, LOPERM, T0
   801  	VPERM ZERO, IN1, HIPERM, T1
   802  
   803  	VSLDOI $8, ZERO, H2, H4L
   804  	VOR    H2, H2, H4
   805  	VSLDOI $8, H2, ZERO, H4H
   806  
   807  	VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo
   808  	VPMSUMD IN1, H, XM3   // H.hi·Xi+1.lo+H.lo·Xi+2.hi
   809  	VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi
   810  
   811  	JMP tail_4x
   812  
   813  one:
   814  #ifdef GOARCH_ppc64le
   815  	VPERM IN0, IN0, LEMASK, IN0
   816  #endif
   817  
   818  	VSLDOI $8, ZERO, H, H4L
   819  	VOR    H, H, H4
   820  	VSLDOI $8, H, ZERO, H4H
   821  
   822  	VXOR IN0, XL, XH
   823  	VXOR XL3, XL3, XL3
   824  	VXOR XM3, XM3, XM3
   825  	VXOR XH3, XH3, XH3
   826  
   827  	JMP tail_4x
   828  
   829  done_4x:
   830  #ifdef GOARCH_ppc64le
   831  	VPERM   XL, XL, LEMASK, XL
   832  #endif
   833  	STXVD2X VXL, (XIP+R0)      // write out Xi
   834  	RET
   835  
   836  // func gcmMul(output []byte, productTable *[256]byte)
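        //
        // gcmMul performs a single GHASH multiplication: Xi is replaced with Xi·H,
        // using the precomputed table written by gcmInit.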
   837  TEXT ·gcmMul(SB), NOSPLIT, $0-32
   838  	MOVD output+0(FP), XIP
   839  	MOVD productTable+24(FP), HTBL
   840  
   841  	MOVD   $0x10, R8
   842  	MOVD   $0x20, R9
   843  	MOVD   $0x30, R10
   844  	LXVD2X (XIP)(R0), VIN // load Xi
   845  
   846  	LXVD2X   (HTBL)(R8), VHL    // Load pre-computed table
   847  	LXVD2X   (HTBL)(R9), VH
   848  	LXVD2X   (HTBL)(R10), VHH
   849  	LXVD2X   (HTBL)(R0), VXC2
   850  #ifdef GOARCH_ppc64le
   851  	VSPLTISB $0x07, T0
   852  	VXOR     LEMASK, T0, LEMASK
   853  	VPERM    IN, IN, LEMASK, IN
   854  #endif
   855  	VXOR     ZERO, ZERO, ZERO
   856  
   857  	VPMSUMD IN, HL, XL // H.lo·Xi.lo
   858  	VPMSUMD IN, H, XM  // H.hi·Xi.lo+H.lo·Xi.hi
   859  	VPMSUMD IN, HH, XH // H.hi·Xi.hi
   860  
   861  	VPMSUMD XL, XC2, T2 // 1st reduction phase
   862  
   863  	VSLDOI $8, XM, ZERO, T0
   864  	VSLDOI $8, ZERO, XM, T1
   865  	VXOR   XL, T0, XL
   866  	VXOR   XH, T1, XH
   867  
   868  	VSLDOI $8, XL, XL, XL
   869  	VXOR   XL, T2, XL
   870  
   871  	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
   872  	VPMSUMD XL, XC2, XL
   873  	VXOR    T1, XH, T1
   874  	VXOR    XL, T1, XL
   875  
   876  #ifdef GOARCH_ppc64le
   877  	VPERM   XL, XL, LEMASK, XL
   878  #endif
   879  	STXVD2X VXL, (XIP+R0)      // write out Xi
   880  	RET
   881  
   882  #define BLK_INP    R3
   883  #define BLK_OUT    R4
   884  #define BLK_KEY    R5
   885  #define KEY_LEN    R6
   886  #define BLK_IDX    R7
   887  #define IDX        R8
   888  #define IN_LEN     R9
   889  #define COUNTER    R10
   890  #define CONPTR     R14
   891  #define MASK       V5
   892  
   893  // Implementation of the counterCrypt function in assembler.
   894  // The original loop is unrolled to allow multiple encryption
   895  // streams to be processed in parallel, which is achieved by
   896  // interleaving the vcipher instructions from each stream. This is
   897  // also referred to as stitching, and provides a significant
   898  // performance improvement. Some macros are defined which enable
   899  // execution on big- or little-endian as well as different ISA targets.
   900  //func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte, key[gcmBlockSize]uint32)
   901  //func counterCryptASM(xr, out, in, counter, key)
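        //
        // The overall behaviour corresponds roughly to this Go-style sketch
        // (illustrative only; the helper names are not real functions):
        //
        //	for len(in) > 0 {
        //		ks := aesEncryptBlock(roundKeys, counter) // hypothetical helper
        //		n := min(len(in), 16)
        //		for i := 0; i < n; i++ {
        //			out[i] = in[i] ^ ks[i]
        //		}
        //		inc32(counter) // bump the low 32 bits of the big-endian counter
        //		in, out = in[n:], out[n:]
        //	}
        //
        // with the loop unrolled so that 8, 4 or 1 block(s) are processed per
        // iteration, and a trailing partial block handled separately.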
   902  TEXT ·counterCryptASM(SB), NOSPLIT, $16-72
   903  	MOVD	xr(FP), KEY_LEN
   904  	MOVD    out+8(FP), BLK_OUT
   905  	MOVD    out_len+16(FP), R8
   906  	MOVD    in+32(FP), BLK_INP
   907  	MOVD    in_len+40(FP), IN_LEN
   908  	MOVD    counter+56(FP), COUNTER
   909  	MOVD    key+64(FP), BLK_KEY
   910  
   911  // Set up permute string when needed.
   912  #ifdef NEEDS_ESPERM
   913  	MOVD    $·rcon(SB), R14
   914  	LVX     (R14), ESPERM   // Permute value for P8_ macros.
   915  #endif
   916  	SETUP_COUNTER		// V30 Counter V31 BE {0, 0, 0, 1}
   917  	LOAD_KEYS(BLK_KEY, KEY_LEN)	// VS1 - VS10/12/14 based on keysize
   918  	CMP     IN_LEN, $128
   919  	BLT	block64
   920  block128_loop:
   921  	// Do 8 encryptions in parallel by setting
   922  	// input values in V15-V22 and executing
   923  	// vcipher on the updated value and the keys.
   924  	GEN_VCIPHER_8_INPUTS
   925  	VCIPHER_8X1_KEY(VS1)
   926  	VCIPHER_8X1_KEY(VS2)
   927  	VCIPHER_8X1_KEY(VS3)
   928  	VCIPHER_8X1_KEY(VS4)
   929  	VCIPHER_8X1_KEY(VS5)
   930  	VCIPHER_8X1_KEY(VS6)
   931  	VCIPHER_8X1_KEY(VS7)
   932  	VCIPHER_8X1_KEY(VS8)
   933  	VCIPHER_8X1_KEY(VS9)
   934  	// Additional encryptions are done based on
   935  	// the key length, with the last key moved
   936  	// to V23 for use with VCIPHERLAST.
   937  	// CR2 = CMP key_len, $12
   938  	XXLOR VS10, VS10, V23
   939  	BLT	CR2, block128_last // key_len = 10
   940  	VCIPHER_8X1_KEY(VS10)
   941  	VCIPHER_8X1_KEY(VS11)
   942  	XXLOR VS12,VS12,V23
   943  	BEQ	CR2, block128_last // key_len = 12
   944  	VCIPHER_8X1_KEY(VS12)
   945  	VCIPHER_8X1_KEY(VS13)
   946  	XXLOR VS14,VS14,V23	// key_len = 14
   947  block128_last:
   948  	// vcipher encryptions are in V15-V22 at this
   949  	// point with vcipherlast remaining to be done.
   950  	// Load input block into V1-V8, setting index offsets
   951  	// in R16-R22 to use with the STORE.
   952  	LOAD_INPUT_BLOCK128(BLK_INP)
   953  	// Do VCIPHERLAST on the last key for each encryption
   954  	// stream and XOR the result with the corresponding
   955  	// value from the input block.
   956  	VCIPHERLAST8_XOR_INPUT
   957  	// Store the results (8*16) and update BLK_OUT by 128.
   958  	STORE_OUTPUT_BLOCK128(BLK_OUT)
   959  	ADD	$-128, IN_LEN	// input size
   960  	CMP     IN_LEN, $128	// check if >= 8 blocks (128 bytes)
   961  	BGE	block128_loop	// next input block
   962  	CMP	IN_LEN, $0
   963  	BEQ	done
   964  block64:
   965  	CMP	IN_LEN, $64	// Check if >= 64
   966  	BLT	block16_loop
   967  	// Do 4 encryptions in parallel by setting
   968  	// input values in V15-V18 and executing
   969  	// vcipher on the updated value and the keys.
   970  	GEN_VCIPHER_4_INPUTS
   971  	VCIPHER_4X1_KEY(VS1)
   972  	VCIPHER_4X1_KEY(VS2)
   973  	VCIPHER_4X1_KEY(VS3)
   974  	VCIPHER_4X1_KEY(VS4)
   975  	VCIPHER_4X1_KEY(VS5)
   976  	VCIPHER_4X1_KEY(VS6)
   977  	VCIPHER_4X1_KEY(VS7)
   978  	VCIPHER_4X1_KEY(VS8)
   979  	VCIPHER_4X1_KEY(VS9)
   980  	// Check key length based on CR2
   981  	// Move last key to V23 for use with later vcipherlast
   982  	XXLOR	VS10, VS10, V23
   983  	BLT	CR2, block64_last	// size = 10
   984  	VCIPHER_4X1_KEY(VS10)		// Encrypt next 2 keys
   985  	VCIPHER_4X1_KEY(VS11)
   986  	XXLOR	VS12, VS12, V23
   987  	BEQ	CR2, block64_last	// size = 12
   988  	VCIPHER_4X1_KEY(VS12)		// Encrypt last 2 keys
   989  	VCIPHER_4X1_KEY(VS13)
   990  	XXLOR	VS14, VS14, V23		// size = 14
   991  block64_last:
   992  	LOAD_INPUT_BLOCK64(BLK_INP)	// Load 64 bytes of input
   993  	// Do VCIPHERLAST on the last key for each encryption
   994  	// stream and XOR the result with the corresponding
   995  	// value from the input block.
   996  	VCIPHERLAST4_XOR_INPUT
   997  	// Store the results (4*16) and update BLK_OUT by 64.
   998  	STORE_OUTPUT_BLOCK64(BLK_OUT)
   999  	ADD	$-64, IN_LEN		// decrement input block length
  1000  	CMP	IN_LEN, $0		// check for remaining length
  1001  	BEQ	done
  1002  block16_loop:
  1003  	CMP	IN_LEN, $16		// More input
  1004  	BLT	final_block		// If not, then handle partial block
  1005  	// Single encryption, no stitching
  1006  	GEN_VCIPHER_INPUT		// Generate input value for single encryption
  1007  	VCIPHER_1X9_KEYS(V15)		// Encrypt V15 value with 9 keys
  1008  	XXLOR	VS10, VS10, V23		// Last key -> V23 for later vcipherlast
  1009  	// Key length based on CR2. (LT=10, EQ=12, GT=14)
  1010  	BLT	CR2, block16_last	// Finish for key size 10
  1011  	VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with 2 more keys
  1012  	XXLOR	VS12, VS12, V23		// Last key -> V23 for later vcipherlast
  1013  	BEQ	CR2, block16_last	// Finish for key size 12
  1014  	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
  1015  	XXLOR	VS14, VS14, V23		// Last key -> V23 for vcipherlast with key size 14
  1016  block16_last:
  1017  	P8_LXVB16X(BLK_INP, R0, V1)	// Load input
  1018  	VCIPHERLAST V15, V23, V15	// Final encryption round using last key in V23
  1019  	XXLXOR	V15, V1, V1		// XOR with input
  1020  	P8_STXVB16X(V1,R0,BLK_OUT)	// Store final encryption value to output
  1021  	ADD	$16, BLK_INP		// Increment input pointer
  1022  	ADD	$16, BLK_OUT		// Increment output pointer
  1023  	ADD	$-16, IN_LEN		// Decrement input length
  1024  	BR	block16_loop		// Check for next
  1025  final_block:
  1026  	CMP	IN_LEN, $0
  1027  	BEQ	done
  1028  	GEN_VCIPHER_INPUT		// Generate input value for partial encryption
  1029  	VCIPHER_1X9_KEYS(V15)		// Encrypt V15 with 9 keys
  1030  	XXLOR	VS10, VS10, V23		// Save possible last key
  1031  	BLT	CR2, final_block_last
  1032  	VCIPHER_1X2_KEYS(V15, VS10, VS11)	// Encrypt V15 with next 2 keys
  1033  	XXLOR	VS12, VS12, V23		// Save possible last key
  1034  	BEQ	CR2, final_block_last
  1035  	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
  1036  	XXLOR	VS14, VS14, V23		// Save last key
  1037  final_block_last:
  1038  	VCIPHERLAST V15, V23, V15	// Finish encryption
  1039  #ifdef GOPPC64_power10
  1040  	// Set up the remaining length in the high-order byte of R17, as lxvll/stxvll expect
  1041  	SLD	$56, IN_LEN, R17
  1042  	LXVLL	BLK_INP, R17, V25
  1043  	VXOR	V25, V15, V25
  1044  	STXVLL	V25, BLK_OUT, R17
  1045  #else
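        // Without the power10 length-controlled loads/stores, the keystream block is
        // written to a 16-byte scratch area at 32(R1) and XORed with the remaining
        // input in 8/4/2/1-byte pieces.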
  1046  	ADD	$32, R1, MASK_PTR
  1047  	MOVD	$0, R16
  1048  	P8_STXVB16X(V15, MASK_PTR, R0)
  1049  	CMP	IN_LEN, $8
  1050  	BLT	next4
  1051  	MOVD	0(MASK_PTR), R14
  1052  	MOVD	0(BLK_INP), R15
  1053  	XOR	R14, R15, R14
  1054  	MOVD	R14, 0(BLK_OUT)
  1055  	ADD	$8, R16
  1056  	ADD	$-8, IN_LEN
  1057  next4:
  1058  	CMP	IN_LEN, $4
  1059  	BLT	next2
  1060  	MOVWZ	(BLK_INP)(R16), R15
  1061  	MOVWZ	(MASK_PTR)(R16), R14
  1062  	XOR	R14, R15, R14
  1063  	MOVW	R14, (R16)(BLK_OUT)
  1064  	ADD	$4, R16
  1065  	ADD	$-4, IN_LEN
  1066  next2:
  1067  	CMP	IN_LEN, $2
  1068  	BLT	next1
  1069  	MOVHZ	(BLK_INP)(R16), R15
  1070  	MOVHZ	(MASK_PTR)(R16), R14
  1071  	XOR	R14, R15, R14
  1072  	MOVH	R14, (R16)(BLK_OUT)
  1073  	ADD	$2, R16
  1074  	ADD	$-2, IN_LEN
  1075  next1:
  1076  	CMP	IN_LEN, $1
  1077  	BLT	done
  1078  	MOVBZ	(MASK_PTR)(R16), R14
  1079  	MOVBZ	(BLK_INP)(R16), R15
  1080  	XOR	R14, R15, R14
  1081  	MOVB	R14, (R16)(BLK_OUT)
  1082  #endif
  1083  done:
  1084  	// Save the updated counter value
  1085  	P8_STXVB16X(V30, COUNTER, R0)
  1086  	// Clear the keys
  1087  	XXLXOR	VS0, VS0, VS0
  1088  	XXLXOR	VS1, VS1, VS1
  1089  	XXLXOR	VS2, VS2, VS2
  1090  	XXLXOR	VS3, VS3, VS3
  1091  	XXLXOR	VS4, VS4, VS4
  1092  	XXLXOR	VS5, VS5, VS5
  1093  	XXLXOR	VS6, VS6, VS6
  1094  	XXLXOR	VS7, VS7, VS7
  1095  	XXLXOR	VS8, VS8, VS8
  1096  	XXLXOR	VS9, VS9, VS9
  1097  	XXLXOR	VS10, VS10, VS10
  1098  	XXLXOR	VS11, VS11, VS11
  1099  	XXLXOR	VS12, VS12, VS12
  1100  	XXLXOR	VS13, VS13, VS13
  1101  	XXLXOR	VS14, VS14, VS14
  1102  	RET
  1103  
  1104  
