// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Code for the perl script that generates the ppc64 assembler
// can be found in the cryptogams repository at the link below. It is based on
// the original from openssl.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences between this and the original implementation are
// due to the calling conventions and initialization of constants.
    21  
    22  //go:build gc && !purego
    23  
    24  #include "textflag.h"
    25  
    26  #define OUT  R3
    27  #define INP  R4
    28  #define LEN  R5
    29  #define KEY  R6
    30  #define CNT  R7
    31  #define TMP  R15
    32  
    33  #define CONSTBASE  R16
    34  #define BLOCKS R17
    35  
    36  // for VPERMXOR
    37  #define MASK  R18
    38  
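// The consts table below holds, in order: the ChaCha sigma constant
// "expand 32-byte k" (0x00), the doubleword increments 1 and 4 (0x10,
// 0x20), two byte-permutation masks (0x30, 0x40), the four sigma words
// each splatted across a full vector (0x50-0x88), the per-lane block
// counter offsets {0, 1, 2, 3} (0x90), and the two VPERMXOR masks used
// for the 16- and 8-bit rotates (0xa0, 0xb0).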
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
DATA consts<>+0xa0(SB)/8, $0x5566774411223300
DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
DATA consts<>+0xb0(SB)/8, $0x6677445522330011
DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
GLOBL consts<>(SB), RODATA, $0xc0

//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
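// A minimal sketch of how the Go side declares and calls this routine
// (the wrapper lives in this package's chacha_ppc64le.go; the names
// below follow that file but are shown here only for orientation):
//
//	//go:noescape
//	func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
//
//	// Processes len(src) bytes of keystream XOR and advances the
//	// 32-bit block counter in place:
//	// chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), &c.key, &c.counter)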
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD $6, LEN, BLOCKS
	// for VPERMXOR
	MOVD $consts<>+0xa0(SB), MASK
	MOVD $16, R20
	// V16: the sigma constant "expand 32-byte k"
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80, CONSTBASE

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28 = {0, 1, 2, 3}, the per-lane block counter offsets
	LXVW4X (CONSTBASE)(R11), VS60

	// Load mask constants for VPERMXOR
	LXVW4X (MASK)(R0), V20
	LXVW4X (MASK)(R20), V21

	// Splat the 32-bit counter (word 0 of V19) into V26
	VSPLTW $0, V19, V26

	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	VADDUWM V26, V28, V26

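	// Ten iterations of the doubled round in loop_vsx give the full
	// twenty ChaCha rounds per block.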
	MOVD $10, R14
	MOVD R14, CTR
	PCALIGN $16
loop_outer_vsx:
	// V0, V1, V2, V3: the four sigma words, splatted
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// Copy the per-lane block counters V26 into V12
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// splat const values
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30
	PCALIGN $16
loop_vsx:
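	// Each iteration below is one ChaCha double round: a column round
	// followed by a diagonal round, applied to four blocks in parallel
	// (one 32-bit state word per vector lane).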
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

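	// d ^= a; d <<<= 16, via VPERMXOR with the mask in V21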
	VPERMXOR V12, V0, V21, V12
	VPERMXOR V13, V1, V21, V13
	VPERMXOR V14, V2, V21, V14
	VPERMXOR V15, V3, V21, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

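	// b <<<= 12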
	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

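	// d ^= a; d <<<= 8, via VPERMXOR with the mask in V20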
	VPERMXOR V12, V0, V20, V12
	VPERMXOR V13, V1, V20, V13
	VPERMXOR V14, V2, V20, V14
	VPERMXOR V15, V3, V20, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

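	// b <<<= 7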
	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

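	// Diagonal round: the same quarter-round steps, with the b, c and
	// d rows rotated by one position.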
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V21, V15
	VPERMXOR V12, V1, V21, V12
	VPERMXOR V13, V2, V21, V13
	VPERMXOR V14, V3, V21, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V20, V15
	VPERMXOR V12, V1, V20, V12
	VPERMXOR V13, V2, V20, V13
	VPERMXOR V14, V3, V20, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	BDNZ   loop_vsx

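	// Add the per-lane block counters back into the counter row.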
	VADDUWM V12, V26, V12

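	// Transpose the state: interleave the per-word vectors so that
	// each vector register holds 16 contiguous bytes of one block.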
	VMRGEW V0, V1, V27
	VMRGEW V2, V3, V28

	VMRGOW V0, V1, V0
	VMRGOW V2, V3, V2

	VMRGEW V4, V5, V29
	VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	VMRGOW V4, V5, V4
	VMRGOW V6, V7, V6

	VMRGEW V8, V9, V27
	VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	VMRGOW V8, V9, V8
	VMRGOW V10, V11, V10

	VMRGEW V12, V13, V29
	VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	VMRGOW V12, V13, V12
	VMRGOW V14, V15, V14

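	// Advance the four block counters by 4 for the next iteration.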
	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

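	// First block: add the input state (sigma row, key rows,
	// counter/nonce row) to the keystream words.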
	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	// Bottom of the outer loop: XOR 64 bytes of input with the
	// keystream and store.
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT  tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT

	MOVD $10, R14
	MOVD R14, CTR
	BNE  loop_outer_vsx

done_vsx:
	// Increment counter by the number of 64-byte blocks processed
	MOVD (CNT), R14
	ADD  BLOCKS, R14
	MOVD R14, (CNT)
	RET

tail_vsx:
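	// Partial final block (LEN < 64): spill the keystream to the
	// stack, then XOR it with the input one byte at a time.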
	ADD  $32, R1, R11
	MOVD LEN, CTR

	// Save the keystream on the stack so it can be copied bytewise
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT
	PCALIGN $16
looptail_vsx:
	// XOR each keystream byte with the input byte and store it to OUT
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR    KEY, TMP, KEY
	MOVBU  KEY, 1(OUT)
	BDNZ   looptail_vsx

	// Clear the keystream copy from the stack by overwriting it
	// (VS48 holds only the public sigma constants)
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR      done_vsx