Text file src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Based on CRYPTOGAMS code with the following comment:
     6  // # ====================================================================
     7  // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
     8  // # project. The module is, however, dual licensed under OpenSSL and
     9  // # CRYPTOGAMS licenses depending on where you obtain it. For further
    10  // # details see http://www.openssl.org/~appro/cryptogams/.
    11  // # ====================================================================
    12  
    13  // Code for the perl script that generates the ppc64 assembler
    14  // can be found in the cryptogams repository at the link below. It is based on
    15  // the original from openssl.
    16  
    17  // https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
    18  
    19  // The differences in this and the original implementation are
    20  // due to the calling conventions and initialization of constants.
    21  
    22  //go:build gc && !purego && (ppc64 || ppc64le)
    23  
    24  #include "textflag.h"
    25  
         // Register aliases for the five arguments of chaCha20_ctr32_vsx
         // (out, inp, len, key, counter) plus one scratch register.
     26  #define OUT  R3
     27  #define INP  R4
     28  #define LEN  R5
     29  #define KEY  R6
     30  #define CNT  R7
     31  #define TMP  R15
     32  
         // CONSTBASE points into the consts<> table below; BLOCKS holds
         // len/64, the number of whole 64-byte blocks (used to bump *counter).
     33  #define CONSTBASE  R16
     34  #define BLOCKS R17
     35  
     36  // for VPERMXOR
     37  #define MASK  R18
    38  
     39  DATA consts<>+0x00(SB)/4, $0x61707865 // ChaCha sigma "expand 32-byte k": "expa"
     40  DATA consts<>+0x04(SB)/4, $0x3320646e // "nd 3"
     41  DATA consts<>+0x08(SB)/4, $0x79622d32 // "2-by"
     42  DATA consts<>+0x0c(SB)/4, $0x6b206574 // "te k"
     43  DATA consts<>+0x10(SB)/4, $0x00000001 // {1,0,0,0} — not loaded by the VSX routine below; kept from the original layout
     44  DATA consts<>+0x14(SB)/4, $0x00000000
     45  DATA consts<>+0x18(SB)/4, $0x00000000
     46  DATA consts<>+0x1c(SB)/4, $0x00000000
     47  DATA consts<>+0x20(SB)/4, $0x00000004 // {4,0,0,0} — not loaded by the VSX routine below
     48  DATA consts<>+0x24(SB)/4, $0x00000000
     49  DATA consts<>+0x28(SB)/4, $0x00000000
     50  DATA consts<>+0x2c(SB)/4, $0x00000000
     51  DATA consts<>+0x30(SB)/4, $0x0e0f0c0d // byte-permute tables (presumably vperm rotate masks) — not loaded by this routine
     52  DATA consts<>+0x34(SB)/4, $0x0a0b0809
     53  DATA consts<>+0x38(SB)/4, $0x06070405
     54  DATA consts<>+0x3c(SB)/4, $0x02030001
     55  DATA consts<>+0x40(SB)/4, $0x0d0e0f0c
     56  DATA consts<>+0x44(SB)/4, $0x090a0b08
     57  DATA consts<>+0x48(SB)/4, $0x05060704
     58  DATA consts<>+0x4c(SB)/4, $0x01020300
     59  DATA consts<>+0x50(SB)/4, $0x61707865 // sigma words splatted across all four lanes;
     60  DATA consts<>+0x54(SB)/4, $0x61707865 // loaded into V0..V3 at loop_outer_vsx
     61  DATA consts<>+0x58(SB)/4, $0x61707865
     62  DATA consts<>+0x5c(SB)/4, $0x61707865
     63  DATA consts<>+0x60(SB)/4, $0x3320646e
     64  DATA consts<>+0x64(SB)/4, $0x3320646e
     65  DATA consts<>+0x68(SB)/4, $0x3320646e
     66  DATA consts<>+0x6c(SB)/4, $0x3320646e
     67  DATA consts<>+0x70(SB)/4, $0x79622d32
     68  DATA consts<>+0x74(SB)/4, $0x79622d32
     69  DATA consts<>+0x78(SB)/4, $0x79622d32
     70  DATA consts<>+0x7c(SB)/4, $0x79622d32
     71  DATA consts<>+0x80(SB)/4, $0x6b206574
     72  DATA consts<>+0x84(SB)/4, $0x6b206574
     73  DATA consts<>+0x88(SB)/4, $0x6b206574
     74  DATA consts<>+0x8c(SB)/4, $0x6b206574
     75  DATA consts<>+0x90(SB)/4, $0x00000000 // per-lane counter offsets {0,1,2,3}: the four
     76  DATA consts<>+0x94(SB)/4, $0x00000001 // blocks generated per outer iteration
     77  DATA consts<>+0x98(SB)/4, $0x00000002
     78  DATA consts<>+0x9c(SB)/4, $0x00000003
     79  DATA consts<>+0xa0(SB)/4, $0x11223300 // VPERMXOR mask -> V20 (used in the rotate-by-8 step)
     80  DATA consts<>+0xa4(SB)/4, $0x55667744
     81  DATA consts<>+0xa8(SB)/4, $0x99aabb88
     82  DATA consts<>+0xac(SB)/4, $0xddeeffcc
     83  DATA consts<>+0xb0(SB)/4, $0x22330011 // VPERMXOR mask -> V21 (used in the rotate-by-16 step)
     84  DATA consts<>+0xb4(SB)/4, $0x66774455
     85  DATA consts<>+0xb8(SB)/4, $0xaabb8899
     86  DATA consts<>+0xbc(SB)/4, $0xeeffccdd
     87  GLOBL consts<>(SB), RODATA, $0xc0
    88  
     89  #ifdef GOARCH_ppc64
         // Big-endian ppc64: build a byte-reverse-within-word permute mask in V24.
         // LVSL with a zero effective address yields {0,1,...,15}; xoring each
         // byte index with 3 (VSPLTISB $3 into V25) swaps 0<->3 and 1<->2 within
         // every 4-byte group. V24/V25 must not be clobbered while in use.
     90  #define BE_XXBRW_INIT() \
     91  		LVSL (R0)(R0), V24 \
     92  		VSPLTISB $3, V25   \
     93  		VXOR V24, V25, V24 \
     94  
         // Byte-reverse each 32-bit word of vr using the V24 mask built above.
     95  #define BE_XXBRW(vr) VPERM vr, vr, V24, vr
     96  #else
         // Little-endian: vector loads/stores already give the required byte
         // order, so both macros expand to nothing.
     97  #define BE_XXBRW_INIT()
     98  #define BE_XXBRW(vr)
     99  #endif
   100  
   101  //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
   102  TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
   103  	MOVD out+0(FP), OUT
   104  	MOVD inp+8(FP), INP
   105  	MOVD len+16(FP), LEN
   106  	MOVD key+24(FP), KEY
   107  	MOVD counter+32(FP), CNT
   108  
   109  	// Addressing for constants
   110  	MOVD $consts<>+0x00(SB), CONSTBASE
   111  	MOVD $16, R8
   112  	MOVD $32, R9
   113  	MOVD $48, R10
   114  	MOVD $64, R11
   115  	SRD $6, LEN, BLOCKS
   116  	// for VPERMXOR
   117  	MOVD $consts<>+0xa0(SB), MASK
   118  	MOVD $16, R20
   119  	// V16
   120  	LXVW4X (CONSTBASE)(R0), VS48
   121  	ADD $80,CONSTBASE
   122  
   123  	// Load key into V17,V18
   124  	LXVW4X (KEY)(R0), VS49
   125  	LXVW4X (KEY)(R8), VS50
   126  
   127  	// Load CNT, NONCE into V19
   128  	LXVW4X (CNT)(R0), VS51
   129  
   130  	// Clear V27
   131  	VXOR V27, V27, V27
   132  
   133  	BE_XXBRW_INIT()
   134  
   135  	// V28
   136  	LXVW4X (CONSTBASE)(R11), VS60
   137  
   138  	// Load mask constants for VPERMXOR
   139  	LXVW4X (MASK)(R0), V20
   140  	LXVW4X (MASK)(R20), V21
   141  
   142  	// splat slot from V19 -> V26
   143  	VSPLTW $0, V19, V26
   144  
   145  	VSLDOI $4, V19, V27, V19
   146  	VSLDOI $12, V27, V19, V19
   147  
   148  	VADDUWM V26, V28, V26
   149  
   150  	MOVD $10, R14
   151  	MOVD R14, CTR
   152  	PCALIGN $16
   153  loop_outer_vsx:
   154  	// V0, V1, V2, V3
   155  	LXVW4X (R0)(CONSTBASE), VS32
   156  	LXVW4X (R8)(CONSTBASE), VS33
   157  	LXVW4X (R9)(CONSTBASE), VS34
   158  	LXVW4X (R10)(CONSTBASE), VS35
   159  
   160  	// splat values from V17, V18 into V4-V11
   161  	VSPLTW $0, V17, V4
   162  	VSPLTW $1, V17, V5
   163  	VSPLTW $2, V17, V6
   164  	VSPLTW $3, V17, V7
   165  	VSPLTW $0, V18, V8
   166  	VSPLTW $1, V18, V9
   167  	VSPLTW $2, V18, V10
   168  	VSPLTW $3, V18, V11
   169  
   170  	// VOR
   171  	VOR V26, V26, V12
   172  
   173  	// splat values from V19 -> V13, V14, V15
   174  	VSPLTW $1, V19, V13
   175  	VSPLTW $2, V19, V14
   176  	VSPLTW $3, V19, V15
   177  
   178  	// splat   const values
   179  	VSPLTISW $-16, V27
   180  	VSPLTISW $12, V28
   181  	VSPLTISW $8, V29
   182  	VSPLTISW $7, V30
   183  	PCALIGN $16
   184  loop_vsx:
   185  	VADDUWM V0, V4, V0
   186  	VADDUWM V1, V5, V1
   187  	VADDUWM V2, V6, V2
   188  	VADDUWM V3, V7, V3
   189  
   190  	VPERMXOR V12, V0, V21, V12
   191  	VPERMXOR V13, V1, V21, V13
   192  	VPERMXOR V14, V2, V21, V14
   193  	VPERMXOR V15, V3, V21, V15
   194  
   195  	VADDUWM V8, V12, V8
   196  	VADDUWM V9, V13, V9
   197  	VADDUWM V10, V14, V10
   198  	VADDUWM V11, V15, V11
   199  
   200  	VXOR V4, V8, V4
   201  	VXOR V5, V9, V5
   202  	VXOR V6, V10, V6
   203  	VXOR V7, V11, V7
   204  
   205  	VRLW V4, V28, V4
   206  	VRLW V5, V28, V5
   207  	VRLW V6, V28, V6
   208  	VRLW V7, V28, V7
   209  
   210  	VADDUWM V0, V4, V0
   211  	VADDUWM V1, V5, V1
   212  	VADDUWM V2, V6, V2
   213  	VADDUWM V3, V7, V3
   214  
   215  	VPERMXOR V12, V0, V20, V12
   216  	VPERMXOR V13, V1, V20, V13
   217  	VPERMXOR V14, V2, V20, V14
   218  	VPERMXOR V15, V3, V20, V15
   219  
   220  	VADDUWM V8, V12, V8
   221  	VADDUWM V9, V13, V9
   222  	VADDUWM V10, V14, V10
   223  	VADDUWM V11, V15, V11
   224  
   225  	VXOR V4, V8, V4
   226  	VXOR V5, V9, V5
   227  	VXOR V6, V10, V6
   228  	VXOR V7, V11, V7
   229  
   230  	VRLW V4, V30, V4
   231  	VRLW V5, V30, V5
   232  	VRLW V6, V30, V6
   233  	VRLW V7, V30, V7
   234  
   235  	VADDUWM V0, V5, V0
   236  	VADDUWM V1, V6, V1
   237  	VADDUWM V2, V7, V2
   238  	VADDUWM V3, V4, V3
   239  
   240  	VPERMXOR V15, V0, V21, V15
   241  	VPERMXOR V12, V1, V21, V12
   242  	VPERMXOR V13, V2, V21, V13
   243  	VPERMXOR V14, V3, V21, V14
   244  
   245  	VADDUWM V10, V15, V10
   246  	VADDUWM V11, V12, V11
   247  	VADDUWM V8, V13, V8
   248  	VADDUWM V9, V14, V9
   249  
   250  	VXOR V5, V10, V5
   251  	VXOR V6, V11, V6
   252  	VXOR V7, V8, V7
   253  	VXOR V4, V9, V4
   254  
   255  	VRLW V5, V28, V5
   256  	VRLW V6, V28, V6
   257  	VRLW V7, V28, V7
   258  	VRLW V4, V28, V4
   259  
   260  	VADDUWM V0, V5, V0
   261  	VADDUWM V1, V6, V1
   262  	VADDUWM V2, V7, V2
   263  	VADDUWM V3, V4, V3
   264  
   265  	VPERMXOR V15, V0, V20, V15
   266  	VPERMXOR V12, V1, V20, V12
   267  	VPERMXOR V13, V2, V20, V13
   268  	VPERMXOR V14, V3, V20, V14
   269  
   270  	VADDUWM V10, V15, V10
   271  	VADDUWM V11, V12, V11
   272  	VADDUWM V8, V13, V8
   273  	VADDUWM V9, V14, V9
   274  
   275  	VXOR V5, V10, V5
   276  	VXOR V6, V11, V6
   277  	VXOR V7, V8, V7
   278  	VXOR V4, V9, V4
   279  
   280  	VRLW V5, V30, V5
   281  	VRLW V6, V30, V6
   282  	VRLW V7, V30, V7
   283  	VRLW V4, V30, V4
   284  	BDNZ   loop_vsx
   285  
   286  	VADDUWM V12, V26, V12
   287  
   288  	VMRGEW V0, V1, V27
   289  	VMRGEW V2, V3, V28
   290  
   291  	VMRGOW V0, V1, V0
   292  	VMRGOW V2, V3, V2
   293  
   294  	VMRGEW V4, V5, V29
   295  	VMRGEW V6, V7, V30
   296  
   297  	XXPERMDI VS32, VS34, $0, VS33
   298  	XXPERMDI VS32, VS34, $3, VS35
   299  	XXPERMDI VS59, VS60, $0, VS32
   300  	XXPERMDI VS59, VS60, $3, VS34
   301  
   302  	VMRGOW V4, V5, V4
   303  	VMRGOW V6, V7, V6
   304  
   305  	VMRGEW V8, V9, V27
   306  	VMRGEW V10, V11, V28
   307  
   308  	XXPERMDI VS36, VS38, $0, VS37
   309  	XXPERMDI VS36, VS38, $3, VS39
   310  	XXPERMDI VS61, VS62, $0, VS36
   311  	XXPERMDI VS61, VS62, $3, VS38
   312  
   313  	VMRGOW V8, V9, V8
   314  	VMRGOW V10, V11, V10
   315  
   316  	VMRGEW V12, V13, V29
   317  	VMRGEW V14, V15, V30
   318  
   319  	XXPERMDI VS40, VS42, $0, VS41
   320  	XXPERMDI VS40, VS42, $3, VS43
   321  	XXPERMDI VS59, VS60, $0, VS40
   322  	XXPERMDI VS59, VS60, $3, VS42
   323  
   324  	VMRGOW V12, V13, V12
   325  	VMRGOW V14, V15, V14
   326  
   327  	VSPLTISW $4, V27
   328  	VADDUWM V26, V27, V26
   329  
   330  	XXPERMDI VS44, VS46, $0, VS45
   331  	XXPERMDI VS44, VS46, $3, VS47
   332  	XXPERMDI VS61, VS62, $0, VS44
   333  	XXPERMDI VS61, VS62, $3, VS46
   334  
   335  	VADDUWM V0, V16, V0
   336  	VADDUWM V4, V17, V4
   337  	VADDUWM V8, V18, V8
   338  	VADDUWM V12, V19, V12
   339  
   340  	BE_XXBRW(V0)
   341  	BE_XXBRW(V4)
   342  	BE_XXBRW(V8)
   343  	BE_XXBRW(V12)
   344  
   345  	CMPU LEN, $64
   346  	BLT tail_vsx
   347  
   348  	// Bottom of loop
   349  	LXVW4X (INP)(R0), VS59
   350  	LXVW4X (INP)(R8), VS60
   351  	LXVW4X (INP)(R9), VS61
   352  	LXVW4X (INP)(R10), VS62
   353  
   354  	VXOR V27, V0, V27
   355  	VXOR V28, V4, V28
   356  	VXOR V29, V8, V29
   357  	VXOR V30, V12, V30
   358  
   359  	STXVW4X VS59, (OUT)(R0)
   360  	STXVW4X VS60, (OUT)(R8)
   361  	ADD     $64, INP
   362  	STXVW4X VS61, (OUT)(R9)
   363  	ADD     $-64, LEN
   364  	STXVW4X VS62, (OUT)(R10)
   365  	ADD     $64, OUT
   366  	BEQ     done_vsx
   367  
   368  	VADDUWM V1, V16, V0
   369  	VADDUWM V5, V17, V4
   370  	VADDUWM V9, V18, V8
   371  	VADDUWM V13, V19, V12
   372  
   373  	BE_XXBRW(V0)
   374  	BE_XXBRW(V4)
   375  	BE_XXBRW(V8)
   376  	BE_XXBRW(V12)
   377  
   378  	CMPU  LEN, $64
   379  	BLT   tail_vsx
   380  
   381  	LXVW4X (INP)(R0), VS59
   382  	LXVW4X (INP)(R8), VS60
   383  	LXVW4X (INP)(R9), VS61
   384  	LXVW4X (INP)(R10), VS62
   385  
   386  	VXOR V27, V0, V27
   387  	VXOR V28, V4, V28
   388  	VXOR V29, V8, V29
   389  	VXOR V30, V12, V30
   390  
   391  	STXVW4X VS59, (OUT)(R0)
   392  	STXVW4X VS60, (OUT)(R8)
   393  	ADD     $64, INP
   394  	STXVW4X VS61, (OUT)(R9)
   395  	ADD     $-64, LEN
   396  	STXVW4X VS62, (OUT)(V10)
   397  	ADD     $64, OUT
   398  	BEQ     done_vsx
   399  
   400  	VADDUWM V2, V16, V0
   401  	VADDUWM V6, V17, V4
   402  	VADDUWM V10, V18, V8
   403  	VADDUWM V14, V19, V12
   404  
   405  	BE_XXBRW(V0)
   406  	BE_XXBRW(V4)
   407  	BE_XXBRW(V8)
   408  	BE_XXBRW(V12)
   409  
   410  	CMPU LEN, $64
   411  	BLT  tail_vsx
   412  
   413  	LXVW4X (INP)(R0), VS59
   414  	LXVW4X (INP)(R8), VS60
   415  	LXVW4X (INP)(R9), VS61
   416  	LXVW4X (INP)(R10), VS62
   417  
   418  	VXOR V27, V0, V27
   419  	VXOR V28, V4, V28
   420  	VXOR V29, V8, V29
   421  	VXOR V30, V12, V30
   422  
   423  	STXVW4X VS59, (OUT)(R0)
   424  	STXVW4X VS60, (OUT)(R8)
   425  	ADD     $64, INP
   426  	STXVW4X VS61, (OUT)(R9)
   427  	ADD     $-64, LEN
   428  	STXVW4X VS62, (OUT)(R10)
   429  	ADD     $64, OUT
   430  	BEQ     done_vsx
   431  
   432  	VADDUWM V3, V16, V0
   433  	VADDUWM V7, V17, V4
   434  	VADDUWM V11, V18, V8
   435  	VADDUWM V15, V19, V12
   436  
   437  	BE_XXBRW(V0)
   438  	BE_XXBRW(V4)
   439  	BE_XXBRW(V8)
   440  	BE_XXBRW(V12)
   441  
   442  	CMPU  LEN, $64
   443  	BLT   tail_vsx
   444  
   445  	LXVW4X (INP)(R0), VS59
   446  	LXVW4X (INP)(R8), VS60
   447  	LXVW4X (INP)(R9), VS61
   448  	LXVW4X (INP)(R10), VS62
   449  
   450  	VXOR V27, V0, V27
   451  	VXOR V28, V4, V28
   452  	VXOR V29, V8, V29
   453  	VXOR V30, V12, V30
   454  
   455  	STXVW4X VS59, (OUT)(R0)
   456  	STXVW4X VS60, (OUT)(R8)
   457  	ADD     $64, INP
   458  	STXVW4X VS61, (OUT)(R9)
   459  	ADD     $-64, LEN
   460  	STXVW4X VS62, (OUT)(R10)
   461  	ADD     $64, OUT
   462  
   463  	MOVD $10, R14
   464  	MOVD R14, CTR
   465  	BNE  loop_outer_vsx
   466  
   467  done_vsx:
   468  	// Increment counter by number of 64 byte blocks
   469  	MOVWZ (CNT), R14
   470  	ADD  BLOCKS, R14
   471  	MOVWZ R14, (CNT)
   472  	RET
   473  
   474  tail_vsx:
   475  	ADD  $32, R1, R11
   476  	MOVD LEN, CTR
   477  
   478  	// Save values on stack to copy from
   479  	STXVW4X VS32, (R11)(R0)
   480  	STXVW4X VS36, (R11)(R8)
   481  	STXVW4X VS40, (R11)(R9)
   482  	STXVW4X VS44, (R11)(R10)
   483  	ADD $-1, R11, R12
   484  	ADD $-1, INP
   485  	ADD $-1, OUT
   486  	PCALIGN $16
   487  looptail_vsx:
   488  	// Copying the result to OUT
   489  	// in bytes.
   490  	MOVBZU 1(R12), KEY
   491  	MOVBZU 1(INP), TMP
   492  	XOR    KEY, TMP, KEY
   493  	MOVBU  KEY, 1(OUT)
   494  	BDNZ   looptail_vsx
   495  
   496  	// Clear the stack values
   497  	STXVW4X VS48, (R11)(R0)
   498  	STXVW4X VS48, (R11)(R8)
   499  	STXVW4X VS48, (R11)(R9)
   500  	STXVW4X VS48, (R11)(R10)
   501  	BR      done_vsx
   502  

View as plain text