// Source: src/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build gc && !purego
     6  
     7  #include "textflag.h"
     8  
     9  #define NUM_ROUNDS 10
    10  
    11  // func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
    12  TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
        // xorKeyStreamVX XORs src with the ChaCha20 key stream and writes the
        // result to dst. Each outer-loop iteration produces 256 bytes (four
        // 64-byte ChaCha blocks) via 4-way SIMD: every vector register holds
        // one state word replicated across four lanes, where lane i belongs to
        // the block with counter value counter+i.
        //
        // Register roles:
        //   R1  dst pointer               R2  src pointer
        //   R3  src length in bytes       R4  key pointer (8 words)
        //   R6  nonce pointer (3 words)   R7  counter pointer (1 word)
        //   R10 &·constants               R11 &·incRotMatrix
        //   R12 end of last full 256-byte chunk of src
        //   R20 current block counter     R21 double-round countdown
    13  	MOVD	dst+0(FP), R1
    14  	MOVD	src+24(FP), R2
    15  	MOVD	src_len+32(FP), R3
    16  	MOVD	key+48(FP), R4
    17  	MOVD	nonce+56(FP), R6
    18  	MOVD	counter+64(FP), R7
    19  
    20  	MOVD	$·constants(SB), R10
    21  	MOVD	$·incRotMatrix(SB), R11
    22  
        // R20 = *counter (32-bit block counter, persisted back at loop end).
    23  	MOVW	(R7), R20
    24  
        // R13 = len & ~255 (bytes processed here); R12 = src + R13 marks the
        // end of the full 256-byte chunks. The remainder (len & 255) computed
        // into R13 below is not read again in this function — presumably the
        // Go caller handles the tail. NOTE(review): confirm against the
        // generic fallback in the chacha20 package.
    25  	AND	$~255, R3, R13
    26  	ADD	R2, R13, R12 // R12 for block end
    27  	AND	$255, R3, R13
    28  loop:
    29  	MOVD	$NUM_ROUNDS, R21
        // V30 = {0,1,2,3}: per-lane counter increments.
        // V31 = byte-permutation table used with VTBL to rotate each 32-bit
        //       lane left by 8.
    30  	VLD1	(R11), [V30.S4, V31.S4]
    31  
        // Broadcast the four sigma constants: V0..V3 = constants[0..3], each
        // replicated to all four lanes (VLD4R has no assembler mnemonic, hence
        // the raw instruction encodings).
    32  	// load constants
    33  	// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
    34  	WORD	$0x4D60E940
    35  
        // V4..V11 = key words 0..7, each broadcast to all lanes.
    36  	// load keys
    37  	// VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
    38  	WORD	$0x4DFFE884
    39  	// VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
    40  	WORD	$0x4DFFE888
        // The two post-indexed VLD4R loads advanced R4 by 16 each; undo that.
    41  	SUB	$32, R4
    42  
    43  	// load counter + nonce
    44  	// VLD1R (R7), [V12.S4]
    45  	WORD	$0x4D40C8EC
    46  
    47  	// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
    48  	WORD	$0x4D40E8CD
    49  
        // V12 lanes become counter+0..counter+3, so the four lanes compute
        // four consecutive key-stream blocks.
    50  	// update counter
    51  	VADD	V30.S4, V12.S4, V12.S4
    52  
        // Inner loop: one pass = one double round (column round followed by
        // diagonal round); NUM_ROUNDS (10) passes = the full 20 rounds.
        // Rotation idioms: VREV32 on .H8 = rotl16; VTBL with V31 = rotl8;
        // a VSHL/VSRI pair builds rotl12 ($12/$20) and rotl7 ($7/$25).
    53  chacha:
    54  	// V0..V3 += V4..V7
    55  	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
    56  	VADD	V0.S4, V4.S4, V0.S4
    57  	VADD	V1.S4, V5.S4, V1.S4
    58  	VADD	V2.S4, V6.S4, V2.S4
    59  	VADD	V3.S4, V7.S4, V3.S4
    60  	VEOR	V12.B16, V0.B16, V12.B16
    61  	VEOR	V13.B16, V1.B16, V13.B16
    62  	VEOR	V14.B16, V2.B16, V14.B16
    63  	VEOR	V15.B16, V3.B16, V15.B16
    64  	VREV32	V12.H8, V12.H8
    65  	VREV32	V13.H8, V13.H8
    66  	VREV32	V14.H8, V14.H8
    67  	VREV32	V15.H8, V15.H8
    68  	// V8..V11 += V12..V15
    69  	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
    70  	VADD	V8.S4, V12.S4, V8.S4
    71  	VADD	V9.S4, V13.S4, V9.S4
    72  	VADD	V10.S4, V14.S4, V10.S4
    73  	VADD	V11.S4, V15.S4, V11.S4
    74  	VEOR	V8.B16, V4.B16, V16.B16
    75  	VEOR	V9.B16, V5.B16, V17.B16
    76  	VEOR	V10.B16, V6.B16, V18.B16
    77  	VEOR	V11.B16, V7.B16, V19.B16
    78  	VSHL	$12, V16.S4, V4.S4
    79  	VSHL	$12, V17.S4, V5.S4
    80  	VSHL	$12, V18.S4, V6.S4
    81  	VSHL	$12, V19.S4, V7.S4
    82  	VSRI	$20, V16.S4, V4.S4
    83  	VSRI	$20, V17.S4, V5.S4
    84  	VSRI	$20, V18.S4, V6.S4
    85  	VSRI	$20, V19.S4, V7.S4
    86  
    87  	// V0..V3 += V4..V7
    88  	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
    89  	VADD	V0.S4, V4.S4, V0.S4
    90  	VADD	V1.S4, V5.S4, V1.S4
    91  	VADD	V2.S4, V6.S4, V2.S4
    92  	VADD	V3.S4, V7.S4, V3.S4
    93  	VEOR	V12.B16, V0.B16, V12.B16
    94  	VEOR	V13.B16, V1.B16, V13.B16
    95  	VEOR	V14.B16, V2.B16, V14.B16
    96  	VEOR	V15.B16, V3.B16, V15.B16
    97  	VTBL	V31.B16, [V12.B16], V12.B16
    98  	VTBL	V31.B16, [V13.B16], V13.B16
    99  	VTBL	V31.B16, [V14.B16], V14.B16
   100  	VTBL	V31.B16, [V15.B16], V15.B16
   101  
   102  	// V8..V11 += V12..V15
   103  	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
   104  	VADD	V12.S4, V8.S4, V8.S4
   105  	VADD	V13.S4, V9.S4, V9.S4
   106  	VADD	V14.S4, V10.S4, V10.S4
   107  	VADD	V15.S4, V11.S4, V11.S4
   108  	VEOR	V8.B16, V4.B16, V16.B16
   109  	VEOR	V9.B16, V5.B16, V17.B16
   110  	VEOR	V10.B16, V6.B16, V18.B16
   111  	VEOR	V11.B16, V7.B16, V19.B16
   112  	VSHL	$7, V16.S4, V4.S4
   113  	VSHL	$7, V17.S4, V5.S4
   114  	VSHL	$7, V18.S4, V6.S4
   115  	VSHL	$7, V19.S4, V7.S4
   116  	VSRI	$25, V16.S4, V4.S4
   117  	VSRI	$25, V17.S4, V5.S4
   118  	VSRI	$25, V18.S4, V6.S4
   119  	VSRI	$25, V19.S4, V7.S4
   120  
        // Diagonal round: same quarter-round structure, but the register
        // pairings are rotated (V0 pairs with V5, V1 with V6, ...), matching
        // the diagonal indexing of the ChaCha state matrix.
   121  	// V0..V3 += V5..V7, V4
   122  	// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
   123  	VADD	V0.S4, V5.S4, V0.S4
   124  	VADD	V1.S4, V6.S4, V1.S4
   125  	VADD	V2.S4, V7.S4, V2.S4
   126  	VADD	V3.S4, V4.S4, V3.S4
   127  	VEOR	V15.B16, V0.B16, V15.B16
   128  	VEOR	V12.B16, V1.B16, V12.B16
   129  	VEOR	V13.B16, V2.B16, V13.B16
   130  	VEOR	V14.B16, V3.B16, V14.B16
   131  	VREV32	V12.H8, V12.H8
   132  	VREV32	V13.H8, V13.H8
   133  	VREV32	V14.H8, V14.H8
   134  	VREV32	V15.H8, V15.H8
   135  
   136  	// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
   137  	// ...
   138  	VADD	V15.S4, V10.S4, V10.S4
   139  	VADD	V12.S4, V11.S4, V11.S4
   140  	VADD	V13.S4, V8.S4, V8.S4
   141  	VADD	V14.S4, V9.S4, V9.S4
   142  	VEOR	V10.B16, V5.B16, V16.B16
   143  	VEOR	V11.B16, V6.B16, V17.B16
   144  	VEOR	V8.B16, V7.B16, V18.B16
   145  	VEOR	V9.B16, V4.B16, V19.B16
   146  	VSHL	$12, V16.S4, V5.S4
   147  	VSHL	$12, V17.S4, V6.S4
   148  	VSHL	$12, V18.S4, V7.S4
   149  	VSHL	$12, V19.S4, V4.S4
   150  	VSRI	$20, V16.S4, V5.S4
   151  	VSRI	$20, V17.S4, V6.S4
   152  	VSRI	$20, V18.S4, V7.S4
   153  	VSRI	$20, V19.S4, V4.S4
   154  
   155  	// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
   156  	// ...
   157  	VADD	V5.S4, V0.S4, V0.S4
   158  	VADD	V6.S4, V1.S4, V1.S4
   159  	VADD	V7.S4, V2.S4, V2.S4
   160  	VADD	V4.S4, V3.S4, V3.S4
   161  	VEOR	V0.B16, V15.B16, V15.B16
   162  	VEOR	V1.B16, V12.B16, V12.B16
   163  	VEOR	V2.B16, V13.B16, V13.B16
   164  	VEOR	V3.B16, V14.B16, V14.B16
   165  	VTBL	V31.B16, [V12.B16], V12.B16
   166  	VTBL	V31.B16, [V13.B16], V13.B16
   167  	VTBL	V31.B16, [V14.B16], V14.B16
   168  	VTBL	V31.B16, [V15.B16], V15.B16
   169  
   170  	// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
   171  	// ...
   172  	VADD	V15.S4, V10.S4, V10.S4
   173  	VADD	V12.S4, V11.S4, V11.S4
   174  	VADD	V13.S4, V8.S4, V8.S4
   175  	VADD	V14.S4, V9.S4, V9.S4
   176  	VEOR	V10.B16, V5.B16, V16.B16
   177  	VEOR	V11.B16, V6.B16, V17.B16
   178  	VEOR	V8.B16, V7.B16, V18.B16
   179  	VEOR	V9.B16, V4.B16, V19.B16
   180  	VSHL	$7, V16.S4, V5.S4
   181  	VSHL	$7, V17.S4, V6.S4
   182  	VSHL	$7, V18.S4, V7.S4
   183  	VSHL	$7, V19.S4, V4.S4
   184  	VSRI	$25, V16.S4, V5.S4
   185  	VSRI	$25, V17.S4, V6.S4
   186  	VSRI	$25, V18.S4, V7.S4
   187  	VSRI	$25, V19.S4, V4.S4
   188  
   189  	SUB	$1, R21
   190  	CBNZ	R21, chacha
   191  
        // Finalization: reload the initial state (constants, key, counter,
        // nonce) into V16..V31 and add it to the working state — the
        // x[i] += x0[i] step of ChaCha block generation.
   192  	// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
   193  	WORD	$0x4D60E950
   194  
   195  	// VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
   196  	WORD	$0x4DFFE894
        // V12 gets {0,1,2,3} here and the broadcast counter (V28) below;
        // together these reconstruct the per-lane initial counter values.
   197  	VADD	V30.S4, V12.S4, V12.S4
   198  	VADD	V16.S4, V0.S4, V0.S4
   199  	VADD	V17.S4, V1.S4, V1.S4
   200  	VADD	V18.S4, V2.S4, V2.S4
   201  	VADD	V19.S4, V3.S4, V3.S4
   202  	// VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
   203  	WORD	$0x4DFFE898
   204  	// restore R4
   205  	SUB	$32, R4
   206  
   207  	// load counter + nonce
   208  	// VLD1R (R7), [V28.S4]
   209  	WORD	$0x4D40C8FC
   210  	// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
   211  	WORD	$0x4D40E8DD
   212  
   213  	VADD	V20.S4, V4.S4, V4.S4
   214  	VADD	V21.S4, V5.S4, V5.S4
   215  	VADD	V22.S4, V6.S4, V6.S4
   216  	VADD	V23.S4, V7.S4, V7.S4
   217  	VADD	V24.S4, V8.S4, V8.S4
   218  	VADD	V25.S4, V9.S4, V9.S4
   219  	VADD	V26.S4, V10.S4, V10.S4
   220  	VADD	V27.S4, V11.S4, V11.S4
   221  	VADD	V28.S4, V12.S4, V12.S4
   222  	VADD	V29.S4, V13.S4, V13.S4
   223  	VADD	V30.S4, V14.S4, V14.S4
   224  	VADD	V31.S4, V15.S4, V15.S4
   225  
        // Transpose: the state is word-sliced (each register holds one state
        // word for four blocks). The VZIP1/VZIP2 passes on .S4 then .D2
        // reassemble four contiguous 64-byte blocks, interleaved with
        // post-incrementing 64-byte loads of src.
   226  	VZIP1	V1.S4, V0.S4, V16.S4
   227  	VZIP2	V1.S4, V0.S4, V17.S4
   228  	VZIP1	V3.S4, V2.S4, V18.S4
   229  	VZIP2	V3.S4, V2.S4, V19.S4
   230  	VZIP1	V5.S4, V4.S4, V20.S4
   231  	VZIP2	V5.S4, V4.S4, V21.S4
   232  	VZIP1	V7.S4, V6.S4, V22.S4
   233  	VZIP2	V7.S4, V6.S4, V23.S4
   234  	VZIP1	V9.S4, V8.S4, V24.S4
   235  	VZIP2	V9.S4, V8.S4, V25.S4
   236  	VZIP1	V11.S4, V10.S4, V26.S4
   237  	VZIP2	V11.S4, V10.S4, V27.S4
   238  	VZIP1	V13.S4, V12.S4, V28.S4
   239  	VZIP2	V13.S4, V12.S4, V29.S4
   240  	VZIP1	V15.S4, V14.S4, V30.S4
   241  	VZIP2	V15.S4, V14.S4, V31.S4
   242  	VZIP1	V18.D2, V16.D2, V0.D2
   243  	VZIP2	V18.D2, V16.D2, V4.D2
   244  	VZIP1	V19.D2, V17.D2, V8.D2
   245  	VZIP2	V19.D2, V17.D2, V12.D2
   246  	VLD1.P	64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]
   247  
   248  	VZIP1	V22.D2, V20.D2, V1.D2
   249  	VZIP2	V22.D2, V20.D2, V5.D2
   250  	VZIP1	V23.D2, V21.D2, V9.D2
   251  	VZIP2	V23.D2, V21.D2, V13.D2
   252  	VLD1.P	64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
   253  	VZIP1	V26.D2, V24.D2, V2.D2
   254  	VZIP2	V26.D2, V24.D2, V6.D2
   255  	VZIP1	V27.D2, V25.D2, V10.D2
   256  	VZIP2	V27.D2, V25.D2, V14.D2
   257  	VLD1.P	64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
   258  	VZIP1	V30.D2, V28.D2, V3.D2
   259  	VZIP2	V30.D2, V28.D2, V7.D2
   260  	VZIP1	V31.D2, V29.D2, V11.D2
   261  	VZIP2	V31.D2, V29.D2, V15.D2
   262  	VLD1.P	64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]
        // XOR each 64-byte chunk of src with the key stream and store to dst,
        // post-incrementing both pointers.
   263  	VEOR	V0.B16, V16.B16, V16.B16
   264  	VEOR	V1.B16, V17.B16, V17.B16
   265  	VEOR	V2.B16, V18.B16, V18.B16
   266  	VEOR	V3.B16, V19.B16, V19.B16
   267  	VST1.P	[V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
   268  	VEOR	V4.B16, V20.B16, V20.B16
   269  	VEOR	V5.B16, V21.B16, V21.B16
   270  	VEOR	V6.B16, V22.B16, V22.B16
   271  	VEOR	V7.B16, V23.B16, V23.B16
   272  	VST1.P	[V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
   273  	VEOR	V8.B16, V24.B16, V24.B16
   274  	VEOR	V9.B16, V25.B16, V25.B16
   275  	VEOR	V10.B16, V26.B16, V26.B16
   276  	VEOR	V11.B16, V27.B16, V27.B16
   277  	VST1.P	[V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
   278  	VEOR	V12.B16, V28.B16, V28.B16
   279  	VEOR	V13.B16, V29.B16, V29.B16
   280  	VEOR	V14.B16, V30.B16, V30.B16
   281  	VEOR	V15.B16, V31.B16, V31.B16
   282  	VST1.P	[V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)
   283  
        // Consumed four blocks; persist the advanced counter for the caller.
   284  	ADD	$4, R20
   285  	MOVW	R20, (R7) // update counter
   286  
        // Repeat while R12 > R2, i.e. while full 256-byte chunks remain.
   287  	CMP	R2, R12
   288  	BGT	loop
   289  
   290  	RET
   291  
   292  
   293  DATA	·constants+0x00(SB)/4, $0x61707865
        // ^ The four ChaCha20 "sigma" constants: the ASCII string
        // "expand 32-byte k" as little-endian 32-bit words
        // ("expa", "nd 3", "2-by", "te k").
   294  DATA	·constants+0x04(SB)/4, $0x3320646e
   295  DATA	·constants+0x08(SB)/4, $0x79622d32
   296  DATA	·constants+0x0c(SB)/4, $0x6b206574
        // Declared 32 bytes with only the first 16 initialized; the
        // trailing 16 bytes stay zero. NOTE(review): only offsets
        // 0x00..0x0f appear to be read by the code above.
   297  GLOBL	·constants(SB), NOPTR|RODATA, $32
   298  
   299  DATA	·incRotMatrix+0x00(SB)/4, $0x00000000
        // ^ First 16 bytes: the word vector {0, 1, 2, 3}, loaded into V30
        // and added to the broadcast block counter so the four SIMD lanes
        // encrypt four consecutive blocks.
   300  DATA	·incRotMatrix+0x04(SB)/4, $0x00000001
   301  DATA	·incRotMatrix+0x08(SB)/4, $0x00000002
   302  DATA	·incRotMatrix+0x0c(SB)/4, $0x00000003
        // v Last 16 bytes: VTBL byte-permutation table, loaded into V31.
        // Per 32-bit lane it selects source bytes [3,0,1,2] (little-endian),
        // which is a rotate-left-by-8 of each lane.
   303  DATA	·incRotMatrix+0x10(SB)/4, $0x02010003
   304  DATA	·incRotMatrix+0x14(SB)/4, $0x06050407
   305  DATA	·incRotMatrix+0x18(SB)/4, $0x0A09080B
   306  DATA	·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
   307  GLOBL	·incRotMatrix(SB), NOPTR|RODATA, $32
   308  
