Text file src/crypto/aes/asm_arm64.s

     1  // Copyright 2017 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
     7  #include "textflag.h"
     8  DATA rotInvSRows<>+0x00(SB)/8, $0x080f0205040b0e01 // rotInvSRows, TBL byte indices 0-7 (low byte of the constant is index 0)
     9  DATA rotInvSRows<>+0x08(SB)/8, $0x00070a0d0c030609 // rotInvSRows, TBL byte indices 8-15
        // rotInvSRows places source bytes 1,2,3,0 at positions 0,5,10,15. The ShiftRows
        // step of a following AESE moves those positions to bytes 0..3, so lane S[0]
        // ends up holding the S-box of the input word rotated by one byte.
    10  GLOBL rotInvSRows<>(SB), (NOPTR+RODATA), $16 // file-local, read-only, 16 bytes
    11  DATA invSRows<>+0x00(SB)/8, $0x0b0e0104070a0d00 // invSRows, TBL byte indices 0-7
    12  DATA invSRows<>+0x08(SB)/8, $0x0306090c0f020508 // invSRows, TBL byte indices 8-15
        // invSRows places source bytes 0,1,2,3 at positions 0,5,10,15, i.e. the same
        // trick as rotInvSRows but without the one-byte rotation.
    13  GLOBL invSRows<>(SB), (NOPTR+RODATA), $16 // file-local, read-only, 16 bytes
    14  // func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
        //
        // Encrypts the 16 bytes at src with the round keys at xk and stores the
        // 16-byte result at dst, using the AESE/AESMC instructions.
        //
        // nr selects the path: nr < 12 runs 10 AESE rounds, nr == 12 runs 12 and
        // nr > 12 runs 14 (callers presumably pass only 10, 12 or 14 -- confirm on
        // the Go side). xk is consumed front to back, 16 bytes per round key, for a
        // total of 176, 208 or 240 bytes respectively.
        //
        // Registers: R9 = nr, R10 = round-key cursor (post-incremented),
        // R11 = dst, R12 = src, V0 = cipher state, V1-V15 = round keys.
    15  TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
    16  	MOVD	nr+0(FP), R9
    17  	MOVD	xk+8(FP), R10
    18  	MOVD	dst+16(FP), R11
    19  	MOVD	src+24(FP), R12
    20  
    21  	VLD1	(R12), [V0.B16] // V0 = input block
    22  
    23  	CMP	$12, R9 // dispatch on the round count
    24  	BLT	enc128 // nr < 12: skip both extra round pairs
    25  	BEQ	enc196 // nr == 12: skip the first extra round pair only
    26  enc256: // nr > 12: two extra rounds, then fall through
    27  	VLD1.P	32(R10), [V1.B16, V2.B16] // next two round keys; R10 += 32
    28  	AESE	V1.B16, V0.B16
    29  	AESMC	V0.B16, V0.B16
    30  	AESE	V2.B16, V0.B16
    31  	AESMC	V0.B16, V0.B16
    32  enc196: // 12-round entry point (despite the label's spelling); falls through
    33  	VLD1.P	32(R10), [V3.B16, V4.B16] // next two round keys; R10 += 32
    34  	AESE	V3.B16, V0.B16
    35  	AESMC	V0.B16, V0.B16
    36  	AESE	V4.B16, V0.B16
    37  	AESMC	V0.B16, V0.B16
    38  enc128: // common tail: the remaining 11 round keys (176 bytes)
    39  	VLD1.P	64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
    40  	VLD1.P	64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
    41  	VLD1.P	48(R10), [V13.B16, V14.B16, V15.B16]
    42  	AESE	V5.B16, V0.B16 // nine AESE+AESMC rounds with V5..V13
    43  	AESMC	V0.B16, V0.B16
    44  	AESE	V6.B16, V0.B16
    45  	AESMC	V0.B16, V0.B16
    46  	AESE	V7.B16, V0.B16
    47  	AESMC	V0.B16, V0.B16
    48  	AESE	V8.B16, V0.B16
    49  	AESMC	V0.B16, V0.B16
    50  	AESE	V9.B16, V0.B16
    51  	AESMC	V0.B16, V0.B16
    52  	AESE	V10.B16, V0.B16
    53  	AESMC	V0.B16, V0.B16
    54  	AESE	V11.B16, V0.B16
    55  	AESMC	V0.B16, V0.B16
    56  	AESE	V12.B16, V0.B16
    57  	AESMC	V0.B16, V0.B16
    58  	AESE	V13.B16, V0.B16
    59  	AESMC	V0.B16, V0.B16
    60  	AESE	V14.B16, V0.B16 // last round: no AESMC after it
    61  	VEOR    V0.B16, V15.B16, V0.B16 // XOR in the final round key
    62  	VST1	[V0.B16], (R11) // write the 16-byte result to dst
    63  	RET
    64  
    65  // func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
        //
        // Decrypts the 16 bytes at src with the round keys at xk and stores the
        // 16-byte result at dst, using the AESD/AESIMC instructions.
        //
        // nr selects the path: nr < 12 runs 10 AESD rounds, nr == 12 runs 12 and
        // nr > 12 runs 14. xk is consumed front to back, 16 bytes per round key
        // (176, 208 or 240 bytes in total), so it must already be in decryption
        // order -- presumably the dec schedule written by expandKeyAsm; confirm
        // against the caller.
        //
        // Registers: R9 = nr, R10 = round-key cursor (post-incremented),
        // R11 = dst, R12 = src, V0 = cipher state, V1-V15 = round keys.
    66  TEXT ·decryptBlockAsm(SB),NOSPLIT,$0
    67  	MOVD	nr+0(FP), R9
    68  	MOVD	xk+8(FP), R10
    69  	MOVD	dst+16(FP), R11
    70  	MOVD	src+24(FP), R12
    71  
    72  	VLD1	(R12), [V0.B16] // V0 = input block
    73  
    74  	CMP	$12, R9 // dispatch on the round count
    75  	BLT	dec128 // nr < 12: skip both extra round pairs
    76  	BEQ	dec196 // nr == 12: skip the first extra round pair only
    77  dec256: // nr > 12: two extra rounds, then fall through
    78  	VLD1.P	32(R10), [V1.B16, V2.B16] // next two round keys; R10 += 32
    79  	AESD	V1.B16, V0.B16
    80  	AESIMC	V0.B16, V0.B16
    81  	AESD	V2.B16, V0.B16
    82  	AESIMC	V0.B16, V0.B16
    83  dec196: // 12-round entry point (despite the label's spelling); falls through
    84  	VLD1.P	32(R10), [V3.B16, V4.B16] // next two round keys; R10 += 32
    85  	AESD	V3.B16, V0.B16
    86  	AESIMC	V0.B16, V0.B16
    87  	AESD	V4.B16, V0.B16
    88  	AESIMC	V0.B16, V0.B16
    89  dec128: // common tail: the remaining 11 round keys (176 bytes)
    90  	VLD1.P	64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
    91  	VLD1.P	64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
    92  	VLD1.P	48(R10), [V13.B16, V14.B16, V15.B16]
    93  	AESD	V5.B16, V0.B16 // nine AESD+AESIMC rounds with V5..V13
    94  	AESIMC	V0.B16, V0.B16
    95  	AESD	V6.B16, V0.B16
    96  	AESIMC	V0.B16, V0.B16
    97  	AESD	V7.B16, V0.B16
    98  	AESIMC	V0.B16, V0.B16
    99  	AESD	V8.B16, V0.B16
   100  	AESIMC	V0.B16, V0.B16
   101  	AESD	V9.B16, V0.B16
   102  	AESIMC	V0.B16, V0.B16
   103  	AESD	V10.B16, V0.B16
   104  	AESIMC	V0.B16, V0.B16
   105  	AESD	V11.B16, V0.B16
   106  	AESIMC	V0.B16, V0.B16
   107  	AESD	V12.B16, V0.B16
   108  	AESIMC	V0.B16, V0.B16
   109  	AESD	V13.B16, V0.B16
   110  	AESIMC	V0.B16, V0.B16
   111  	AESD	V14.B16, V0.B16 // last round: no AESIMC after it
   112  	VEOR    V0.B16, V15.B16, V0.B16 // XOR in the final round key
   113  	VST1	[V0.B16], (R11) // write the 16-byte result to dst
   114  	RET
   115  
   116  // func expandKeyAsm(nr int, key *byte, enc, dec *uint32)
   117  // Note that round keys are stored in uint128 format, not uint32
        //
        // Expands the cipher key at key into nr+1 round keys of 16 bytes each,
        // written sequentially to enc. If dec is non-nil, a decryption schedule is
        // also written there: the enc round keys in reverse order, with AESIMC
        // applied to every key except the first and last.
        //
        // The schedule is chosen from the bits of nr: bit 1 clear -> ks192 (reads
        // 24 key bytes), bit 2 set -> ks256 (reads 32), otherwise the 128-bit path
        // (reads 16). The rewind constants 176/208/240 are hard-coded, so the code
        // assumes nr is exactly 10, 12 or 14.
        //
        // Registers: R8 = nr / loop counter, R9 = key, R10 = enc cursor, R11 = dec,
        // R13 = current Rcon, V0 = all-zero AESE key, V3 = rotInvSRows,
        // V4 = invSRows (ks256 only), V2 = scratch for the S-box computation.
   118  TEXT ·expandKeyAsm(SB),NOSPLIT,$0
   119  	MOVD	nr+0(FP), R8
   120  	MOVD	key+8(FP), R9
   121  	MOVD	enc+16(FP), R10
   122  	MOVD	dec+24(FP), R11
   123  	LDP	rotInvSRows<>(SB), (R0, R1) // V3 = rotInvSRows (TBL index vector)
   124  	VMOV	R0, V3.D[0]
   125  	VMOV	R1, V3.D[1]
   126  	VEOR	V0.B16, V0.B16, V0.B16 // All zeroes
   127  	MOVW	$1, R13 // first Rcon
   128  	TBZ	$1, R8, ks192 // bit 1 of nr clear (nr = 12)
   129  	TBNZ	$2, R8, ks256 // bit 2 of nr set (nr = 14)
        	// 128-bit key: R4..R7 hold the most recent four schedule words.
   130  	LDPW	(R9), (R4, R5)
   131  	LDPW	8(R9), (R6, R7)
   132  	STPW.P	(R4, R5), 8(R10) // round key 0 is the cipher key itself
   133  	STPW.P	(R6, R7), 8(R10)
   134  	MOVW	$0x1b, R14 // Rcon value that replaces 0x100 below
   135  ks128Loop: // runs R8 = nr times, emitting one 16-byte round key per pass
   136  		VMOV	R7, V2.S[0]
   137  		WORD	$0x4E030042       // TBL V3.B16, [V2.B16], V2.B16
   138  		AESE	V0.B16, V2.B16    // Use AES to compute the SBOX
   139  		EORW	R13, R4           // fold the round constant into the first word
   140  		LSLW	$1, R13           // Compute next Rcon
   141  		ANDSW	$0x100, R13, ZR
   142  		CSELW	NE, R14, R13, R13 // Fake modulo
   143  		SUBS	$1, R8            // flags consumed by BNE below; nothing in between sets flags
   144  		VMOV	V2.S[0], R0       // R0 = S-box of the rotated previous word
   145  		EORW	R0, R4
   146  		EORW	R4, R5            // each word is XORed with the one just produced
   147  		EORW	R5, R6
   148  		EORW	R6, R7
   149  		STPW.P	(R4, R5), 8(R10)
   150  		STPW.P	(R6, R7), 8(R10)
   151  	BNE	ks128Loop
   152  	CBZ	R11, ksDone       // If dec is nil we are done
   153  	SUB	$176, R10 // rewind to the start of enc (11 keys * 16 bytes)
   154  	// Decryption keys are encryption keys with InverseMixColumns applied
        	// Keys are collected into V4..V14 in descending register order so the
        	// stores below write them out reversed.
   155  	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
   156  	VMOV	V0.B16, V7.B16 // enc key 0 is copied unchanged (becomes the last dec key)
   157  	AESIMC	V1.B16, V6.B16
   158  	AESIMC	V2.B16, V5.B16
   159  	AESIMC	V3.B16, V4.B16
   160  	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
   161  	AESIMC	V0.B16, V11.B16
   162  	AESIMC	V1.B16, V10.B16
   163  	AESIMC	V2.B16, V9.B16
   164  	AESIMC	V3.B16, V8.B16
   165  	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
   166  	AESIMC	V0.B16, V14.B16
   167  	AESIMC	V1.B16, V13.B16
   168  	VMOV	V2.B16, V12.B16 // last enc key is copied unchanged (becomes dec key 0)
   169  	VST1.P	[V12.B16, V13.B16, V14.B16], 48(R11)
   170  	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
   171  	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
   172  	B	ksDone
   173  ks192: // 192-bit key: R2..R7 hold the most recent six schedule words
   174  	LDPW	(R9), (R2, R3)
   175  	LDPW	8(R9), (R4, R5)
   176  	LDPW	16(R9), (R6, R7)
   177  	STPW.P	(R2, R3), 8(R10)
   178  	STPW.P	(R4, R5), 8(R10)
   179  	SUB	$4, R8 // 12 - 4 = 8 passes; 16 + 8*24 = 208 bytes written
   180  ks192Loop:
   181  		STPW.P	(R6, R7), 8(R10) // store the last two words of the previous group
   182  		VMOV	R7, V2.S[0]
   183  		WORD	$0x4E030042 //TBL	V3.B16, [V2.B16], V2.B16
   184  		AESE	V0.B16, V2.B16
   185  		EORW	R13, R2
   186  		LSLW	$1, R13 // no reduction here: only 8 Rcon values (1..0x80) are consumed
   187  		SUBS	$1, R8 // flags consumed by BNE below
   188  		VMOV	V2.S[0], R0
   189  		EORW	R0, R2
   190  		EORW	R2, R3
   191  		EORW	R3, R4
   192  		EORW	R4, R5
   193  		EORW	R5, R6
   194  		EORW	R6, R7
   195  		STPW.P	(R2, R3), 8(R10)
   196  		STPW.P	(R4, R5), 8(R10)
   197  	BNE	ks192Loop // R6/R7 from the final pass are never stored
   198  	CBZ	R11, ksDone // dec is nil: encryption schedule only
   199  	SUB	$208, R10 // rewind to the start of enc (13 keys * 16 bytes)
        	// Same reversal as the 128-bit path, for 13 keys: V0 (last enc key,
        	// unchanged), then V12..V15, V8..V11, V4..V7 (V7 = enc key 0, unchanged).
   200  	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
   201  	VMOV	V0.B16, V7.B16
   202  	AESIMC	V1.B16, V6.B16
   203  	AESIMC	V2.B16, V5.B16
   204  	AESIMC	V3.B16, V4.B16
   205  	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
   206  	AESIMC	V0.B16, V11.B16
   207  	AESIMC	V1.B16, V10.B16
   208  	AESIMC	V2.B16, V9.B16
   209  	AESIMC	V3.B16, V8.B16
   210  	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
   211  	AESIMC	V0.B16, V15.B16
   212  	AESIMC	V1.B16, V14.B16
   213  	AESIMC	V2.B16, V13.B16
   214  	AESIMC	V3.B16, V12.B16
   215  	VLD1	(R10), [V0.B16] // last enc key, stored first and unchanged
   216  	VST1.P	[V0.B16], 16(R11)
   217  	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
   218  	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
   219  	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
   220  	B	ksDone
   221  ks256: // 256-bit key: R0..R7 hold the most recent eight schedule words
   222  	LDP	invSRows<>(SB), (R0, R1) // V4 = invSRows (TBL index vector, no rotation)
   223  	VMOV	R0, V4.D[0]
   224  	VMOV	R1, V4.D[1]
   225  	LDPW	(R9), (R0, R1)
   226  	LDPW	8(R9), (R2, R3)
   227  	LDPW	16(R9), (R4, R5)
   228  	LDPW	24(R9), (R6, R7) // last read through R9; it is reused as scratch below
   229  	STPW.P	(R0, R1), 8(R10)
   230  	STPW.P	(R2, R3), 8(R10)
   231  	SUB	$7, R8 // 14 - 7 = 7 passes; 16 + 7*32 = 240 bytes written
   232  ks256Loop:
   233  		STPW.P	(R4, R5), 8(R10) // store the second half of the previous group
   234  		STPW.P	(R6, R7), 8(R10)
   235  		VMOV	R7, V2.S[0]
   236  		WORD	$0x4E030042 //TBL	V3.B16, [V2.B16], V2.B16
   237  		AESE	V0.B16, V2.B16
   238  		EORW	R13, R0
   239  		LSLW	$1, R13 // no reduction here: only 7 Rcon values (1..0x40) are consumed
   240  		SUBS	$1, R8 // flags consumed by BNE below
   241  		VMOV	V2.S[0], R9
   242  		EORW	R9, R0
   243  		EORW	R0, R1
   244  		EORW	R1, R2
   245  		EORW	R2, R3
   246  		VMOV	R3, V2.S[0]
   247  		WORD	$0x4E040042 // TBL V4.B16, [V2.B16], V2.B16 (index vector is invSRows in V4: S-box without rotation)
   248  		AESE	V0.B16, V2.B16
   249  		VMOV	V2.S[0], R9
   250  		EORW	R9, R4
   251  		EORW	R4, R5
   252  		EORW	R5, R6
   253  		EORW	R6, R7
   254  		STPW.P	(R0, R1), 8(R10)
   255  		STPW.P	(R2, R3), 8(R10)
   256  	BNE	ks256Loop // R4..R7 from the final pass are never stored
   257  	CBZ	R11, ksDone // dec is nil: encryption schedule only
   258  	SUB	$240, R10 // rewind to the start of enc (15 keys * 16 bytes)
        	// Same reversal as the 128-bit path, for 15 keys: V16..V18, V12..V15,
        	// V8..V11, V4..V7 (V16 = last enc key and V7 = enc key 0, both unchanged).
   259  	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
   260  	VMOV	V0.B16, V7.B16
   261  	AESIMC	V1.B16, V6.B16
   262  	AESIMC	V2.B16, V5.B16
   263  	AESIMC	V3.B16, V4.B16
   264  	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
   265  	AESIMC	V0.B16, V11.B16
   266  	AESIMC	V1.B16, V10.B16
   267  	AESIMC	V2.B16, V9.B16
   268  	AESIMC	V3.B16, V8.B16
   269  	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
   270  	AESIMC	V0.B16, V15.B16
   271  	AESIMC	V1.B16, V14.B16
   272  	AESIMC	V2.B16, V13.B16
   273  	AESIMC	V3.B16, V12.B16
   274  	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
   275  	AESIMC	V0.B16, V18.B16
   276  	AESIMC	V1.B16, V17.B16
   277  	VMOV	V2.B16, V16.B16
   278  	VST1.P	[V16.B16, V17.B16, V18.B16], 48(R11)
   279  	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
   280  	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
   281  	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
   282  ksDone:
   283  	RET
   284  

View as plain text