1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build gc && !purego
6
7 #include "go_asm.h"
8 #include "textflag.h"
9
10 // This is an implementation of the ChaCha20 encryption algorithm as
11 // specified in RFC 7539. It uses vector instructions to compute
12 // 4 keystream blocks in parallel (256 bytes) which are then XORed
13 // with the bytes in the input slice.
14
// ·constants<> is a 32-byte read-only table. xorKeyStreamVX loads it with a
// single VLM into the BSWAP (first 16 bytes) and J0 (last 16 bytes) vector
// registers.

// BSWAP (bytes 0x00-0x0f): VPERM selection mask that swaps the bytes in
// each 4-byte element.
DATA ·constants<>+0x00(SB)/4, $0x03020100
DATA ·constants<>+0x04(SB)/4, $0x07060504
DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c

// J0 (bytes 0x10-0x1f): the constant words [j0, j1, j2, j3] that are
// broadcast into state words X0..X3 at the start of every block group.
DATA ·constants<>+0x10(SB)/4, $0x61707865
DATA ·constants<>+0x14(SB)/4, $0x3320646e
DATA ·constants<>+0x18(SB)/4, $0x79622d32
DATA ·constants<>+0x1c(SB)/4, $0x6b206574

GLOBL ·constants<>(SB), RODATA|NOPTR, $32
26
// Vector register allocation shared by xorKeyStreamVX and the macros in
// this file.

// Loaded once during setup and read on every pass of the block loop.
// NONCE keeps the nonce words in elements 1..3.
#define BSWAP V5
#define J0    V6
#define KEY0  V7
#define KEY1  V8
#define NONCE V9

// CTR holds one block counter per element. INC is the vector added to CTR:
// first to spread the initial counter across the elements, then to advance
// all counters after each group of blocks.
#define CTR V10
#define INC V15

// Scratch registers: input data in XORV, temporaries in SHUFFLE, and the
// shift count while the nonce is being positioned during setup.
#define M0 V11
#define M1 V12
#define M2 V13
#define M3 V14

// Working state. Before the rounds, Xn holds state word n with one 32-bit
// element per block being generated.
#define X0  V16
#define X1  V17
#define X2  V18
#define X3  V19
#define X4  V20
#define X5  V21
#define X6  V22
#define X7  V23
#define X8  V24
#define X9  V25
#define X10 V26
#define X11 V27
#define X12 V28
#define X13 V29
#define X14 V30
#define X15 V31

// Total number of rounds. The round loop runs NUM_ROUNDS/2 times and issues
// two ROUND4 invocations per iteration.
#define NUM_ROUNDS 20
56
// QSTEP4 applies one add-xor-rotate step to four independent register
// triples (s, t, u), operating on 32-bit elements:
//
//	s += t
//	u ^= s
//	u = rotate-left(u, rot)
//
// The four triples are interleaved instruction by instruction: all adds,
// then all xors, then all rotates.
#define QSTEP4(rot, s0, t0, u0, s1, t1, u1, s2, t2, u2, s3, t3, u3) \
	VAF    t0, s0, s0   \
	VAF    t1, s1, s1   \
	VAF    t2, s2, s2   \
	VAF    t3, s3, s3   \
	VX     s0, u0, u0   \
	VX     s1, u1, u1   \
	VX     s2, u2, u2   \
	VX     s3, u3, u3   \
	VERLLF $rot, u0, u0 \
	VERLLF $rot, u1, u1 \
	VERLLF $rot, u2, u2 \
	VERLLF $rot, u3, u3

// ROUND4 runs four quarter-round style computations side by side, one on
// each of the register groups (a0..a3), (b0..b3), (c0..c3) and (d0..d3).
// For a group (p0, p1, p2, p3) the sequence is:
//
//	p0 += p1; p2 ^= p0; p2 <<<= 16
//	p3 += p2; p1 ^= p3; p1 <<<= 12
//	p0 += p1; p2 ^= p0; p2 <<<= 8
//	p3 += p2; p1 ^= p3; p1 <<<= 7
#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
	QSTEP4(16, a0, a1, a2, b0, b1, b2, c0, c1, c2, d0, d1, d2) \
	QSTEP4(12, a3, a2, a1, b3, b2, b1, c3, c2, c1, d3, d2, d1) \
	QSTEP4(8, a0, a1, a2, b0, b1, b2, c0, c1, c2, d0, d1, d2)  \
	QSTEP4(7, a3, a2, a1, b3, b2, b1, c3, c2, c1, d3, d2, d1)
106
// PERMUTE rewrites each of the four registers w0..w3 in place with VPERM,
// using sel as the byte-selection vector. Each register is passed as both
// VPERM source operands and as the destination.
#define PERMUTE(sel, w0, w1, w2, w3) \
	VPERM w0, w0, sel, w0 \
	VPERM w1, w1, sel, w1 \
	VPERM w2, w2, sel, w2 \
	VPERM w3, w3, sel, w3
112
// ADDV adds the vector addend, as 32-bit elements, to each of the four
// registers w0..w3 in place.
#define ADDV(addend, w0, w1, w2, w3) \
	VAF addend, w0, w0 \
	VAF addend, w1, w1 \
	VAF addend, w2, w2 \
	VAF addend, w3, w3
118
// XORV processes one 64-byte chunk: it loads 64 bytes from off(in) into
// M0..M3, swaps the bytes of every 4-byte element of k0..k3 in place
// (PERMUTE with the BSWAP mask), XORs k0..k3 into M0..M3 and stores the
// 64-byte result at off(out).
//
// Clobbers M0..M3 and leaves k0..k3 byte-swapped.
#define XORV(off, out, in, k0, k1, k2, k3) \
	VLM     off(in), M0, M3        \
	PERMUTE(BSWAP, k0, k1, k2, k3) \
	VX      k0, M0, M0             \
	VX      k1, M1, M1             \
	VX      k2, M2, M2             \
	VX      k3, M3, M3             \
	VSTM    M0, M3, off(out)
127
// SHUFFLE transposes the 4x4 matrix of 32-bit elements whose rows are
// row0..row3, using tmp0..tmp3 as scratch (they are overwritten).
//
// Intermediate values:
//
//	tmp0 = {row0[0], row2[0], row0[1], row2[1]}
//	tmp1 = {row1[0], row3[0], row1[1], row3[1]}
//	tmp2 = {row0[2], row2[2], row0[3], row2[3]}
//	tmp3 = {row1[2], row3[2], row1[3], row3[3]}
//
// Result, in terms of the values on entry:
//
//	row0 = {row0[0], row1[0], row2[0], row3[0]}
//	row1 = {row0[1], row1[1], row2[1], row3[1]}
//	row2 = {row0[2], row1[2], row2[2], row3[2]}
//	row3 = {row0[3], row1[3], row2[3], row3[3]}
#define SHUFFLE(row0, row1, row2, row3, tmp0, tmp1, tmp2, tmp3) \
	VMRHF row0, row2, tmp0 \
	VMRHF row1, row3, tmp1 \
	VMRLF row0, row2, tmp2 \
	VMRLF row1, row3, tmp3 \
	VMRHF tmp0, tmp1, row0 \
	VMRLF tmp0, tmp1, row1 \
	VMRHF tmp2, tmp3, row2 \
	VMRLF tmp2, tmp3, row3
137
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// xorKeyStreamVX XORs src with keystream and writes the result to dst,
// handling four 64-byte blocks (256 bytes) per pass of the outer loop. The
// block counter is read from *counter on entry and the advanced value is
// stored back before returning.
//
// Preconditions (not checked here, callers must guarantee them):
//   - len(src) is a non-zero multiple of 256: each pass subtracts 256 and
//     the loop only exits when the remainder is exactly zero.
//   - dst has room for len(src) bytes: len(dst) is never read.
//
// General register use:
//
//	R0  length operand for VLL
//	R1  address of ·constants<>, then the round-loop counter
//	R2  dst pointer (advanced by 256 per pass)
//	R3  src pointer (advanced by 256 per pass)
//	R4  bytes of src still to process
//	R5  key pointer
//	R6  nonce pointer
//	R7  counter pointer
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	MOVD $·constants<>(SB), R1
	MOVD dst+0(FP), R2         // R2=&dst[0]
	LMG  src+24(FP), R3, R4    // R3=&src[0] R4=len(src)
	MOVD key+48(FP), R5        // R5=key
	MOVD nonce+56(FP), R6      // R6=nonce
	MOVD counter+64(FP), R7    // R7=counter

	// load BSWAP and J0
	VLM (R1), BSWAP, J0

	// setup
	//
	// VLL loads bytes 0 through R0 (inclusive) and zero-fills the rest of
	// the vector, so R0=11 reads exactly the 12 bytes of the nonce and
	// never touches memory beyond the [3]uint32 array.
	MOVD  $11, R0
	VLM   (R5), KEY0, KEY1
	VLL   R0, (R6), NONCE

	// Shift NONCE right by 4 bytes so that the nonce words sit in
	// elements 1..3 and element 0 is zero.
	VZERO M0
	VLEIB $7, $32, M0
	VSRLB M0, NONCE, NONCE

	// initialize counter values:
	// CTR = [c, c+1, c+2, c+3] (one counter per block), then INC = [4, 4, 4, 4]
	VLREPF (R7), CTR
	VZERO  INC
	VLEIF  $1, $1, INC
	VLEIF  $2, $2, INC
	VLEIF  $3, $3, INC
	VAF    INC, CTR, CTR
	VREPIF $4, INC

chacha:
	// Build the initial state for four blocks: every state word is
	// replicated across all four elements, except the counter word (X12),
	// which takes a different counter value in each element.
	VREPF $0, J0, X0
	VREPF $1, J0, X1
	VREPF $2, J0, X2
	VREPF $3, J0, X3
	VREPF $0, KEY0, X4
	VREPF $1, KEY0, X5
	VREPF $2, KEY0, X6
	VREPF $3, KEY0, X7
	VREPF $0, KEY1, X8
	VREPF $1, KEY1, X9
	VREPF $2, KEY1, X10
	VREPF $3, KEY1, X11
	VLR   CTR, X12
	VREPF $1, NONCE, X13
	VREPF $2, NONCE, X14
	VREPF $3, NONCE, X15

	MOVD $(NUM_ROUNDS/2), R1

loop:
	// One iteration is a column round followed by a diagonal round.
	ROUND4(X0, X4, X12,  X8, X1, X5, X13,  X9, X2, X6, X14, X10, X3, X7, X15, X11)
	ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8,  X3, X4, X14, X9)

	ADD $-1, R1
	BNE loop

	// decrement length
	ADD $-256, R4

	// rearrange vectors
	//
	// Each SHUFFLE transposes four state words so that every register
	// holds four consecutive words of a single block; ADDV then adds the
	// matching words of the initial state. The per-block counters are
	// added to X12 before its transpose because they differ per element;
	// element 0 of NONCE is zero, so ADDV(NONCE, ...) leaves the counter
	// word unchanged.
	SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
	ADDV(J0, X0, X1, X2, X3)
	SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
	ADDV(KEY0, X4, X5, X6, X7)
	SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
	ADDV(KEY1, X8, X9, X10, X11)
	VAF     CTR, X12, X12
	SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
	ADDV(NONCE, X12, X13, X14, X15)

	// increment counters
	VAF INC, CTR, CTR

	// xor keystream with plaintext
	XORV(0*64, R2, R3, X0, X4,  X8, X12)
	XORV(1*64, R2, R3, X1, X5,  X9, X13)
	XORV(2*64, R2, R3, X2, X6, X10, X14)
	XORV(3*64, R2, R3, X3, X7, X11, X15)

	// increment pointers
	MOVD $256(R2), R2
	MOVD $256(R3), R3

	// Continue until exactly zero bytes remain (see preconditions above).
	CMPBNE  R4, $0, chacha

	// Element 0 of CTR is the counter for the next unprocessed block.
	VSTEF $0, CTR, (R7)
	RET
225
View as plain text