1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
6
7 //go:build gc && !purego
8
9 #include "textflag.h"
10 // General register allocation
11 #define oup DI
12 #define inp SI
13 #define inl BX
14 #define adp CX  // free to reuse after we hash the additional data
15 #define keyp R8 // free to reuse once we copy the key to the stack
16 #define itr2 R9 // general iterator
17 #define itr1 CX // general iterator
18 #define acc0 R10
19 #define acc1 R11
20 #define acc2 R12
21 #define t0 R13
22 #define t1 R14
23 #define t2 R15
24 #define t3 R8
25 // Register and stack allocation for the SSE code
26 #define rStore (0*16)(BP)
27 #define sStore (1*16)(BP)
28 #define state1Store (2*16)(BP)
29 #define state2Store (3*16)(BP)
30 #define tmpStore (4*16)(BP)
31 #define ctr0Store (5*16)(BP)
32 #define ctr1Store (6*16)(BP)
33 #define ctr2Store (7*16)(BP)
34 #define ctr3Store (8*16)(BP)
35 #define A0 X0
36 #define A1 X1
37 #define A2 X2
38 #define B0 X3
39 #define B1 X4
40 #define B2 X5
41 #define C0 X6
42 #define C1 X7
43 #define C2 X8
44 #define D0 X9
45 #define D1 X10
46 #define D2 X11
47 #define T0 X12
48 #define T1 X13
49 #define T2 X14
50 #define T3 X15
51 #define A3 T0
52 #define B3 T1
53 #define C3 T2
54 #define D3 T3
55 // Register and stack allocation for the AVX2 code
56 #define rsStoreAVX2 (0*32)(BP)
57 #define state1StoreAVX2 (1*32)(BP)
58 #define state2StoreAVX2 (2*32)(BP)
59 #define ctr0StoreAVX2 (3*32)(BP)
60 #define ctr1StoreAVX2 (4*32)(BP)
61 #define ctr2StoreAVX2 (5*32)(BP)
62 #define ctr3StoreAVX2 (6*32)(BP)
63 #define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
64 #define AA0 Y0
65 #define AA1 Y5
66 #define AA2 Y6
67 #define AA3 Y7
68 #define BB0 Y14
69 #define BB1 Y9
70 #define BB2 Y10
71 #define BB3 Y11
72 #define CC0 Y12
73 #define CC1 Y13
74 #define CC2 Y8
75 #define CC3 Y15
76 #define DD0 Y4
77 #define DD1 Y1
78 #define DD2 Y2
79 #define DD3 Y3
80 #define TT0 DD3
81 #define TT1 AA3
82 #define TT2 BB3
83 #define TT3 CC3
84 // ChaCha20 constants
85 DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
86 DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
87 DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
88 DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
89 DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
90 DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
91 DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
92 DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
93 // <<< 16 with PSHUFB
94 DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
95 DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
96 DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
97 DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
98 // <<< 8 with PSHUFB
99 DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
100 DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
101 DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
102 DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
103
104 DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
105 DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
106 DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
107 DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
108
109 DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
110 DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
111 DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
112 DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
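// In the AVX2 code every YMM register holds one row of two consecutive ChaCha
// blocks: block n in the low 128-bit lane and block n+1 in the high lane.
// avx2InitMask therefore bumps only the high lane's counter (to 1) for the
// first pair of blocks, and avx2IncMask advances both lanes' counters by 2.
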
113 // Poly1305 key clamp
114 DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
115 DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
116 DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
117 DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
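// The clamp clears the 22 bits of r that RFC 8439 requires to be zero; the
// upper 16 bytes of the mask are all ones so the s half of the 32-byte key
// passes through the PAND unchanged. On the two little-endian 64-bit halves
// of r this is simply:
//
//	r0 &= 0x0FFFFFFC0FFFFFFF
//	r1 &= 0x0FFFFFFC0FFFFFFC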
118
119 DATA ·sseIncMask<>+0x00(SB)/8, $0x1
120 DATA ·sseIncMask<>+0x08(SB)/8, $0x0
121 // To load/store the last < 16 bytes in a buffer
122 DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
123 DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
124 DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
125 DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
126 DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
127 DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
128 DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
129 DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
130 DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
131 DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
132 DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
133 DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
134 DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
135 DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
136 DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
137 DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
138 DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
139 DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
140 DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
141 DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
142 DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
143 DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
144 DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
145 DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
146 DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
147 DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
148 DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
149 DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
150 DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
151 DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
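// Entry n-1 of this table (16 bytes each) keeps the low n bytes of a block and
// zeroes the rest, for 1 <= n <= 15. openSSETail16 selects it with
// PAND -16(t0)(itr2*1), where itr2 = 16*n, i.e. conceptually:
//
//	mask := andMask[16*(n-1) : 16*n] // keeps the low n bytes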
152
153 GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
154 GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
155 GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
156 GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
157 GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
158 GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
159 GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
160 GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
161 // No PALIGNR in Go ASM yet (but VPALIGNR is present).
162 #define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
163 #define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
164 #define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
165 #define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
166 #define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
167 #define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
168 #define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
169 #define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
170 #define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
171 #define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
172 #define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
173 #define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
174 #define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
175 #define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
176 #define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
177 #define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
178 #define shiftC0Right shiftC0Left
179 #define shiftC1Right shiftC1Left
180 #define shiftC2Right shiftC2Left
181 #define shiftC3Right shiftC3Left
182 #define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
183 #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
184 #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
185 #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
186
187 // Some macros
188
189 // ROL rotates the uint32s in register R left by N bits, using temporary T.
190 #define ROL(N, R, T) \
191 MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R
192
193 // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
194 #ifdef GOAMD64_v2
195 #define ROL16(R, T) PSHUFB ·rol16<>(SB), R
196 #else
197 #define ROL16(R, T) ROL(16, R, T)
198 #endif
199
200 // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
201 #ifdef GOAMD64_v2
202 #define ROL8(R, T) PSHUFB ·rol8<>(SB), R
203 #else
204 #define ROL8(R, T) ROL(8, R, T)
205 #endif
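// Both the generic ROL macro (shift, shift, XOR) and the PSHUFB-based
// ROL16/ROL8 variants compute a 32-bit rotate left in every lane, i.e. the
// equivalent of Go's bits.RotateLeft32:
//
//	func rotl32(x uint32, n uint) uint32 { return x<<n | x>>(32-n) } // 0 < n < 32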
206
207 #define chachaQR(A, B, C, D, T) \
208 PADDD B, A; PXOR A, D; ROL16(D, T) \
209 PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
210 PADDD B, A; PXOR A, D; ROL8(D, T) \
211 PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
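// chachaQR performs one ChaCha20 quarter round on whole XMM rows at once.
// Per 32-bit lane it computes the following (a reference sketch in Go, using
// math/bits):
//
//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
//		a += b; d ^= a; d = bits.RotateLeft32(d, 16)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 12)
//		a += b; d ^= a; d = bits.RotateLeft32(d, 8)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 7)
//		return a, b, c, d
//	}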
212
213 #define chachaQR_AVX2(A, B, C, D, T) \
214 VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
215 VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
216 VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
217 VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
218
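// polyAdd adds a 16-byte block, plus the 2^128 padding bit (the final ADCQ $1),
// into the 130-bit accumulator held in acc2:acc1:acc0. The polyMul* macros
// below multiply the accumulator by the clamped key r, kept at 0(BP) and
// 8(BP) (rStore), and reduce the result modulo 2^130 - 5.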
219 #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
220 #define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
221 #define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
222 #define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
223 #define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
224
225 #define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
226 #define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
227 #define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
228
229 #define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
230 #define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
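// A Go sketch of the reduction that polyMulReduceStage performs: given the
// 256-bit product t3:t2:t1:t0 of the accumulator and the clamped r, the bits
// above 2^130 are folded back in, because 2^130 mod (2^130 - 5) = 5
// (illustrative only; uses math/bits):
//
//	func reduce(t0, t1, t2, t3 uint64) (h0, h1, h2 uint64) {
//		h0, h1, h2 = t0, t1, t2&3     // low 130 bits of the product
//		c0, c1 := t2>>2|t3<<62, t3>>2 // c = product >> 130
//		var carry uint64
//		// Fold c back in as 4*c + c (4*c is t2 with its low two bits cleared, and t3).
//		h0, carry = bits.Add64(h0, t2&^3, 0)
//		h1, carry = bits.Add64(h1, t3, carry)
//		h2 += carry
//		h0, carry = bits.Add64(h0, c0, 0)
//		h1, carry = bits.Add64(h1, c1, carry)
//		h2 += carry
//		return
//	}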
231 // ----------------------------------------------------------------------------
232 TEXT polyHashADInternal<>(SB), NOSPLIT, $0
233 // adp points to beginning of additional data
234 // itr2 holds ad length
235 XORQ acc0, acc0
236 XORQ acc1, acc1
237 XORQ acc2, acc2
238 CMPQ itr2, $13
239 JNE hashADLoop
240
241 openFastTLSAD:
242 // Special treatment for the TLS case of 13 bytes
243 MOVQ (adp), acc0
244 MOVQ 5(adp), acc1
245 SHRQ $24, acc1
246 MOVQ $1, acc2
247 polyMul
248 RET
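// The two overlapping loads above read bytes 0..7 and 5..12 of the AAD; the
// SHRQ drops the three duplicated bytes, leaving bytes 8..12 in acc1, and the
// block is zero-padded to 16 bytes as required. The same load in Go
// (illustrative; uses encoding/binary):
//
//	lo := binary.LittleEndian.Uint64(ad[0:8])
//	hi := binary.LittleEndian.Uint64(ad[5:13]) >> 24 // bytes 8..12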
249
250 hashADLoop:
251 // Hash in 16-byte chunks
252 CMPQ itr2, $16
253 JB hashADTail
254 polyAdd(0(adp))
255 LEAQ (1*16)(adp), adp
256 SUBQ $16, itr2
257 polyMul
258 JMP hashADLoop
259
260 hashADTail:
261 CMPQ itr2, $0
262 JE hashADDone
263
264 // Hash last < 16 byte tail
265 XORQ t0, t0
266 XORQ t1, t1
267 XORQ t2, t2
268 ADDQ itr2, adp
269
270 hashADTailLoop:
271 SHLQ $8, t0, t1
272 SHLQ $8, t0
273 MOVB -1(adp), t2
274 XORQ t2, t0
275 DECQ adp
276 DECQ itr2
277 JNE hashADTailLoop
278
279 hashADTailFinish:
280 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
281 polyMul
282
283 // Finished AD
284 hashADDone:
285 RET
286
287 // ----------------------------------------------------------------------------
288 // func chacha20Poly1305Open(dst, key, src, ad []byte) bool
289 TEXT ·chacha20Poly1305Open(SB), 0, $288-97
290 // For aligned stack access
291 MOVQ SP, BP
292 ADDQ $32, BP
293 ANDQ $-32, BP
294 MOVQ dst+0(FP), oup
295 MOVQ key+24(FP), keyp
296 MOVQ src+48(FP), inp
297 MOVQ src_len+56(FP), inl
298 MOVQ ad+72(FP), adp
299
300 // Check for AVX2 support
301 CMPB ·useAVX2(SB), $1
302 JE chacha20Poly1305Open_AVX2
303
304 // Special optimization for very short buffers
305 CMPQ inl, $128
306 JBE openSSE128 // About 16% faster
307
308 // For long buffers, prepare the poly key first
309 MOVOU ·chacha20Constants<>(SB), A0
310 MOVOU (1*16)(keyp), B0
311 MOVOU (2*16)(keyp), C0
312 MOVOU (3*16)(keyp), D0
313 MOVO D0, T1
314
315 // Store state on stack for future use
316 MOVO B0, state1Store
317 MOVO C0, state2Store
318 MOVO D0, ctr3Store
319 MOVQ $10, itr2
320
321 openSSEPreparePolyKey:
322 chachaQR(A0, B0, C0, D0, T0)
323 shiftB0Left; shiftC0Left; shiftD0Left
324 chachaQR(A0, B0, C0, D0, T0)
325 shiftB0Right; shiftC0Right; shiftD0Right
326 DECQ itr2
327 JNE openSSEPreparePolyKey
328
329 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
330 PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
331
332 // Clamp and store the key
333 PAND ·polyClampMask<>(SB), A0
334 MOVO A0, rStore; MOVO B0, sStore
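// A0|B0 now hold the first 32 bytes of keystream of the counter-0 block: the
// A row becomes r (clamped above) and the B row becomes s. This mirrors what
// a pure-Go implementation would do (illustrative; uses
// golang.org/x/crypto/chacha20):
//
//	var polyKey [32]byte
//	c, _ := chacha20.NewUnauthenticatedCipher(key, nonce)
//	c.XORKeyStream(polyKey[:], polyKey[:]) // keystream for counter 0
//	r, s := polyKey[:16], polyKey[16:]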
335
336 // Hash AAD
337 MOVQ ad_len+80(FP), itr2
338 CALL polyHashADInternal<>(SB)
339
340 openSSEMainLoop:
341 CMPQ inl, $256
342 JB openSSEMainLoopDone
343
344 // Load state, increment counter blocks
345 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
346 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
347 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
348 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
349
350 // Store counters
351 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
352
353 // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
354 MOVQ $4, itr1
355 MOVQ inp, itr2
356
357 openSSEInternalLoop:
358 MOVO C3, tmpStore
359 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
360 MOVO tmpStore, C3
361 MOVO C1, tmpStore
362 chachaQR(A3, B3, C3, D3, C1)
363 MOVO tmpStore, C1
364 polyAdd(0(itr2))
365 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
366 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
367 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
368 polyMulStage1
369 polyMulStage2
370 LEAQ (2*8)(itr2), itr2
371 MOVO C3, tmpStore
372 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
373 MOVO tmpStore, C3
374 MOVO C1, tmpStore
375 polyMulStage3
376 chachaQR(A3, B3, C3, D3, C1)
377 MOVO tmpStore, C1
378 polyMulReduceStage
379 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
380 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
381 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
382 DECQ itr1
383 JGE openSSEInternalLoop
384
385 polyAdd(0(itr2))
386 polyMul
387 LEAQ (2*8)(itr2), itr2
388
389 CMPQ itr1, $-6
390 JG openSSEInternalLoop
391
392 // Add in the state
393 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
394 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
395 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
396 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
397
398 // Load - xor - store
399 MOVO D3, tmpStore
400 MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
401 MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
402 MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
403 MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
404 MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
405 MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
406 MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
407 MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
408 MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
409 MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
410 MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
411 MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
412 MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
413 MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
414 MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
415 MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
416 LEAQ 256(inp), inp
417 LEAQ 256(oup), oup
418 SUBQ $256, inl
419 JMP openSSEMainLoop
420
421 openSSEMainLoopDone:
422 // Handle the various tail sizes efficiently
423 TESTQ inl, inl
424 JE openSSEFinalize
425 CMPQ inl, $64
426 JBE openSSETail64
427 CMPQ inl, $128
428 JBE openSSETail128
429 CMPQ inl, $192
430 JBE openSSETail192
431 JMP openSSETail256
432
433 openSSEFinalize:
434 // Hash in the AAD and plaintext lengths
435 ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
436 polyMul
437
438 // Final reduce
439 MOVQ acc0, t0
440 MOVQ acc1, t1
441 MOVQ acc2, t2
442 SUBQ $-5, acc0
443 SBBQ $-1, acc1
444 SBBQ $3, acc2
445 CMOVQCS t0, acc0
446 CMOVQCS t1, acc1
447 CMOVQCS t2, acc2
448
449 // Add in the "s" part of the key
450 ADDQ 0+sStore, acc0
451 ADCQ 8+sStore, acc1
452
453 // Finally, constant time compare to the tag at the end of the message
454 XORQ AX, AX
455 MOVQ $1, DX
456 XORQ (0*8)(inp), acc0
457 XORQ (1*8)(inp), acc1
458 ORQ acc1, acc0
459 CMOVQEQ DX, AX
460
461 // Return true iff tags are equal
462 MOVB AX, ret+96(FP)
463 RET
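// In Go terms the finalize-and-verify step above is roughly the following
// (illustrative; uses math/bits and encoding/binary; the assembly keeps this
// constant time with CMOVs rather than a branch):
//
//	func verifyTag(h0, h1, h2, s0, s1 uint64, tag []byte) bool {
//		// Reduce h fully modulo p = 2^130 - 5 by conditionally subtracting p.
//		t0, b := bits.Sub64(h0, 0xFFFFFFFFFFFFFFFB, 0)
//		t1, b := bits.Sub64(h1, 0xFFFFFFFFFFFFFFFF, b)
//		_, b = bits.Sub64(h2, 3, b)
//		if b == 0 { // no borrow: h >= p, keep h - p
//			h0, h1 = t0, t1
//		}
//		// tag' = (h + s) mod 2^128, compared against the transmitted tag.
//		var c uint64
//		h0, c = bits.Add64(h0, s0, 0)
//		h1, _ = bits.Add64(h1, s1, c)
//		return (h0^binary.LittleEndian.Uint64(tag[0:8]))|
//			(h1^binary.LittleEndian.Uint64(tag[8:16])) == 0
//	}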
464
465 // ----------------------------------------------------------------------------
466 // Special optimization for buffers smaller than 129 bytes
467 openSSE128:
468 // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
469 MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
470 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
471 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
472 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
473 MOVQ $10, itr2
474
475 openSSE128InnerCipherLoop:
476 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
477 shiftB0Left; shiftB1Left; shiftB2Left
478 shiftC0Left; shiftC1Left; shiftC2Left
479 shiftD0Left; shiftD1Left; shiftD2Left
480 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
481 shiftB0Right; shiftB1Right; shiftB2Right
482 shiftC0Right; shiftC1Right; shiftC2Right
483 shiftD0Right; shiftD1Right; shiftD2Right
484 DECQ itr2
485 JNE openSSE128InnerCipherLoop
486
487 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
488 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
489 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
490 PADDL T2, C1; PADDL T2, C2
491 PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
492
493 // Clamp and store the key
494 PAND ·polyClampMask<>(SB), A0
495 MOVOU A0, rStore; MOVOU B0, sStore
496
497 // Hash
498 MOVQ ad_len+80(FP), itr2
499 CALL polyHashADInternal<>(SB)
500
501 openSSE128Open:
502 CMPQ inl, $16
503 JB openSSETail16
504 SUBQ $16, inl
505
506 // Load for hashing
507 polyAdd(0(inp))
508
509 // Load for decryption
510 MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
511 LEAQ (1*16)(inp), inp
512 LEAQ (1*16)(oup), oup
513 polyMul
514
515 // Shift the stream "left"
516 MOVO B1, A1
517 MOVO C1, B1
518 MOVO D1, C1
519 MOVO A2, D1
520 MOVO B2, A2
521 MOVO C2, B2
522 MOVO D2, C2
523 JMP openSSE128Open
524
525 openSSETail16:
526 TESTQ inl, inl
527 JE openSSEFinalize
528
529 // We can safely load a full 16 bytes of CT here, because the 16-byte MAC follows it in the buffer
530 MOVQ inl, itr2
531 SHLQ $4, itr2
532 LEAQ ·andMask<>(SB), t0
533 MOVOU (inp), T0
534 ADDQ inl, inp
535 PAND -16(t0)(itr2*1), T0
536 MOVO T0, 0+tmpStore
537 MOVQ T0, t0
538 MOVQ 8+tmpStore, t1
539 PXOR A1, T0
540
541 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes
542 openSSETail16Store:
543 MOVQ T0, t3
544 MOVB t3, (oup)
545 PSRLDQ $1, T0
546 INCQ oup
547 DECQ inl
548 JNE openSSETail16Store
549 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
550 polyMul
551 JMP openSSEFinalize
552
553 // ----------------------------------------------------------------------------
554 // Special optimization for the last 64 bytes of ciphertext
555 openSSETail64:
556 // Need to decrypt up to 64 bytes - prepare a single block
557 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
558 XORQ itr2, itr2
559 MOVQ inl, itr1
560 CMPQ itr1, $16
561 JB openSSETail64LoopB
562
563 openSSETail64LoopA:
564 // Perform ChaCha rounds, while hashing the remaining input
565 polyAdd(0(inp)(itr2*1))
566 polyMul
567 SUBQ $16, itr1
568
569 openSSETail64LoopB:
570 ADDQ $16, itr2
571 chachaQR(A0, B0, C0, D0, T0)
572 shiftB0Left; shiftC0Left; shiftD0Left
573 chachaQR(A0, B0, C0, D0, T0)
574 shiftB0Right; shiftC0Right; shiftD0Right
575
576 CMPQ itr1, $16
577 JAE openSSETail64LoopA
578
579 CMPQ itr2, $160
580 JNE openSSETail64LoopB
581
582 PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
583
584 openSSETail64DecLoop:
585 CMPQ inl, $16
586 JB openSSETail64DecLoopDone
587 SUBQ $16, inl
588 MOVOU (inp), T0
589 PXOR T0, A0
590 MOVOU A0, (oup)
591 LEAQ 16(inp), inp
592 LEAQ 16(oup), oup
593 MOVO B0, A0
594 MOVO C0, B0
595 MOVO D0, C0
596 JMP openSSETail64DecLoop
597
598 openSSETail64DecLoopDone:
599 MOVO A0, A1
600 JMP openSSETail16
601
602 // ----------------------------------------------------------------------------
603 // Special optimization for the last 128 bytes of ciphertext
604 openSSETail128:
605 // Need to decrypt up to 128 bytes - prepare two blocks
606 MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
607 MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
608 XORQ itr2, itr2
609 MOVQ inl, itr1
610 ANDQ $-16, itr1
611
612 openSSETail128LoopA:
613 // Perform ChaCha rounds, while hashing the remaining input
614 polyAdd(0(inp)(itr2*1))
615 polyMul
616
617 openSSETail128LoopB:
618 ADDQ $16, itr2
619 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
620 shiftB0Left; shiftC0Left; shiftD0Left
621 shiftB1Left; shiftC1Left; shiftD1Left
622 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
623 shiftB0Right; shiftC0Right; shiftD0Right
624 shiftB1Right; shiftC1Right; shiftD1Right
625
626 CMPQ itr2, itr1
627 JB openSSETail128LoopA
628
629 CMPQ itr2, $160
630 JNE openSSETail128LoopB
631
632 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
633 PADDL state1Store, B0; PADDL state1Store, B1
634 PADDL state2Store, C0; PADDL state2Store, C1
635 PADDL ctr1Store, D0; PADDL ctr0Store, D1
636
637 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
638 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
639 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
640
641 SUBQ $64, inl
642 LEAQ 64(inp), inp
643 LEAQ 64(oup), oup
644 JMP openSSETail64DecLoop
645
646 // ----------------------------------------------------------------------------
647 // Special optimization for the last 192 bytes of ciphertext
648 openSSETail192:
649 // Need to decrypt up to 192 bytes - prepare three blocks
650 MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
651 MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
652 MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
653
654 MOVQ inl, itr1
655 MOVQ $160, itr2
656 CMPQ itr1, $160
657 CMOVQGT itr2, itr1
658 ANDQ $-16, itr1
659 XORQ itr2, itr2
660
661 openSSLTail192LoopA:
662 // Perform ChaCha rounds, while hashing the remaining input
663 polyAdd(0(inp)(itr2*1))
664 polyMul
665
666 openSSLTail192LoopB:
667 ADDQ $16, itr2
668 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
669 shiftB0Left; shiftC0Left; shiftD0Left
670 shiftB1Left; shiftC1Left; shiftD1Left
671 shiftB2Left; shiftC2Left; shiftD2Left
672
673 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
674 shiftB0Right; shiftC0Right; shiftD0Right
675 shiftB1Right; shiftC1Right; shiftD1Right
676 shiftB2Right; shiftC2Right; shiftD2Right
677
678 CMPQ itr2, itr1
679 JB openSSLTail192LoopA
680
681 CMPQ itr2, $160
682 JNE openSSLTail192LoopB
683
684 CMPQ inl, $176
685 JB openSSLTail192Store
686
687 polyAdd(160(inp))
688 polyMul
689
690 CMPQ inl, $192
691 JB openSSLTail192Store
692
693 polyAdd(176(inp))
694 polyMul
695
696 openSSLTail192Store:
697 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
698 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
699 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
700 PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
701
702 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
703 PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
704 MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
705
706 MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
707 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
708 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
709
710 SUBQ $128, inl
711 LEAQ 128(inp), inp
712 LEAQ 128(oup), oup
713 JMP openSSETail64DecLoop
714
715 // ----------------------------------------------------------------------------
716 // Special optimization for the last 256 bytes of ciphertext
717 openSSETail256:
718 // Need to decrypt up to 256 bytes - prepare four blocks
719 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
720 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
721 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
722 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
723
724 // Store counters
725 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
726 XORQ itr2, itr2
727
728 openSSETail256Loop:
729 // This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
730 polyAdd(0(inp)(itr2*1))
731 MOVO C3, tmpStore
732 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
733 MOVO tmpStore, C3
734 MOVO C1, tmpStore
735 chachaQR(A3, B3, C3, D3, C1)
736 MOVO tmpStore, C1
737 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
738 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
739 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
740 polyMulStage1
741 polyMulStage2
742 MOVO C3, tmpStore
743 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
744 MOVO tmpStore, C3
745 MOVO C1, tmpStore
746 chachaQR(A3, B3, C3, D3, C1)
747 MOVO tmpStore, C1
748 polyMulStage3
749 polyMulReduceStage
750 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
751 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
752 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
753 ADDQ $2*8, itr2
754 CMPQ itr2, $160
755 JB openSSETail256Loop
756 MOVQ inl, itr1
757 ANDQ $-16, itr1
758
759 openSSETail256HashLoop:
760 polyAdd(0(inp)(itr2*1))
761 polyMul
762 ADDQ $2*8, itr2
763 CMPQ itr2, itr1
764 JB openSSETail256HashLoop
765
766 // Add in the state
767 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
768 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
769 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
770 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
771 MOVO D3, tmpStore
772
773 // Load - xor - store
774 MOVOU (0*16)(inp), D3; PXOR D3, A0
775 MOVOU (1*16)(inp), D3; PXOR D3, B0
776 MOVOU (2*16)(inp), D3; PXOR D3, C0
777 MOVOU (3*16)(inp), D3; PXOR D3, D0
778 MOVOU A0, (0*16)(oup)
779 MOVOU B0, (1*16)(oup)
780 MOVOU C0, (2*16)(oup)
781 MOVOU D0, (3*16)(oup)
782 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
783 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
784 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
785 MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
786 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
787 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
788 LEAQ 192(inp), inp
789 LEAQ 192(oup), oup
790 SUBQ $192, inl
791 MOVO A3, A0
792 MOVO B3, B0
793 MOVO C3, C0
794 MOVO tmpStore, D0
795
796 JMP openSSETail64DecLoop
797
798 // ----------------------------------------------------------------------------
799 // ------------------------- AVX2 Code ----------------------------------------
800 chacha20Poly1305Open_AVX2:
801 VZEROUPPER
802 VMOVDQU ·chacha20Constants<>(SB), AA0
803 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
804 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
805 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
806 VPADDD ·avx2InitMask<>(SB), DD0, DD0
807
808 // Special optimization for very short buffers
809 CMPQ inl, $192
810 JBE openAVX2192
811 CMPQ inl, $320
812 JBE openAVX2320
813
814 // For the general path, prepare the Poly1305 key first; as a byproduct we get 64 bytes of cipher stream
815 VMOVDQA BB0, state1StoreAVX2
816 VMOVDQA CC0, state2StoreAVX2
817 VMOVDQA DD0, ctr3StoreAVX2
818 MOVQ $10, itr2
819
820 openAVX2PreparePolyKey:
821 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
822 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
823 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
824 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
825 DECQ itr2
826 JNE openAVX2PreparePolyKey
827
828 VPADDD ·chacha20Constants<>(SB), AA0, AA0
829 VPADDD state1StoreAVX2, BB0, BB0
830 VPADDD state2StoreAVX2, CC0, CC0
831 VPADDD ctr3StoreAVX2, DD0, DD0
832
833 VPERM2I128 $0x02, AA0, BB0, TT0
834
835 // Clamp and store poly key
836 VPAND ·polyClampMask<>(SB), TT0, TT0
837 VMOVDQA TT0, rsStoreAVX2
838
839 // Stream for the first 64 bytes
840 VPERM2I128 $0x13, AA0, BB0, AA0
841 VPERM2I128 $0x13, CC0, DD0, BB0
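// Each YMM register holds one row of two interleaved blocks, so VPERM2I128
// with $0x02 gathers the two low 128-bit lanes (the earlier block) and $0x13
// the two high lanes (the later block), turning row-interleaved state back
// into contiguous keystream. Here AA0 and BB0 end up holding the 64-byte
// keystream of the counter-1 block, which decrypts the first 64 bytes of
// ciphertext below.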
842
843 // Hash AD + first 64 bytes
844 MOVQ ad_len+80(FP), itr2
845 CALL polyHashADInternal<>(SB)
846 XORQ itr1, itr1
847
848 openAVX2InitialHash64:
849 polyAdd(0(inp)(itr1*1))
850 polyMulAVX2
851 ADDQ $16, itr1
852 CMPQ itr1, $64
853 JNE openAVX2InitialHash64
854
855 // Decrypt the first 64 bytes
856 VPXOR (0*32)(inp), AA0, AA0
857 VPXOR (1*32)(inp), BB0, BB0
858 VMOVDQU AA0, (0*32)(oup)
859 VMOVDQU BB0, (1*32)(oup)
860 LEAQ (2*32)(inp), inp
861 LEAQ (2*32)(oup), oup
862 SUBQ $64, inl
863
864 openAVX2MainLoop:
865 CMPQ inl, $512
866 JB openAVX2MainLoopDone
867
868 // Load state, increment counter blocks, store the incremented counters
869 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
870 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
871 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
872 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
873 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
874 XORQ itr1, itr1
875
876 openAVX2InternalLoop:
877 // Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
878 // Effectively, per 512 bytes of stream, we hash 480 bytes of ciphertext
879 polyAdd(0*8(inp)(itr1*1))
880 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
881 polyMulStage1_AVX2
882 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
883 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
884 polyMulStage2_AVX2
885 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
886 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
887 polyMulStage3_AVX2
888 VMOVDQA CC3, tmpStoreAVX2
889 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
890 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
891 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
892 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
893 VMOVDQA tmpStoreAVX2, CC3
894 polyMulReduceStage
895 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
896 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
897 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
898 polyAdd(2*8(inp)(itr1*1))
899 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
900 polyMulStage1_AVX2
901 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
902 VMOVDQA CC3, tmpStoreAVX2
903 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
904 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
905 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
906 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
907 VMOVDQA tmpStoreAVX2, CC3
908 polyMulStage2_AVX2
909 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
910 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
911 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
912 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
913 polyMulStage3_AVX2
914 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
915 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
916 polyMulReduceStage
917 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
918 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
919 polyAdd(4*8(inp)(itr1*1))
920 LEAQ (6*8)(itr1), itr1
921 VMOVDQA CC3, tmpStoreAVX2
922 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
923 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
924 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
925 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
926 VMOVDQA tmpStoreAVX2, CC3
927 polyMulStage1_AVX2
928 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
929 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
930 polyMulStage2_AVX2
931 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
932 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
933 polyMulStage3_AVX2
934 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
935 VMOVDQA CC3, tmpStoreAVX2
936 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
937 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
938 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
939 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
940 VMOVDQA tmpStoreAVX2, CC3
941 polyMulReduceStage
942 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
943 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
944 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
945 CMPQ itr1, $480
946 JNE openAVX2InternalLoop
947
948 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
949 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
950 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
951 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
952 VMOVDQA CC3, tmpStoreAVX2
953
954 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here
955 polyAdd(480(inp))
956 polyMulAVX2
957 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
958 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
959 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
960 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
961 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
962 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
963
964 // and here
965 polyAdd(496(inp))
966 polyMulAVX2
967 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
968 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
969 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
970 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
971 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
972 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
973 LEAQ (32*16)(inp), inp
974 LEAQ (32*16)(oup), oup
975 SUBQ $(32*16), inl
976 JMP openAVX2MainLoop
977
978 openAVX2MainLoopDone:
979 // Handle the various tail sizes efficiently
980 TESTQ inl, inl
981 JE openSSEFinalize
982 CMPQ inl, $128
983 JBE openAVX2Tail128
984 CMPQ inl, $256
985 JBE openAVX2Tail256
986 CMPQ inl, $384
987 JBE openAVX2Tail384
988 JMP openAVX2Tail512
989
990 // ----------------------------------------------------------------------------
991 // Special optimization for buffers smaller than 193 bytes
992 openAVX2192:
993 // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
994 VMOVDQA AA0, AA1
995 VMOVDQA BB0, BB1
996 VMOVDQA CC0, CC1
997 VPADDD ·avx2IncMask<>(SB), DD0, DD1
998 VMOVDQA AA0, AA2
999 VMOVDQA BB0, BB2
1000 VMOVDQA CC0, CC2
1001 VMOVDQA DD0, DD2
1002 VMOVDQA DD1, TT3
1003 MOVQ $10, itr2
1004
1005 openAVX2192InnerCipherLoop:
1006 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1007 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
1008 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1009 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
1010 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1011 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
1012 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1013 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
1014 DECQ itr2
1015 JNE openAVX2192InnerCipherLoop
1016 VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
1017 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
1018 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
1019 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
1020 VPERM2I128 $0x02, AA0, BB0, TT0
1021
1022 // Clamp and store poly key
1023 VPAND ·polyClampMask<>(SB), TT0, TT0
1024 VMOVDQA TT0, rsStoreAVX2
1025
1026 // Stream for up to 192 bytes
1027 VPERM2I128 $0x13, AA0, BB0, AA0
1028 VPERM2I128 $0x13, CC0, DD0, BB0
1029 VPERM2I128 $0x02, AA1, BB1, CC0
1030 VPERM2I128 $0x02, CC1, DD1, DD0
1031 VPERM2I128 $0x13, AA1, BB1, AA1
1032 VPERM2I128 $0x13, CC1, DD1, BB1
1033
1034 openAVX2ShortOpen:
1035 // Hash
1036 MOVQ ad_len+80(FP), itr2
1037 CALL polyHashADInternal<>(SB)
1038
1039 openAVX2ShortOpenLoop:
1040 CMPQ inl, $32
1041 JB openAVX2ShortTail32
1042 SUBQ $32, inl
1043
1044 // Load for hashing
1045 polyAdd(0*8(inp))
1046 polyMulAVX2
1047 polyAdd(2*8(inp))
1048 polyMulAVX2
1049
1050 // Load for decryption
1051 VPXOR (inp), AA0, AA0
1052 VMOVDQU AA0, (oup)
1053 LEAQ (1*32)(inp), inp
1054 LEAQ (1*32)(oup), oup
1055
1056 // Shift stream left
1057 VMOVDQA BB0, AA0
1058 VMOVDQA CC0, BB0
1059 VMOVDQA DD0, CC0
1060 VMOVDQA AA1, DD0
1061 VMOVDQA BB1, AA1
1062 VMOVDQA CC1, BB1
1063 VMOVDQA DD1, CC1
1064 VMOVDQA AA2, DD1
1065 VMOVDQA BB2, AA2
1066 JMP openAVX2ShortOpenLoop
1067
1068 openAVX2ShortTail32:
1069 CMPQ inl, $16
1070 VMOVDQA A0, A1
1071 JB openAVX2ShortDone
1072
1073 SUBQ $16, inl
1074
1075 // Load for hashing
1076 polyAdd(0*8(inp))
1077 polyMulAVX2
1078
1079 // Load for decryption
1080 VPXOR (inp), A0, T0
1081 VMOVDQU T0, (oup)
1082 LEAQ (1*16)(inp), inp
1083 LEAQ (1*16)(oup), oup
1084 VPERM2I128 $0x11, AA0, AA0, AA0
1085 VMOVDQA A0, A1
1086
1087 openAVX2ShortDone:
1088 VZEROUPPER
1089 JMP openSSETail16
1090
1091 // ----------------------------------------------------------------------------
1092 // Special optimization for buffers smaller than 321 bytes
1093 openAVX2320:
1094 // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
1095 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
1096 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
1097 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
1098 MOVQ $10, itr2
1099
1100 openAVX2320InnerCipherLoop:
1101 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1102 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
1103 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1104 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
1105 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1106 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
1107 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1108 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
1109 DECQ itr2
1110 JNE openAVX2320InnerCipherLoop
1111
1112 VMOVDQA ·chacha20Constants<>(SB), TT0
1113 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
1114 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
1115 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
1116 VMOVDQA ·avx2IncMask<>(SB), TT0
1117 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
1118 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
1119 VPADDD TT3, DD2, DD2
1120
1121 // Clamp and store poly key
1122 VPERM2I128 $0x02, AA0, BB0, TT0
1123 VPAND ·polyClampMask<>(SB), TT0, TT0
1124 VMOVDQA TT0, rsStoreAVX2
1125
1126 // Stream for up to 320 bytes
1127 VPERM2I128 $0x13, AA0, BB0, AA0
1128 VPERM2I128 $0x13, CC0, DD0, BB0
1129 VPERM2I128 $0x02, AA1, BB1, CC0
1130 VPERM2I128 $0x02, CC1, DD1, DD0
1131 VPERM2I128 $0x13, AA1, BB1, AA1
1132 VPERM2I128 $0x13, CC1, DD1, BB1
1133 VPERM2I128 $0x02, AA2, BB2, CC1
1134 VPERM2I128 $0x02, CC2, DD2, DD1
1135 VPERM2I128 $0x13, AA2, BB2, AA2
1136 VPERM2I128 $0x13, CC2, DD2, BB2
1137 JMP openAVX2ShortOpen
1138
1139 // ----------------------------------------------------------------------------
1140 // Special optimization for the last 128 bytes of ciphertext
1141 openAVX2Tail128:
1142 // Need to decrypt up to 128 bytes - prepare two blocks
1143 VMOVDQA ·chacha20Constants<>(SB), AA1
1144 VMOVDQA state1StoreAVX2, BB1
1145 VMOVDQA state2StoreAVX2, CC1
1146 VMOVDQA ctr3StoreAVX2, DD1
1147 VPADDD ·avx2IncMask<>(SB), DD1, DD1
1148 VMOVDQA DD1, DD0
1149
1150 XORQ itr2, itr2
1151 MOVQ inl, itr1
1152 ANDQ $-16, itr1
1153 TESTQ itr1, itr1
1154 JE openAVX2Tail128LoopB
1155
1156 openAVX2Tail128LoopA:
1157 // Perform ChaCha rounds, while hashing the remaining input
1158 polyAdd(0(inp)(itr2*1))
1159 polyMulAVX2
1160
1161 openAVX2Tail128LoopB:
1162 ADDQ $16, itr2
1163 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1164 VPALIGNR $4, BB1, BB1, BB1
1165 VPALIGNR $8, CC1, CC1, CC1
1166 VPALIGNR $12, DD1, DD1, DD1
1167 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1168 VPALIGNR $12, BB1, BB1, BB1
1169 VPALIGNR $8, CC1, CC1, CC1
1170 VPALIGNR $4, DD1, DD1, DD1
1171 CMPQ itr2, itr1
1172 JB openAVX2Tail128LoopA
1173 CMPQ itr2, $160
1174 JNE openAVX2Tail128LoopB
1175
1176 VPADDD ·chacha20Constants<>(SB), AA1, AA1
1177 VPADDD state1StoreAVX2, BB1, BB1
1178 VPADDD state2StoreAVX2, CC1, CC1
1179 VPADDD DD0, DD1, DD1
1180 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1181
1182 openAVX2TailLoop:
1183 CMPQ inl, $32
1184 JB openAVX2Tail
1185 SUBQ $32, inl
1186
1187 // Load for decryption
1188 VPXOR (inp), AA0, AA0
1189 VMOVDQU AA0, (oup)
1190 LEAQ (1*32)(inp), inp
1191 LEAQ (1*32)(oup), oup
1192 VMOVDQA BB0, AA0
1193 VMOVDQA CC0, BB0
1194 VMOVDQA DD0, CC0
1195 JMP openAVX2TailLoop
1196
1197 openAVX2Tail:
1198 CMPQ inl, $16
1199 VMOVDQA A0, A1
1200 JB openAVX2TailDone
1201 SUBQ $16, inl
1202
1203 // Load for decryption
1204 VPXOR (inp), A0, T0
1205 VMOVDQU T0, (oup)
1206 LEAQ (1*16)(inp), inp
1207 LEAQ (1*16)(oup), oup
1208 VPERM2I128 $0x11, AA0, AA0, AA0
1209 VMOVDQA A0, A1
1210
1211 openAVX2TailDone:
1212 VZEROUPPER
1213 JMP openSSETail16
1214
1215 // ----------------------------------------------------------------------------
1216 // Special optimization for the last 256 bytes of ciphertext
1217 openAVX2Tail256:
1218 // Need to decrypt up to 256 bytes - prepare four blocks
1219 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
1220 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
1221 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
1222 VMOVDQA ctr3StoreAVX2, DD0
1223 VPADDD ·avx2IncMask<>(SB), DD0, DD0
1224 VPADDD ·avx2IncMask<>(SB), DD0, DD1
1225 VMOVDQA DD0, TT1
1226 VMOVDQA DD1, TT2
1227
1228 // Compute the number of iterations that will hash data
1229 MOVQ inl, tmpStoreAVX2
1230 MOVQ inl, itr1
1231 SUBQ $128, itr1
1232 SHRQ $4, itr1
1233 MOVQ $10, itr2
1234 CMPQ itr1, $10
1235 CMOVQGT itr2, itr1
1236 MOVQ inp, inl
1237 XORQ itr2, itr2
1238
1239 openAVX2Tail256LoopA:
1240 polyAdd(0(inl))
1241 polyMulAVX2
1242 LEAQ 16(inl), inl
1243
1244 // Perform ChaCha rounds, while hashing the remaining input
1245 openAVX2Tail256LoopB:
1246 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1247 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
1248 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1249 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
1250 INCQ itr2
1251 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1252 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
1253 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1254 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
1255 CMPQ itr2, itr1
1256 JB openAVX2Tail256LoopA
1257
1258 CMPQ itr2, $10
1259 JNE openAVX2Tail256LoopB
1260
1261 MOVQ inl, itr2
1262 SUBQ inp, inl
1263 MOVQ inl, itr1
1264 MOVQ tmpStoreAVX2, inl
1265
1266 // Hash the remainder of data (if any)
1267 openAVX2Tail256Hash:
1268 ADDQ $16, itr1
1269 CMPQ itr1, inl
1270 JGT openAVX2Tail256HashEnd
1271 polyAdd(0(itr2))
1272 polyMulAVX2
1273 LEAQ 16(itr2), itr2
1274 JMP openAVX2Tail256Hash
1275
1276 // Store 128 bytes safely, then go to store loop
1277 openAVX2Tail256HashEnd:
1278 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
1279 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
1280 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
1281 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
1282 VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
1283 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1284
1285 VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
1286 VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
1287 LEAQ (4*32)(inp), inp
1288 LEAQ (4*32)(oup), oup
1289 SUBQ $4*32, inl
1290
1291 JMP openAVX2TailLoop
1292
1293 // ----------------------------------------------------------------------------
1294 // Special optimization for the last 384 bytes of ciphertext
1295 openAVX2Tail384:
1296 // Need to decrypt up to 384 bytes - prepare six blocks
1297 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
1298 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
1299 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
1300 VMOVDQA ctr3StoreAVX2, DD0
1301 VPADDD ·avx2IncMask<>(SB), DD0, DD0
1302 VPADDD ·avx2IncMask<>(SB), DD0, DD1
1303 VPADDD ·avx2IncMask<>(SB), DD1, DD2
1304 VMOVDQA DD0, ctr0StoreAVX2
1305 VMOVDQA DD1, ctr1StoreAVX2
1306 VMOVDQA DD2, ctr2StoreAVX2
1307
1308 // Compute the number of iterations that will hash two blocks of data
1309 MOVQ inl, tmpStoreAVX2
1310 MOVQ inl, itr1
1311 SUBQ $256, itr1
1312 SHRQ $4, itr1
1313 ADDQ $6, itr1
1314 MOVQ $10, itr2
1315 CMPQ itr1, $10
1316 CMOVQGT itr2, itr1
1317 MOVQ inp, inl
1318 XORQ itr2, itr2
1319
1320 // Perform ChaCha rounds, while hashing the remaining input
1321 openAVX2Tail384LoopB:
1322 polyAdd(0(inl))
1323 polyMulAVX2
1324 LEAQ 16(inl), inl
1325
1326 openAVX2Tail384LoopA:
1327 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1328 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
1329 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1330 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
1331 polyAdd(0(inl))
1332 polyMulAVX2
1333 LEAQ 16(inl), inl
1334 INCQ itr2
1335 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1336 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
1337 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1338 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
1339
1340 CMPQ itr2, itr1
1341 JB openAVX2Tail384LoopB
1342
1343 CMPQ itr2, $10
1344 JNE openAVX2Tail384LoopA
1345
1346 MOVQ inl, itr2
1347 SUBQ inp, inl
1348 MOVQ inl, itr1
1349 MOVQ tmpStoreAVX2, inl
1350
1351 openAVX2Tail384Hash:
1352 ADDQ $16, itr1
1353 CMPQ itr1, inl
1354 JGT openAVX2Tail384HashEnd
1355 polyAdd(0(itr2))
1356 polyMulAVX2
1357 LEAQ 16(itr2), itr2
1358 JMP openAVX2Tail384Hash
1359
1360 // Store 256 bytes safely, then go to store loop
1361 openAVX2Tail384HashEnd:
1362 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
1363 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
1364 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
1365 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
1366 VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
1367 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
1368 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
1369 VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
1370 VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
1371 VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
1372 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
1373 LEAQ (8*32)(inp), inp
1374 LEAQ (8*32)(oup), oup
1375 SUBQ $8*32, inl
1376 JMP openAVX2TailLoop
1377
1378 // ----------------------------------------------------------------------------
1379 // Special optimization for the last 512 bytes of ciphertext
1380 openAVX2Tail512:
1381 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
1382 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
1383 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
1384 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
1385 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
1386 XORQ itr1, itr1
1387 MOVQ inp, itr2
1388
1389 openAVX2Tail512LoopB:
1390 polyAdd(0(itr2))
1391 polyMulAVX2
1392 LEAQ (2*8)(itr2), itr2
1393
1394 openAVX2Tail512LoopA:
1395 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1396 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1397 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
1398 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1399 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1400 VMOVDQA CC3, tmpStoreAVX2
1401 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
1402 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
1403 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
1404 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
1405 VMOVDQA tmpStoreAVX2, CC3
1406 polyAdd(0*8(itr2))
1407 polyMulAVX2
1408 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1409 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1410 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
1411 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1412 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1413 VMOVDQA CC3, tmpStoreAVX2
1414 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
1415 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
1416 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
1417 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
1418 VMOVDQA tmpStoreAVX2, CC3
1419 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
1420 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
1421 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
1422 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1423 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1424 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
1425 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1426 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1427 polyAdd(2*8(itr2))
1428 polyMulAVX2
1429 LEAQ (4*8)(itr2), itr2
1430 VMOVDQA CC3, tmpStoreAVX2
1431 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
1432 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
1433 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
1434 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
1435 VMOVDQA tmpStoreAVX2, CC3
1436 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1437 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1438 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
1439 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1440 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1441 VMOVDQA CC3, tmpStoreAVX2
1442 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
1443 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
1444 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
1445 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
1446 VMOVDQA tmpStoreAVX2, CC3
1447 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
1448 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
1449 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
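// Ten double rounds in total: the first four iterations also pass through LoopB above, hashing an extra 16 bytes of ciphertext each (LoopA itself hashes 32 bytes per iteration)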
1450 INCQ itr1
1451 CMPQ itr1, $4
1452 JLT openAVX2Tail512LoopB
1453
1454 CMPQ itr1, $10
1455 JNE openAVX2Tail512LoopA
1456
1457 MOVQ inl, itr1
1458 SUBQ $384, itr1
1459 ANDQ $-16, itr1
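// The rounds above already hashed 384 bytes of ciphertext; itr1 now holds the number of remaining bytes to hash, rounded down to a multiple of 16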
1460
1461 openAVX2Tail512HashLoop:
1462 TESTQ itr1, itr1
1463 JE openAVX2Tail512HashEnd
1464 polyAdd(0(itr2))
1465 polyMulAVX2
1466 LEAQ 16(itr2), itr2
1467 SUBQ $16, itr1
1468 JMP openAVX2Tail512HashLoop
1469
1470 openAVX2Tail512HashEnd:
1471 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
1472 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
1473 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
1474 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
1475 VMOVDQA CC3, tmpStoreAVX2
1476 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
1477 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
1478 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
1479 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1480 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
1481 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
1482 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
1483 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
1484 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
1485 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
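// The last 128 bytes of keystream stay in AA0/BB0/CC0/DD0 for openAVX2TailLoop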
1486
1487 LEAQ (12*32)(inp), inp
1488 LEAQ (12*32)(oup), oup
1489 SUBQ $12*32, inl
1490
1491 JMP openAVX2TailLoop
1492
1493 // ----------------------------------------------------------------------------
1494 // ----------------------------------------------------------------------------
1495 // func chacha20Poly1305Seal(dst, key, src, ad []byte)
1496 TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
1497 // For aligned stack access
1498 MOVQ SP, BP
1499 ADDQ $32, BP
1500 ANDQ $-32, BP
1501 MOVQ dst+0(FP), oup
1502 MOVQ key+24(FP), keyp
1503 MOVQ src+48(FP), inp
1504 MOVQ src_len+56(FP), inl
1505 MOVQ ad+72(FP), adp
1506
1507 CMPB ·useAVX2(SB), $1
1508 JE chacha20Poly1305Seal_AVX2
1509
1510 // Special optimization for very short buffers

1511 CMPQ inl, $128
1512 JBE sealSSE128 // About 15% faster
1513
1514 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
1515 MOVOU ·chacha20Constants<>(SB), A0
1516 MOVOU (1*16)(keyp), B0
1517 MOVOU (2*16)(keyp), C0
1518 MOVOU (3*16)(keyp), D0
1519
1520 // Store state on stack for future use
1521 MOVO B0, state1Store
1522 MOVO C0, state2Store
1523
1524 // Load state, increment counter blocks
1525 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1526 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1527 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
1528
1529 // Store counters
1530 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
1531 MOVQ $10, itr2
1532
1533 sealSSEIntroLoop:
1534 MOVO C3, tmpStore
1535 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1536 MOVO tmpStore, C3
1537 MOVO C1, tmpStore
1538 chachaQR(A3, B3, C3, D3, C1)
1539 MOVO tmpStore, C1
1540 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
1541 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
1542 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
1543
1544 MOVO C3, tmpStore
1545 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1546 MOVO tmpStore, C3
1547 MOVO C1, tmpStore
1548 chachaQR(A3, B3, C3, D3, C1)
1549 MOVO tmpStore, C1
1550 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
1551 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
1552 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
1553 DECQ itr2
1554 JNE sealSSEIntroLoop
1555
1556 // Add in the state
1557 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
1558 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
1559 PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
1560 PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
1561
1562 // Clamp and store the key
1563 PAND ·polyClampMask<>(SB), A0
1564 MOVO A0, rStore
1565 MOVO B0, sStore
1566
1567 // Hash AAD
1568 MOVQ ad_len+80(FP), itr2
1569 CALL polyHashADInternal<>(SB)
1570
1571 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1572 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
1573 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
1574 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
1575 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
1576 MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
1577
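// 128 bytes of ciphertext were just written; itr1 tracks how many bytes are encrypted but not yet hashed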
1578 MOVQ $128, itr1
1579 SUBQ $128, inl
1580 LEAQ 128(inp), inp
1581
1582 MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
1583
1584 CMPQ inl, $64
1585 JBE sealSSE128SealHash
1586
1587 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1588 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
1589 MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
1590
1591 ADDQ $64, itr1
1592 SUBQ $64, inl
1593 LEAQ 64(inp), inp
1594
1595 MOVQ $2, itr1
1596 MOVQ $8, itr2
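// itr1/itr2 pace the hashing inside the main loop: its ten double rounds hash exactly the 192 bytes of ciphertext produced so far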
1597
1598 CMPQ inl, $64
1599 JBE sealSSETail64
1600 CMPQ inl, $128
1601 JBE sealSSETail128
1602 CMPQ inl, $192
1603 JBE sealSSETail192
1604
1605 sealSSEMainLoop:
1606 // Load state, increment counter blocks
1607 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
1608 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1609 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1610 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
1611
1612 // Store counters
1613 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
1614
1615 sealSSEInnerLoop:
1616 MOVO C3, tmpStore
1617 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1618 MOVO tmpStore, C3
1619 MOVO C1, tmpStore
1620 chachaQR(A3, B3, C3, D3, C1)
1621 MOVO tmpStore, C1
1622 polyAdd(0(oup))
1623 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
1624 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
1625 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
1626 polyMulStage1
1627 polyMulStage2
1628 LEAQ (2*8)(oup), oup
1629 MOVO C3, tmpStore
1630 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1631 MOVO tmpStore, C3
1632 MOVO C1, tmpStore
1633 polyMulStage3
1634 chachaQR(A3, B3, C3, D3, C1)
1635 MOVO tmpStore, C1
1636 polyMulReduceStage
1637 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
1638 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
1639 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
1640 DECQ itr2
1641 JGE sealSSEInnerLoop
1642 polyAdd(0(oup))
1643 polyMul
1644 LEAQ (2*8)(oup), oup
1645 DECQ itr1
1646 JG sealSSEInnerLoop
1647
1648 // Add in the state
1649 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
1650 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
1651 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
1652 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
1653 MOVO D3, tmpStore
1654
1655 // Load - xor - store
1656 MOVOU (0*16)(inp), D3; PXOR D3, A0
1657 MOVOU (1*16)(inp), D3; PXOR D3, B0
1658 MOVOU (2*16)(inp), D3; PXOR D3, C0
1659 MOVOU (3*16)(inp), D3; PXOR D3, D0
1660 MOVOU A0, (0*16)(oup)
1661 MOVOU B0, (1*16)(oup)
1662 MOVOU C0, (2*16)(oup)
1663 MOVOU D0, (3*16)(oup)
1664 MOVO tmpStore, D3
1665
1666 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
1667 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
1668 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
1669 MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
1670 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
1671 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
1672 ADDQ $192, inp
1673 MOVQ $192, itr1
1674 SUBQ $192, inl
1675 MOVO A3, A1
1676 MOVO B3, B1
1677 MOVO C3, C1
1678 MOVO D3, D1
1679 CMPQ inl, $64
1680 JBE sealSSE128SealHash
1681 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1682 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
1683 MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
1684 LEAQ 64(inp), inp
1685 SUBQ $64, inl
1686 MOVQ $6, itr1
1687 MOVQ $4, itr2
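// From now on every main loop iteration hashes the full 256 bytes of ciphertext written by the previous iteration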
1688 CMPQ inl, $192
1689 JG sealSSEMainLoop
1690
1691 MOVQ inl, itr1
1692 TESTQ inl, inl
1693 JE sealSSE128SealHash
1694 MOVQ $6, itr1
1695 CMPQ inl, $64
1696 JBE sealSSETail64
1697 CMPQ inl, $128
1698 JBE sealSSETail128
1699 JMP sealSSETail192
1700
1701 // ----------------------------------------------------------------------------
1702 // Special optimization for the last 64 bytes of plaintext
1703 sealSSETail64:
1704 // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
1705 MOVO ·chacha20Constants<>(SB), A1
1706 MOVO state1Store, B1
1707 MOVO state2Store, C1
1708 MOVO ctr3Store, D1
1709 PADDL ·sseIncMask<>(SB), D1
1710 MOVO D1, ctr0Store
1711
1712 sealSSETail64LoopA:
1713 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1714 polyAdd(0(oup))
1715 polyMul
1716 LEAQ 16(oup), oup
1717
1718 sealSSETail64LoopB:
1719 chachaQR(A1, B1, C1, D1, T1)
1720 shiftB1Left; shiftC1Left; shiftD1Left
1721 chachaQR(A1, B1, C1, D1, T1)
1722 shiftB1Right; shiftC1Right; shiftD1Right
1723 polyAdd(0(oup))
1724 polyMul
1725 LEAQ 16(oup), oup
1726
1727 DECQ itr1
1728 JG sealSSETail64LoopA
1729
1730 DECQ itr2
1731 JGE sealSSETail64LoopB
1732 PADDL ·chacha20Constants<>(SB), A1
1733 PADDL state1Store, B1
1734 PADDL state2Store, C1
1735 PADDL ctr0Store, D1
1736
1737 JMP sealSSE128Seal
1738
1739 // ----------------------------------------------------------------------------
1740 // Special optimization for the last 128 bytes of plaintext
1741 sealSSETail128:
1742 // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
1743 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
1744 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
1745
1746 sealSSETail128LoopA:
1747 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1748 polyAdd(0(oup))
1749 polyMul
1750 LEAQ 16(oup), oup
1751
1752 sealSSETail128LoopB:
1753 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
1754 shiftB0Left; shiftC0Left; shiftD0Left
1755 shiftB1Left; shiftC1Left; shiftD1Left
1756 polyAdd(0(oup))
1757 polyMul
1758 LEAQ 16(oup), oup
1759 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
1760 shiftB0Right; shiftC0Right; shiftD0Right
1761 shiftB1Right; shiftC1Right; shiftD1Right
1762
1763 DECQ itr1
1764 JG sealSSETail128LoopA
1765
1766 DECQ itr2
1767 JGE sealSSETail128LoopB
1768
1769 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
1770 PADDL state1Store, B0; PADDL state1Store, B1
1771 PADDL state2Store, C0; PADDL state2Store, C1
1772 PADDL ctr0Store, D0; PADDL ctr1Store, D1
1773
1774 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
1775 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
1776 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
1777
1778 MOVQ $64, itr1
1779 LEAQ 64(inp), inp
1780 SUBQ $64, inl
1781
1782 JMP sealSSE128SealHash
1783
1784 // ----------------------------------------------------------------------------
1785 // Special optimization for the last 192 bytes of plaintext
1786 sealSSETail192:
1787 // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
1788 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
1789 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
1790 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
1791
1792 sealSSETail192LoopA:
1793 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1794 polyAdd(0(oup))
1795 polyMul
1796 LEAQ 16(oup), oup
1797
1798 sealSSETail192LoopB:
1799 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1800 shiftB0Left; shiftC0Left; shiftD0Left
1801 shiftB1Left; shiftC1Left; shiftD1Left
1802 shiftB2Left; shiftC2Left; shiftD2Left
1803
1804 polyAdd(0(oup))
1805 polyMul
1806 LEAQ 16(oup), oup
1807
1808 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1809 shiftB0Right; shiftC0Right; shiftD0Right
1810 shiftB1Right; shiftC1Right; shiftD1Right
1811 shiftB2Right; shiftC2Right; shiftD2Right
1812
1813 DECQ itr1
1814 JG sealSSETail192LoopA
1815
1816 DECQ itr2
1817 JGE sealSSETail192LoopB
1818
1819 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
1820 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
1821 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
1822 PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
1823
1824 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
1825 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
1826 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
1827 MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
1828 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
1829 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
1830
1831 MOVO A2, A1
1832 MOVO B2, B1
1833 MOVO C2, C1
1834 MOVO D2, D1
1835 MOVQ $128, itr1
1836 LEAQ 128(inp), inp
1837 SUBQ $128, inl
1838
1839 JMP sealSSE128SealHash
1840
1841 // ----------------------------------------------------------------------------
1842 // Special seal optimization for buffers smaller than 129 bytes
1843 sealSSE128:
1844 // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
1845 MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
1846 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1847 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1848 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
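// Save the initial state, to be added back in after the rounds (feed-forward)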
1849 MOVQ $10, itr2
1850
1851 sealSSE128InnerCipherLoop:
1852 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1853 shiftB0Left; shiftB1Left; shiftB2Left
1854 shiftC0Left; shiftC1Left; shiftC2Left
1855 shiftD0Left; shiftD1Left; shiftD2Left
1856 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1857 shiftB0Right; shiftB1Right; shiftB2Right
1858 shiftC0Right; shiftC1Right; shiftC2Right
1859 shiftD0Right; shiftD1Right; shiftD2Right
1860 DECQ itr2
1861 JNE sealSSE128InnerCipherLoop
1862
1863 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
1864 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
1865 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
1866 PADDL T2, C1; PADDL T2, C2
1867 PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
1868 PAND ·polyClampMask<>(SB), A0
1869 MOVOU A0, rStore
1870 MOVOU B0, sStore
1871
1872 // Hash
1873 MOVQ ad_len+80(FP), itr2
1874 CALL polyHashADInternal<>(SB)
1875 XORQ itr1, itr1
1876
1877 sealSSE128SealHash:
1878 // itr1 holds the number of bytes encrypted but not yet hashed
1879 CMPQ itr1, $16
1880 JB sealSSE128Seal
1881 polyAdd(0(oup))
1882 polyMul
1883
1884 SUBQ $16, itr1
1885 ADDQ $16, oup
1886
1887 JMP sealSSE128SealHash
1888
1889 sealSSE128Seal:
1890 CMPQ inl, $16
1891 JB sealSSETail
1892 SUBQ $16, inl
1893
1894 // Load for encryption
1895 MOVOU (inp), T0
1896 PXOR T0, A1
1897 MOVOU A1, (oup)
1898 LEAQ (1*16)(inp), inp
1899 LEAQ (1*16)(oup), oup
1900
1901 // Extract for hashing
1902 MOVQ A1, t0
1903 PSRLDQ $8, A1
1904 MOVQ A1, t1
1905 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
1906 polyMul
1907
1908 // Shift the stream "left"
1909 MOVO B1, A1
1910 MOVO C1, B1
1911 MOVO D1, C1
1912 MOVO A2, D1
1913 MOVO B2, A2
1914 MOVO C2, B2
1915 MOVO D2, C2
1916 JMP sealSSE128Seal
1917
1918 sealSSETail:
1919 TESTQ inl, inl
1920 JE sealSSEFinalize
1921
1922 // We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
1923 MOVQ inl, itr2
1924 SHLQ $4, itr2
1925 LEAQ ·andMask<>(SB), t0
1926 MOVQ inl, itr1
1927 LEAQ -1(inp)(inl*1), inp
1928 XORQ t2, t2
1929 XORQ t3, t3
1930 XORQ AX, AX
1931
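// Read the remaining plaintext backwards, one byte at a time, shifting it into t3:t2 so the bytes end up in little-endian order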
1932 sealSSETailLoadLoop:
1933 SHLQ $8, t2, t3
1934 SHLQ $8, t2
1935 MOVB (inp), AX
1936 XORQ AX, t2
1937 LEAQ -1(inp), inp
1938 DECQ itr1
1939 JNE sealSSETailLoadLoop
1940 MOVQ t2, 0+tmpStore
1941 MOVQ t3, 8+tmpStore
1942 PXOR 0+tmpStore, A1
1943 MOVOU A1, (oup)
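// Mask the ciphertext block down to its real length (using the ·andMask<> table) before feeding it to Poly1305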
1944 MOVOU -16(t0)(itr2*1), T0
1945 PAND T0, A1
1946 MOVQ A1, t0
1947 PSRLDQ $8, A1
1948 MOVQ A1, t1
1949 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
1950 polyMul
1951
1952 ADDQ inl, oup
1953
1954 sealSSEFinalize:
1955 // Hash in the buffer lengths
1956 ADDQ ad_len+80(FP), acc0
1957 ADCQ src_len+56(FP), acc1
1958 ADCQ $1, acc2
1959 polyMul
1960
1961 // Final reduce
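// Compute acc + 5 - 2^130 (i.e. acc - (2^130 - 5)) using SUB/SBB with negative immediates; if the subtraction borrows, acc < 2^130 - 5 and the CMOVs below keep the original value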
1962 MOVQ acc0, t0
1963 MOVQ acc1, t1
1964 MOVQ acc2, t2
1965 SUBQ $-5, acc0
1966 SBBQ $-1, acc1
1967 SBBQ $3, acc2
1968 CMOVQCS t0, acc0
1969 CMOVQCS t1, acc1
1970 CMOVQCS t2, acc2
1971
1972 // Add in the "s" part of the key
1973 ADDQ 0+sStore, acc0
1974 ADCQ 8+sStore, acc1
1975
1976 // Finally store the tag at the end of the message
1977 MOVQ acc0, (0*8)(oup)
1978 MOVQ acc1, (1*8)(oup)
1979 RET
1980
1981 // ----------------------------------------------------------------------------
1982 // ------------------------- AVX2 Code ----------------------------------------
1983 chacha20Poly1305Seal_AVX2:
1984 VZEROUPPER
1985 VMOVDQU ·chacha20Constants<>(SB), AA0
1986 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
1987 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
1988 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
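// The BYTE sequences above are hand-encoded VBROADCASTI128 instructions: they broadcast the key halves at 16(keyp)/32(keyp) and the counter/nonce at 48(keyp) into both 128-bit lanes of BB0, CC0 and DD0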
1989 VPADDD ·avx2InitMask<>(SB), DD0, DD0
1990
1991 // Special optimizations for very short buffers
1992 CMPQ inl, $192
1993 JBE seal192AVX2 // 33% faster
1994 CMPQ inl, $320
1995 JBE seal320AVX2 // 17% faster
1996
1997 // For the general case, prepare the poly key first - as a byproduct we get 64 bytes of cipher stream
1998 VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
1999 VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
2000 VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
2001 VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
2002 VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
2003 VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
2004 VMOVDQA DD3, ctr3StoreAVX2
2005 MOVQ $10, itr2
2006
2007 sealAVX2IntroLoop:
2008 VMOVDQA CC3, tmpStoreAVX2
2009 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2010 VMOVDQA tmpStoreAVX2, CC3
2011 VMOVDQA CC1, tmpStoreAVX2
2012 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2013 VMOVDQA tmpStoreAVX2, CC1
2014
2015 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
2016 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
2017 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
2018 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
2019
2020 VMOVDQA CC3, tmpStoreAVX2
2021 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2022 VMOVDQA tmpStoreAVX2, CC3
2023 VMOVDQA CC1, tmpStoreAVX2
2024 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2025 VMOVDQA tmpStoreAVX2, CC1
2026
2027 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
2028 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
2029 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
2030 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
2031 DECQ itr2
2032 JNE sealAVX2IntroLoop
2033
2034 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2035 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2036 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2037 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2038
2039 VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
2040 VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
2041 VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
2042
2043 // Clamp and store poly key
2044 VPAND ·polyClampMask<>(SB), DD0, DD0
2045 VMOVDQA DD0, rsStoreAVX2
2046
2047 // Hash AD
2048 MOVQ ad_len+80(FP), itr2
2049 CALL polyHashADInternal<>(SB)
2050
2051 // Can store at least 320 bytes
2052 VPXOR (0*32)(inp), AA0, AA0
2053 VPXOR (1*32)(inp), CC0, CC0
2054 VMOVDQU AA0, (0*32)(oup)
2055 VMOVDQU CC0, (1*32)(oup)
2056
2057 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
2058 VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
2059 VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
2060 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
2061 VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
2062 VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
2063
2064 MOVQ $320, itr1
2065 SUBQ $320, inl
2066 LEAQ 320(inp), inp
2067
2068 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
2069 CMPQ inl, $128
2070 JBE sealAVX2SealHash
2071
2072 VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
2073 VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
2074 SUBQ $128, inl
2075 LEAQ 128(inp), inp
2076
2077 MOVQ $8, itr1
2078 MOVQ $2, itr2
2079
2080 CMPQ inl, $128
2081 JBE sealAVX2Tail128
2082 CMPQ inl, $256
2083 JBE sealAVX2Tail256
2084 CMPQ inl, $384
2085 JBE sealAVX2Tail384
2086 CMPQ inl, $512
2087 JBE sealAVX2Tail512
2088
2089 // We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
2090 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2091 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2092 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2093 VMOVDQA ctr3StoreAVX2, DD0
2094 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2095 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2096
2097 VMOVDQA CC3, tmpStoreAVX2
2098 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2099 VMOVDQA tmpStoreAVX2, CC3
2100 VMOVDQA CC1, tmpStoreAVX2
2101 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2102 VMOVDQA tmpStoreAVX2, CC1
2103
2104 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
2105 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
2106 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
2107 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
2108
2109 VMOVDQA CC3, tmpStoreAVX2
2110 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2111 VMOVDQA tmpStoreAVX2, CC3
2112 VMOVDQA CC1, tmpStoreAVX2
2113 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2114 VMOVDQA tmpStoreAVX2, CC1
2115
2116 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
2117 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
2118 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
2119 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
2120 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2121 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2122 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2123 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2124 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2125 VMOVDQA CC3, tmpStoreAVX2
2126 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2127 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2128 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2129 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2130 VMOVDQA tmpStoreAVX2, CC3
2131
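// The code above already ran the first double round and the start of the second; enter the loop at sealAVX2InternalLoopStart with itr1 = 9 to finish the remaining rounds.
// Back the hash pointer up 16 bytes because this entry point skips the loop's first polyAdd, so the remaining polyAdd offsets still line up with the ciphertext.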
2132 SUBQ $16, oup // Adjust the pointer
2133 MOVQ $9, itr1
2134 JMP sealAVX2InternalLoopStart
2135
2136 sealAVX2MainLoop:
2137 // Load state, increment counter blocks, store the incremented counters
2138 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2139 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2140 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2141 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2142 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2143 MOVQ $10, itr1
2144
2145 sealAVX2InternalLoop:
2146 polyAdd(0*8(oup))
2147 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2148 polyMulStage1_AVX2
2149 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2150 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2151 polyMulStage2_AVX2
2152 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2153 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2154 polyMulStage3_AVX2
2155 VMOVDQA CC3, tmpStoreAVX2
2156 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2157 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2158 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2159 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2160 VMOVDQA tmpStoreAVX2, CC3
2161 polyMulReduceStage
2162
2163 sealAVX2InternalLoopStart:
2164 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2165 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2166 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2167 polyAdd(2*8(oup))
2168 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2169 polyMulStage1_AVX2
2170 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2171 VMOVDQA CC3, tmpStoreAVX2
2172 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2173 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2174 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2175 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2176 VMOVDQA tmpStoreAVX2, CC3
2177 polyMulStage2_AVX2
2178 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
2179 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2180 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
2181 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2182 polyMulStage3_AVX2
2183 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2184 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2185 polyMulReduceStage
2186 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2187 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2188 polyAdd(4*8(oup))
2189 LEAQ (6*8)(oup), oup
2190 VMOVDQA CC3, tmpStoreAVX2
2191 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2192 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2193 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2194 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2195 VMOVDQA tmpStoreAVX2, CC3
2196 polyMulStage1_AVX2
2197 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2198 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2199 polyMulStage2_AVX2
2200 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2201 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2202 polyMulStage3_AVX2
2203 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2204 VMOVDQA CC3, tmpStoreAVX2
2205 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2206 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2207 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2208 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2209 VMOVDQA tmpStoreAVX2, CC3
2210 polyMulReduceStage
2211 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
2212 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2213 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
2214 DECQ itr1
2215 JNE sealAVX2InternalLoop
2216
2217 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2218 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2219 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2220 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2221 VMOVDQA CC3, tmpStoreAVX2
2222
2223 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here
2224 polyAdd(0*8(oup))
2225 polyMulAVX2
2226 LEAQ (4*8)(oup), oup
2227 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
2228 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
2229 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
2230 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
2231 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
2232 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
2233
2234 // and here
2235 polyAdd(-2*8(oup))
2236 polyMulAVX2
2237 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
2238 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
2239 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
2240 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
2241 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
2242 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
2243 LEAQ (32*16)(inp), inp
2244 SUBQ $(32*16), inl
2245 CMPQ inl, $512
2246 JG sealAVX2MainLoop
2247
2248 // Tail can only hash 480 bytes
2249 polyAdd(0*8(oup))
2250 polyMulAVX2
2251 polyAdd(2*8(oup))
2252 polyMulAVX2
2253 LEAQ 32(oup), oup
2254
2255 MOVQ $10, itr1
2256 MOVQ $0, itr2
2257 CMPQ inl, $128
2258 JBE sealAVX2Tail128
2259 CMPQ inl, $256
2260 JBE sealAVX2Tail256
2261 CMPQ inl, $384
2262 JBE sealAVX2Tail384
2263 JMP sealAVX2Tail512
2264
2265 // ----------------------------------------------------------------------------
2266 // Special optimization for buffers smaller than 193 bytes
2267 seal192AVX2:
2268 // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
2269 VMOVDQA AA0, AA1
2270 VMOVDQA BB0, BB1
2271 VMOVDQA CC0, CC1
2272 VPADDD ·avx2IncMask<>(SB), DD0, DD1
2273 VMOVDQA AA0, AA2
2274 VMOVDQA BB0, BB2
2275 VMOVDQA CC0, CC2
2276 VMOVDQA DD0, DD2
2277 VMOVDQA DD1, TT3
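// AA2/BB2/CC2, DD2 and TT3 hold copies of the initial state for the feed-forward addition after the rounds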
2278 MOVQ $10, itr2
2279
2280 sealAVX2192InnerCipherLoop:
2281 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2282 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
2283 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2284 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
2285 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2286 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
2287 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2288 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
2289 DECQ itr2
2290 JNE sealAVX2192InnerCipherLoop
2291 VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
2292 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
2293 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
2294 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
2295 VPERM2I128 $0x02, AA0, BB0, TT0
2296
2297 // Clamp and store poly key
2298 VPAND ·polyClampMask<>(SB), TT0, TT0
2299 VMOVDQA TT0, rsStoreAVX2
2300
2301 // Stream for up to 192 bytes
2302 VPERM2I128 $0x13, AA0, BB0, AA0
2303 VPERM2I128 $0x13, CC0, DD0, BB0
2304 VPERM2I128 $0x02, AA1, BB1, CC0
2305 VPERM2I128 $0x02, CC1, DD1, DD0
2306 VPERM2I128 $0x13, AA1, BB1, AA1
2307 VPERM2I128 $0x13, CC1, DD1, BB1
2308
2309 sealAVX2ShortSeal:
2310 // Hash aad
2311 MOVQ ad_len+80(FP), itr2
2312 CALL polyHashADInternal<>(SB)
2313 XORQ itr1, itr1
2314
2315 sealAVX2SealHash:
2316 // itr1 holds the number of bytes encrypted but not yet hashed
2317 CMPQ itr1, $16
2318 JB sealAVX2ShortSealLoop
2319 polyAdd(0(oup))
2320 polyMul
2321 SUBQ $16, itr1
2322 ADDQ $16, oup
2323 JMP sealAVX2SealHash
2324
2325 sealAVX2ShortSealLoop:
2326 CMPQ inl, $32
2327 JB sealAVX2ShortTail32
2328 SUBQ $32, inl
2329
2330 // Load for encryption
2331 VPXOR (inp), AA0, AA0
2332 VMOVDQU AA0, (oup)
2333 LEAQ (1*32)(inp), inp
2334
2335 // Now we can hash
2336 polyAdd(0*8(oup))
2337 polyMulAVX2
2338 polyAdd(2*8(oup))
2339 polyMulAVX2
2340 LEAQ (1*32)(oup), oup
2341
2342 // Shift stream left
2343 VMOVDQA BB0, AA0
2344 VMOVDQA CC0, BB0
2345 VMOVDQA DD0, CC0
2346 VMOVDQA AA1, DD0
2347 VMOVDQA BB1, AA1
2348 VMOVDQA CC1, BB1
2349 VMOVDQA DD1, CC1
2350 VMOVDQA AA2, DD1
2351 VMOVDQA BB2, AA2
2352 JMP sealAVX2ShortSealLoop
2353
2354 sealAVX2ShortTail32:
2355 CMPQ inl, $16
2356 VMOVDQA A0, A1
2357 JB sealAVX2ShortDone
2358
2359 SUBQ $16, inl
2360
2361 // Load for encryption
2362 VPXOR (inp), A0, T0
2363 VMOVDQU T0, (oup)
2364 LEAQ (1*16)(inp), inp
2365
2366 // Hash
2367 polyAdd(0*8(oup))
2368 polyMulAVX2
2369 LEAQ (1*16)(oup), oup
2370 VPERM2I128 $0x11, AA0, AA0, AA0
2371 VMOVDQA A0, A1
2372
2373 sealAVX2ShortDone:
2374 VZEROUPPER
2375 JMP sealSSETail
2376
2377 // ----------------------------------------------------------------------------
2378 // Special optimization for buffers smaller than 321 bytes
2379 seal320AVX2:
2380 // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
2381 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
2382 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
2383 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
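// TT1/TT2/TT3 save the initial key and counter words for the feed-forward addition after the rounds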
2384 MOVQ $10, itr2
2385
2386 sealAVX2320InnerCipherLoop:
2387 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2388 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
2389 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2390 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
2391 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2392 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
2393 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2394 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
2395 DECQ itr2
2396 JNE sealAVX2320InnerCipherLoop
2397
2398 VMOVDQA ·chacha20Constants<>(SB), TT0
2399 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
2400 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
2401 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
2402 VMOVDQA ·avx2IncMask<>(SB), TT0
2403 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
2404 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
2405 VPADDD TT3, DD2, DD2
2406
2407 // Clamp and store poly key
2408 VPERM2I128 $0x02, AA0, BB0, TT0
2409 VPAND ·polyClampMask<>(SB), TT0, TT0
2410 VMOVDQA TT0, rsStoreAVX2
2411
2412 // Stream for up to 320 bytes
2413 VPERM2I128 $0x13, AA0, BB0, AA0
2414 VPERM2I128 $0x13, CC0, DD0, BB0
2415 VPERM2I128 $0x02, AA1, BB1, CC0
2416 VPERM2I128 $0x02, CC1, DD1, DD0
2417 VPERM2I128 $0x13, AA1, BB1, AA1
2418 VPERM2I128 $0x13, CC1, DD1, BB1
2419 VPERM2I128 $0x02, AA2, BB2, CC1
2420 VPERM2I128 $0x02, CC2, DD2, DD1
2421 VPERM2I128 $0x13, AA2, BB2, AA2
2422 VPERM2I128 $0x13, CC2, DD2, BB2
2423 JMP sealAVX2ShortSeal
2424
2425 // ----------------------------------------------------------------------------
2426 // Special optimization for the last 128 bytes of plaintext
2427 sealAVX2Tail128:
2428 // Need to encrypt up to 128 bytes - prepare two blocks
2429 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2430 // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2431 VMOVDQA ·chacha20Constants<>(SB), AA0
2432 VMOVDQA state1StoreAVX2, BB0
2433 VMOVDQA state2StoreAVX2, CC0
2434 VMOVDQA ctr3StoreAVX2, DD0
2435 VPADDD ·avx2IncMask<>(SB), DD0, DD0
2436 VMOVDQA DD0, DD1
2437
2438 sealAVX2Tail128LoopA:
2439 polyAdd(0(oup))
2440 polyMul
2441 LEAQ 16(oup), oup
2442
2443 sealAVX2Tail128LoopB:
2444 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
2445 polyAdd(0(oup))
2446 polyMul
2447 VPALIGNR $4, BB0, BB0, BB0
2448 VPALIGNR $8, CC0, CC0, CC0
2449 VPALIGNR $12, DD0, DD0, DD0
2450 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
2451 polyAdd(16(oup))
2452 polyMul
2453 LEAQ 32(oup), oup
2454 VPALIGNR $12, BB0, BB0, BB0
2455 VPALIGNR $8, CC0, CC0, CC0
2456 VPALIGNR $4, DD0, DD0, DD0
2457 DECQ itr1
2458 JG sealAVX2Tail128LoopA
2459 DECQ itr2
2460 JGE sealAVX2Tail128LoopB
2461
2462 VPADDD ·chacha20Constants<>(SB), AA0, AA1
2463 VPADDD state1StoreAVX2, BB0, BB1
2464 VPADDD state2StoreAVX2, CC0, CC1
2465 VPADDD DD1, DD0, DD1
2466
2467 VPERM2I128 $0x02, AA1, BB1, AA0
2468 VPERM2I128 $0x02, CC1, DD1, BB0
2469 VPERM2I128 $0x13, AA1, BB1, CC0
2470 VPERM2I128 $0x13, CC1, DD1, DD0
2471 JMP sealAVX2ShortSealLoop
2472
2473 // ----------------------------------------------------------------------------
2474 // Special optimization for the last 256 bytes of plaintext
2475 sealAVX2Tail256:
2476 // Need to encrypt up to 256 bytes - prepare four blocks
2477 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2478 // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2479 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
2480 VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
2481 VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
2482 VMOVDQA ctr3StoreAVX2, DD0
2483 VPADDD ·avx2IncMask<>(SB), DD0, DD0
2484 VPADDD ·avx2IncMask<>(SB), DD0, DD1
2485 VMOVDQA DD0, TT1
2486 VMOVDQA DD1, TT2
2487
2488 sealAVX2Tail256LoopA:
2489 polyAdd(0(oup))
2490 polyMul
2491 LEAQ 16(oup), oup
2492
2493 sealAVX2Tail256LoopB:
2494 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2495 polyAdd(0(oup))
2496 polyMul
2497 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
2498 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2499 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
2500 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2501 polyAdd(16(oup))
2502 polyMul
2503 LEAQ 32(oup), oup
2504 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
2505 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2506 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
2507 DECQ itr1
2508 JG sealAVX2Tail256LoopA
2509 DECQ itr2
2510 JGE sealAVX2Tail256LoopB
2511
2512 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
2513 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
2514 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
2515 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
2516 VPERM2I128 $0x02, AA0, BB0, TT0
2517 VPERM2I128 $0x02, CC0, DD0, TT1
2518 VPERM2I128 $0x13, AA0, BB0, TT2
2519 VPERM2I128 $0x13, CC0, DD0, TT3
2520 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
2521 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
2522 MOVQ $128, itr1
2523 LEAQ 128(inp), inp
2524 SUBQ $128, inl
2525 VPERM2I128 $0x02, AA1, BB1, AA0
2526 VPERM2I128 $0x02, CC1, DD1, BB0
2527 VPERM2I128 $0x13, AA1, BB1, CC0
2528 VPERM2I128 $0x13, CC1, DD1, DD0
2529
2530 JMP sealAVX2SealHash
2531
2532 // ----------------------------------------------------------------------------
2533 // Special optimization for the last 384 bytes of plaintext
2534 sealAVX2Tail384:
2535 // Need to encrypt up to 384 bytes - prepare six blocks
2536 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2537 // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2538 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
2539 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
2540 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
2541 VMOVDQA ctr3StoreAVX2, DD0
2542 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
2543 VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
2544
2545 sealAVX2Tail384LoopA:
2546 polyAdd(0(oup))
2547 polyMul
2548 LEAQ 16(oup), oup
2549
2550 sealAVX2Tail384LoopB:
2551 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2552 polyAdd(0(oup))
2553 polyMul
2554 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
2555 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2556 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
2557 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2558 polyAdd(16(oup))
2559 polyMul
2560 LEAQ 32(oup), oup
2561 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
2562 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2563 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
2564 DECQ itr1
2565 JG sealAVX2Tail384LoopA
2566 DECQ itr2
2567 JGE sealAVX2Tail384LoopB
2568
2569 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
2570 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
2571 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
2572 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
2573 VPERM2I128 $0x02, AA0, BB0, TT0
2574 VPERM2I128 $0x02, CC0, DD0, TT1
2575 VPERM2I128 $0x13, AA0, BB0, TT2
2576 VPERM2I128 $0x13, CC0, DD0, TT3
2577 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
2578 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
2579 VPERM2I128 $0x02, AA1, BB1, TT0
2580 VPERM2I128 $0x02, CC1, DD1, TT1
2581 VPERM2I128 $0x13, AA1, BB1, TT2
2582 VPERM2I128 $0x13, CC1, DD1, TT3
2583 VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
2584 VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
2585 MOVQ $256, itr1
2586 LEAQ 256(inp), inp
2587 SUBQ $256, inl
2588 VPERM2I128 $0x02, AA2, BB2, AA0
2589 VPERM2I128 $0x02, CC2, DD2, BB0
2590 VPERM2I128 $0x13, AA2, BB2, CC0
2591 VPERM2I128 $0x13, CC2, DD2, DD0
2592
2593 JMP sealAVX2SealHash
2594
2595 // ----------------------------------------------------------------------------
2596 // Special optimization for the last 512 bytes of plaintext
2597 sealAVX2Tail512:
2598 // Need to encrypt up to 512 bytes - prepare eight blocks
2599 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2600 // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2601 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2602 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2603 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2604 VMOVDQA ctr3StoreAVX2, DD0
2605 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2606 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2607
2608 sealAVX2Tail512LoopA:
2609 polyAdd(0(oup))
2610 polyMul
2611 LEAQ 16(oup), oup
2612
2613 sealAVX2Tail512LoopB:
2614 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2615 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2616 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2617 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2618 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2619 VMOVDQA CC3, tmpStoreAVX2
2620 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2621 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2622 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2623 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2624 VMOVDQA tmpStoreAVX2, CC3
2625 polyAdd(0*8(oup))
2626 polyMulAVX2
2627 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2628 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2629 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2630 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2631 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2632 VMOVDQA CC3, tmpStoreAVX2
2633 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2634 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2635 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2636 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2637 VMOVDQA tmpStoreAVX2, CC3
2638 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
2639 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2640 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
2641 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2642 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2643 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2644 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2645 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2646 polyAdd(2*8(oup))
2647 polyMulAVX2
2648 LEAQ (4*8)(oup), oup
2649 VMOVDQA CC3, tmpStoreAVX2
2650 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2651 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2652 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2653 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2654 VMOVDQA tmpStoreAVX2, CC3
2655 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2656 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2657 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2658 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2659 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2660 VMOVDQA CC3, tmpStoreAVX2
2661 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2662 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2663 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2664 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2665 VMOVDQA tmpStoreAVX2, CC3
2666 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
2667 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2668 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
2669
2670 DECQ itr1
2671 JG sealAVX2Tail512LoopA
2672 DECQ itr2
2673 JGE sealAVX2Tail512LoopB
2674
2675 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2676 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2677 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2678 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2679 VMOVDQA CC3, tmpStoreAVX2
2680 VPERM2I128 $0x02, AA0, BB0, CC3
2681 VPXOR (0*32)(inp), CC3, CC3
2682 VMOVDQU CC3, (0*32)(oup)
2683 VPERM2I128 $0x02, CC0, DD0, CC3
2684 VPXOR (1*32)(inp), CC3, CC3
2685 VMOVDQU CC3, (1*32)(oup)
2686 VPERM2I128 $0x13, AA0, BB0, CC3
2687 VPXOR (2*32)(inp), CC3, CC3
2688 VMOVDQU CC3, (2*32)(oup)
2689 VPERM2I128 $0x13, CC0, DD0, CC3
2690 VPXOR (3*32)(inp), CC3, CC3
2691 VMOVDQU CC3, (3*32)(oup)
2692
2693 VPERM2I128 $0x02, AA1, BB1, AA0
2694 VPERM2I128 $0x02, CC1, DD1, BB0
2695 VPERM2I128 $0x13, AA1, BB1, CC0
2696 VPERM2I128 $0x13, CC1, DD1, DD0
2697 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
2698 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
2699
2700 VPERM2I128 $0x02, AA2, BB2, AA0
2701 VPERM2I128 $0x02, CC2, DD2, BB0
2702 VPERM2I128 $0x13, AA2, BB2, CC0
2703 VPERM2I128 $0x13, CC2, DD2, DD0
2704 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
2705 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
2706
2707 MOVQ $384, itr1
2708 LEAQ 384(inp), inp
2709 SUBQ $384, inl
2710 VPERM2I128 $0x02, AA3, BB3, AA0
2711 VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
2712 VPERM2I128 $0x13, AA3, BB3, CC0
2713 VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
2714
2715 JMP sealAVX2SealHash
2716