     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
     6  
     7  //go:build gc && !purego
     8  
     9  #include "textflag.h"
    10  // General register allocation
    11  #define oup DI
    12  #define inp SI
    13  #define inl BX
    14  #define adp CX // free to reuse after we hash the additional data
    15  #define keyp R8 // free to reuse once we copy the key to the stack
    16  #define itr2 R9 // general iterator
    17  #define itr1 CX // general iterator
    18  #define acc0 R10
    19  #define acc1 R11
    20  #define acc2 R12
    21  #define t0 R13
    22  #define t1 R14
    23  #define t2 R15
    24  #define t3 R8
    25  // Register and stack allocation for the SSE code
    26  #define rStore (0*16)(BP)
    27  #define sStore (1*16)(BP)
    28  #define state1Store (2*16)(BP)
    29  #define state2Store (3*16)(BP)
    30  #define tmpStore (4*16)(BP)
    31  #define ctr0Store (5*16)(BP)
    32  #define ctr1Store (6*16)(BP)
    33  #define ctr2Store (7*16)(BP)
    34  #define ctr3Store (8*16)(BP)
    35  #define A0 X0
    36  #define A1 X1
    37  #define A2 X2
    38  #define B0 X3
    39  #define B1 X4
    40  #define B2 X5
    41  #define C0 X6
    42  #define C1 X7
    43  #define C2 X8
    44  #define D0 X9
    45  #define D1 X10
    46  #define D2 X11
    47  #define T0 X12
    48  #define T1 X13
    49  #define T2 X14
    50  #define T3 X15
    51  #define A3 T0
    52  #define B3 T1
    53  #define C3 T2
    54  #define D3 T3
    55  // Register and stack allocation for the AVX2 code
    56  #define rsStoreAVX2 (0*32)(BP)
    57  #define state1StoreAVX2 (1*32)(BP)
    58  #define state2StoreAVX2 (2*32)(BP)
    59  #define ctr0StoreAVX2 (3*32)(BP)
    60  #define ctr1StoreAVX2 (4*32)(BP)
    61  #define ctr2StoreAVX2 (5*32)(BP)
    62  #define ctr3StoreAVX2 (6*32)(BP)
    63  #define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
    64  #define AA0 Y0
    65  #define AA1 Y5
    66  #define AA2 Y6
    67  #define AA3 Y7
    68  #define BB0 Y14
    69  #define BB1 Y9
    70  #define BB2 Y10
    71  #define BB3 Y11
    72  #define CC0 Y12
    73  #define CC1 Y13
    74  #define CC2 Y8
    75  #define CC3 Y15
    76  #define DD0 Y4
    77  #define DD1 Y1
    78  #define DD2 Y2
    79  #define DD3 Y3
    80  #define TT0 DD3
    81  #define TT1 AA3
    82  #define TT2 BB3
    83  #define TT3 CC3
    84  // ChaCha20 constants
    85  DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
    86  DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
    87  DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
    88  DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
    89  DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
    90  DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
    91  DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
    92  DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
    93  // <<< 16 with PSHUFB
    94  DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
    95  DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
    96  DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
    97  DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
    98  // <<< 8 with PSHUFB
    99  DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
   100  DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
   101  DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
   102  DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
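// The rol16/rol8 tables above are PSHUFB shuffle masks: permuting the bytes of
// each 32-bit lane this way is the same as rotating every lane left by 16 or 8
// bits. A minimal scalar sketch of the equivalent operation (illustrative only,
// not part of this package; uses math/bits):
//
//	func rol16(x uint32) uint32 { return bits.RotateLeft32(x, 16) }
//	func rol8(x uint32) uint32  { return bits.RotateLeft32(x, 8) }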
   103  
   104  DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
   105  DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
   106  DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
   107  DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
   108  
   109  DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
   110  DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
   111  DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
   112  DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
   113  // Poly1305 key clamp
   114  DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
   115  DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
   116  DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   117  DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
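// polyClampMask implements the Poly1305 key clamp from RFC 8439: the r half of
// the one-time key has certain bits cleared, while the all-ones qwords leave the
// s half untouched. A rough sketch, assuming r is held as two little-endian
// 64-bit limbs:
//
//	func clampR(r0, r1 uint64) (uint64, uint64) {
//		return r0 & 0x0FFFFFFC0FFFFFFF, r1 & 0x0FFFFFFC0FFFFFFC
//	}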
   118  
   119  DATA ·sseIncMask<>+0x00(SB)/8, $0x1
   120  DATA ·sseIncMask<>+0x08(SB)/8, $0x0
   121  // To load/store the last < 16 bytes in a buffer
   122  DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
   123  DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
   124  DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
   125  DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
   126  DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
   127  DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
   128  DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
   129  DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
   130  DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
   131  DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
   132  DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
   133  DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
   134  DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
   135  DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
   136  DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
   137  DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
   138  DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
   139  DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
   140  DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
   141  DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
   142  DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
   143  DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
   144  DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
   145  DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
   146  DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
   147  DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
   148  DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
   149  DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
   150  DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
   151  DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
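// The andMask table holds 15 little-endian masks; mask n (1 <= n <= 15) keeps
// the low n bytes of a 16-byte lane and zeroes the rest. A sketch of how such a
// table could be generated (hypothetical helper, for illustration only):
//
//	func genAndMasks() (m [15][16]byte) {
//		for n := 1; n <= 15; n++ {
//			for i := 0; i < n; i++ {
//				m[n-1][i] = 0xff
//			}
//		}
//		return m
//	}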
   152  
   153  GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
   154  GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
   155  GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
   156  GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
   157  GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
   158  GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
   159  GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
   160  GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
   161  // No PALIGNR in Go ASM yet (but VPALIGNR is present).
   162  #define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
   163  #define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
   164  #define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
   165  #define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
   166  #define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
   167  #define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
   168  #define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
   169  #define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
   170  #define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
   171  #define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
   172  #define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
   173  #define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
   174  #define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
   175  #define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
   176  #define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
   177  #define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
   178  #define shiftC0Right shiftC0Left
   179  #define shiftC1Right shiftC1Left
   180  #define shiftC2Right shiftC2Left
   181  #define shiftC3Right shiftC3Left
   182  #define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
   183  #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
   184  #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
   185  #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
   186  
   187  // Some macros
   188  
   189  // ROL rotates the uint32s in register R left by N bits, using temporary T.
   190  #define ROL(N, R, T) \
   191  	MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R
   192  
   193  // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
   194  #ifdef GOAMD64_v2
   195  #define ROL16(R, T) PSHUFB ·rol16<>(SB), R
   196  #else
   197  #define ROL16(R, T) ROL(16, R, T)
   198  #endif
   199  
   200  // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
   201  #ifdef GOAMD64_v2
   202  #define ROL8(R, T) PSHUFB ·rol8<>(SB), R
   203  #else
   204  #define ROL8(R, T) ROL(8, R, T)
   205  #endif
   206  
   207  #define chachaQR(A, B, C, D, T) \
   208  	PADDD B, A; PXOR A, D; ROL16(D, T) \
   209  	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
   210  	PADDD B, A; PXOR A, D; ROL8(D, T) \
   211  	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
   212  
   213  #define chachaQR_AVX2(A, B, C, D, T) \
   214  	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D                         \
   215  	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
   216  	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D                          \
   217  	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
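// The chachaQR and chachaQR_AVX2 macros above each perform one ChaCha quarter
// round on four vector registers, carrying out the rotations by 16 and 8 with
// byte shuffles and the rotations by 12 and 7 with shift+xor. A scalar sketch of
// the same quarter round (illustrative only; uses math/bits):
//
//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
//		a += b; d ^= a; d = bits.RotateLeft32(d, 16)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 12)
//		a += b; d ^= a; d = bits.RotateLeft32(d, 8)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 7)
//		return a, b, c, d
//	}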
   218  
   219  #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
   220  #define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
   221  #define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
   222  #define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
   223  #define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
   224  
   225  #define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
   226  #define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
   227  #define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
   228  
   229  #define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
   230  #define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
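// The polyMul* macros perform one Poly1305 step: multiply the accumulator
// (acc0, acc1, acc2) by the clamped key r held at rStore, then reduce modulo
// p = 2^130 - 5 using the identity 2^130 = 5 (mod p), so the bits above 2^130
// are folded back in as c + 4*c. A sketch of the same arithmetic in Go, assuming
// acc2 holds only the few bits above 2^128 (illustrative, not this package's
// code; uses math/bits):
//
//	func polyMulReduce(acc *[3]uint64, r0, r1 uint64) {
//		h00hi, h00lo := bits.Mul64(acc[0], r0)
//		h10hi, h10lo := bits.Mul64(acc[1], r0)
//		h01hi, h01lo := bits.Mul64(acc[0], r1)
//		h11hi, h11lo := bits.Mul64(acc[1], r1)
//
//		t0 := h00lo
//		t1, c := bits.Add64(h00hi, h10lo, 0)
//		t2, c2 := bits.Add64(h10hi, acc[2]*r0, c)
//		t3 := acc[2]*r1 + c2 // high limbs stay small thanks to clamping
//		t1, c = bits.Add64(t1, h01lo, 0)
//		t2, c2 = bits.Add64(t2, h01hi, c)
//		t3 += c2
//		t2, c2 = bits.Add64(t2, h11lo, 0)
//		t3 += h11hi + c2
//
//		// Reduce: acc = low 130 bits + 5 * (bits above 2^130).
//		acc[0], acc[1], acc[2] = t0, t1, t2&3
//		lo, hi := t2>>2|t3<<62, t3>>2
//		acc[0], c = bits.Add64(acc[0], t2&^3, 0) // add 4*c (c shifted up by 2)
//		acc[1], c = bits.Add64(acc[1], t3, c)
//		acc[2] += c
//		acc[0], c = bits.Add64(acc[0], lo, 0) // add c
//		acc[1], c = bits.Add64(acc[1], hi, c)
//		acc[2] += c
//	}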
   231  // ----------------------------------------------------------------------------
   232  TEXT polyHashADInternal<>(SB), NOSPLIT, $0
   233  	// adp points to beginning of additional data
   234  	// itr2 holds ad length
   235  	XORQ acc0, acc0
   236  	XORQ acc1, acc1
   237  	XORQ acc2, acc2
   238  	CMPQ itr2, $13
   239  	JNE  hashADLoop
   240  
   241  openFastTLSAD:
   242  	// Special treatment for the TLS case of 13 bytes
   243  	MOVQ (adp), acc0
   244  	MOVQ 5(adp), acc1
   245  	SHRQ $24, acc1
   246  	MOVQ $1, acc2
   247  	polyMul
   248  	RET
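// A sketch of the 13-byte fast path above (hypothetical helper): bytes 0..7 of
// the additional data land in acc0, bytes 8..12 in acc1 via an overlapping
// 8-byte load shifted right by 24, and the Poly1305 pad bit goes into acc2.
//
//	func loadAD13(ad []byte) (acc0, acc1, acc2 uint64) {
//		acc0 = binary.LittleEndian.Uint64(ad[0:8])
//		acc1 = binary.LittleEndian.Uint64(ad[5:13]) >> 24
//		acc2 = 1
//		return
//	}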
   249  
   250  hashADLoop:
   251  	// Hash in 16 byte chunks
   252  	CMPQ itr2, $16
   253  	JB   hashADTail
   254  	polyAdd(0(adp))
   255  	LEAQ (1*16)(adp), adp
   256  	SUBQ $16, itr2
   257  	polyMul
   258  	JMP  hashADLoop
   259  
   260  hashADTail:
   261  	CMPQ itr2, $0
   262  	JE   hashADDone
   263  
   264  	// Hash last < 16 byte tail
   265  	XORQ t0, t0
   266  	XORQ t1, t1
   267  	XORQ t2, t2
   268  	ADDQ itr2, adp
   269  
   270  hashADTailLoop:
   271  	SHLQ $8, t0, t1
   272  	SHLQ $8, t0
   273  	MOVB -1(adp), t2
   274  	XORQ t2, t0
   275  	DECQ adp
   276  	DECQ itr2
   277  	JNE  hashADTailLoop
   278  
   279  hashADTailFinish:
   280  	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
   281  	polyMul
   282  
   283  	// Finished AD
   284  hashADDone:
   285  	RET
   286  
   287  // ----------------------------------------------------------------------------
   288  // func chacha20Poly1305Open(dst, key, src, ad []byte) bool
   289  TEXT ·chacha20Poly1305Open(SB), 0, $288-97
   290  	// For aligned stack access
   291  	MOVQ SP, BP
   292  	ADDQ $32, BP
   293  	ANDQ $-32, BP
   294  	MOVQ dst+0(FP), oup
   295  	MOVQ key+24(FP), keyp
   296  	MOVQ src+48(FP), inp
   297  	MOVQ src_len+56(FP), inl
   298  	MOVQ ad+72(FP), adp
   299  
   300  	// Check for AVX2 support
   301  	CMPB ·useAVX2(SB), $1
   302  	JE   chacha20Poly1305Open_AVX2
   303  
   304  	// Special optimization for very short buffers
   305  	CMPQ inl, $128
   306  	JBE  openSSE128 // About 16% faster
   307  
   308  	// For long buffers, prepare the poly key first
   309  	MOVOU ·chacha20Constants<>(SB), A0
   310  	MOVOU (1*16)(keyp), B0
   311  	MOVOU (2*16)(keyp), C0
   312  	MOVOU (3*16)(keyp), D0
   313  	MOVO  D0, T1
   314  
   315  	// Store state on stack for future use
   316  	MOVO B0, state1Store
   317  	MOVO C0, state2Store
   318  	MOVO D0, ctr3Store
   319  	MOVQ $10, itr2
   320  
   321  openSSEPreparePolyKey:
   322  	chachaQR(A0, B0, C0, D0, T0)
   323  	shiftB0Left;  shiftC0Left; shiftD0Left
   324  	chachaQR(A0, B0, C0, D0, T0)
   325  	shiftB0Right; shiftC0Right; shiftD0Right
   326  	DECQ          itr2
   327  	JNE           openSSEPreparePolyKey
   328  
   329  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
   330  	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
   331  
   332  	// Clamp and store the key
   333  	PAND ·polyClampMask<>(SB), A0
   334  	MOVO A0, rStore; MOVO B0, sStore
   335  
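// What the loop above computes, in scalar form: run the 20 ChaCha20 rounds on a
// single state with block counter 0, add the original state back in, and take
// the first 32 bytes of that keystream block as the one-time Poly1305 key
// (r || s), with r clamped afterwards. A sketch assuming the quarterRound
// helper from the earlier comment (illustrative only):
//
//	func polyKey(state [16]uint32) (key [32]byte) {
//		w := state
//		for i := 0; i < 10; i++ {
//			// column rounds
//			w[0], w[4], w[8], w[12] = quarterRound(w[0], w[4], w[8], w[12])
//			w[1], w[5], w[9], w[13] = quarterRound(w[1], w[5], w[9], w[13])
//			w[2], w[6], w[10], w[14] = quarterRound(w[2], w[6], w[10], w[14])
//			w[3], w[7], w[11], w[15] = quarterRound(w[3], w[7], w[11], w[15])
//			// diagonal rounds (the shift*Left/Right macros do this in-register)
//			w[0], w[5], w[10], w[15] = quarterRound(w[0], w[5], w[10], w[15])
//			w[1], w[6], w[11], w[12] = quarterRound(w[1], w[6], w[11], w[12])
//			w[2], w[7], w[8], w[13] = quarterRound(w[2], w[7], w[8], w[13])
//			w[3], w[4], w[9], w[14] = quarterRound(w[3], w[4], w[9], w[14])
//		}
//		for i := 0; i < 8; i++ {
//			binary.LittleEndian.PutUint32(key[4*i:], w[i]+state[i])
//		}
//		return
//	}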
   336  	// Hash AAD
   337  	MOVQ ad_len+80(FP), itr2
   338  	CALL polyHashADInternal<>(SB)
   339  
   340  openSSEMainLoop:
   341  	CMPQ inl, $256
   342  	JB   openSSEMainLoopDone
   343  
   344  	// Load state, increment counter blocks
   345  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
   346  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   347  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   348  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
   349  
   350  	// Store counters
   351  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
   352  
   353  	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
   354  	MOVQ $4, itr1
   355  	MOVQ inp, itr2
   356  
   357  openSSEInternalLoop:
   358  	MOVO          C3, tmpStore
   359  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   360  	MOVO          tmpStore, C3
   361  	MOVO          C1, tmpStore
   362  	chachaQR(A3, B3, C3, D3, C1)
   363  	MOVO          tmpStore, C1
   364  	polyAdd(0(itr2))
   365  	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
   366  	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
   367  	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
   368  	polyMulStage1
   369  	polyMulStage2
   370  	LEAQ          (2*8)(itr2), itr2
   371  	MOVO          C3, tmpStore
   372  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   373  	MOVO          tmpStore, C3
   374  	MOVO          C1, tmpStore
   375  	polyMulStage3
   376  	chachaQR(A3, B3, C3, D3, C1)
   377  	MOVO          tmpStore, C1
   378  	polyMulReduceStage
   379  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
   380  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
   381  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
   382  	DECQ          itr1
   383  	JGE           openSSEInternalLoop
   384  
   385  	polyAdd(0(itr2))
   386  	polyMul
   387  	LEAQ (2*8)(itr2), itr2
   388  
   389  	CMPQ itr1, $-6
   390  	JG   openSSEInternalLoop
   391  
   392  	// Add in the state
   393  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
   394  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
   395  	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
   396  	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
   397  
   398  	// Load - xor - store
   399  	MOVO  D3, tmpStore
   400  	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
   401  	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
   402  	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
   403  	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
   404  	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
   405  	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
   406  	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
   407  	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
   408  	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
   409  	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
   410  	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
   411  	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
   412  	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
   413  	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
   414  	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
   415  	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
   416  	LEAQ  256(inp), inp
   417  	LEAQ  256(oup), oup
   418  	SUBQ  $256, inl
   419  	JMP   openSSEMainLoop
   420  
   421  openSSEMainLoopDone:
   422  	// Handle the various tail sizes efficiently
   423  	TESTQ inl, inl
   424  	JE    openSSEFinalize
   425  	CMPQ  inl, $64
   426  	JBE   openSSETail64
   427  	CMPQ  inl, $128
   428  	JBE   openSSETail128
   429  	CMPQ  inl, $192
   430  	JBE   openSSETail192
   431  	JMP   openSSETail256
   432  
   433  openSSEFinalize:
   434  	// Hash in the PT, AAD lengths
   435  	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
   436  	polyMul
   437  
   438  	// Final reduce
   439  	MOVQ    acc0, t0
   440  	MOVQ    acc1, t1
   441  	MOVQ    acc2, t2
   442  	SUBQ    $-5, acc0
   443  	SBBQ    $-1, acc1
   444  	SBBQ    $3, acc2
   445  	CMOVQCS t0, acc0
   446  	CMOVQCS t1, acc1
   447  	CMOVQCS t2, acc2
   448  
   449  	// Add in the "s" part of the key
   450  	ADDQ 0+sStore, acc0
   451  	ADCQ 8+sStore, acc1
   452  
   453  	// Finally, constant time compare to the tag at the end of the message
   454  	XORQ    AX, AX
   455  	MOVQ    $1, DX
   456  	XORQ    (0*8)(inp), acc0
   457  	XORQ    (1*8)(inp), acc1
   458  	ORQ     acc1, acc0
   459  	CMOVQEQ DX, AX
   460  
   461  	// Return true iff tags are equal
   462  	MOVB AX, ret+96(FP)
   463  	RET
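// A sketch of the finalization above (illustrative; the assembly keeps the
// conditional subtraction branch-free with CMOV): fully reduce the accumulator
// modulo p = 2^130 - 5, add the s half of the key, and compare against the
// transmitted tag in constant time. Uses math/bits, encoding/binary and
// crypto/subtle.
//
//	func finalize(acc [3]uint64, s0, s1 uint64, tag *[16]byte) bool {
//		// Subtract p; keep the result only if no borrow occurred (acc >= p).
//		t0, b := bits.Sub64(acc[0], 0xFFFFFFFFFFFFFFFB, 0)
//		t1, b := bits.Sub64(acc[1], 0xFFFFFFFFFFFFFFFF, b)
//		_, b = bits.Sub64(acc[2], 3, b)
//		if b == 0 {
//			acc[0], acc[1] = t0, t1
//		}
//		a0, c := bits.Add64(acc[0], s0, 0)
//		a1, _ := bits.Add64(acc[1], s1, c)
//		var computed [16]byte
//		binary.LittleEndian.PutUint64(computed[0:8], a0)
//		binary.LittleEndian.PutUint64(computed[8:16], a1)
//		return subtle.ConstantTimeCompare(computed[:], tag[:]) == 1
//	}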
   464  
   465  // ----------------------------------------------------------------------------
   466  // Special optimization for buffers smaller than 129 bytes
   467  openSSE128:
   468  	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
   469  	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
   470  	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   471  	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   472  	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
   473  	MOVQ  $10, itr2
   474  
   475  openSSE128InnerCipherLoop:
   476  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   477  	shiftB0Left;  shiftB1Left; shiftB2Left
   478  	shiftC0Left;  shiftC1Left; shiftC2Left
   479  	shiftD0Left;  shiftD1Left; shiftD2Left
   480  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   481  	shiftB0Right; shiftB1Right; shiftB2Right
   482  	shiftC0Right; shiftC1Right; shiftC2Right
   483  	shiftD0Right; shiftD1Right; shiftD2Right
   484  	DECQ          itr2
   485  	JNE           openSSE128InnerCipherLoop
   486  
   487  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
   488  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
   489  	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
   490  	PADDL T2, C1; PADDL T2, C2
   491  	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
   492  
   493  	// Clamp and store the key
   494  	PAND  ·polyClampMask<>(SB), A0
   495  	MOVOU A0, rStore; MOVOU B0, sStore
   496  
   497  	// Hash
   498  	MOVQ ad_len+80(FP), itr2
   499  	CALL polyHashADInternal<>(SB)
   500  
   501  openSSE128Open:
   502  	CMPQ inl, $16
   503  	JB   openSSETail16
   504  	SUBQ $16, inl
   505  
   506  	// Load for hashing
   507  	polyAdd(0(inp))
   508  
   509  	// Load for decryption
   510  	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
   511  	LEAQ  (1*16)(inp), inp
   512  	LEAQ  (1*16)(oup), oup
   513  	polyMul
   514  
   515  	// Shift the stream "left"
   516  	MOVO B1, A1
   517  	MOVO C1, B1
   518  	MOVO D1, C1
   519  	MOVO A2, D1
   520  	MOVO B2, A2
   521  	MOVO C2, B2
   522  	MOVO D2, C2
   523  	JMP  openSSE128Open
   524  
   525  openSSETail16:
   526  	TESTQ inl, inl
   527  	JE    openSSEFinalize
   528  
   529  	// We can safely load the CT from the end, because it is padded with the MAC
   530  	MOVQ   inl, itr2
   531  	SHLQ   $4, itr2
   532  	LEAQ   ·andMask<>(SB), t0
   533  	MOVOU  (inp), T0
   534  	ADDQ   inl, inp
   535  	PAND   -16(t0)(itr2*1), T0
   536  	MOVO   T0, 0+tmpStore
   537  	MOVQ   T0, t0
   538  	MOVQ   8+tmpStore, t1
   539  	PXOR   A1, T0
   540  
   541  	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
   542  openSSETail16Store:
   543  	MOVQ T0, t3
   544  	MOVB t3, (oup)
   545  	PSRLDQ $1, T0
   546  	INCQ   oup
   547  	DECQ   inl
   548  	JNE    openSSETail16Store
   549  	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
   550  	polyMul
   551  	JMP    openSSEFinalize
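// A sketch of the partial-block handling above (hypothetical helper): the
// ciphertext tail is hashed as a zero-padded 16-byte block with the usual pad
// bit (the assembly gets the zero padding by masking an over-read with
// andMask), then decrypted byte by byte against the remaining keystream.
//
//	func openTail16(dst, src []byte, ks *[16]byte, acc *[3]uint64, r0, r1 uint64) {
//		var block [16]byte
//		copy(block[:], src) // len(src) < 16; the rest stays zero
//		var c uint64
//		acc[0], c = bits.Add64(acc[0], binary.LittleEndian.Uint64(block[0:8]), 0)
//		acc[1], c = bits.Add64(acc[1], binary.LittleEndian.Uint64(block[8:16]), c)
//		acc[2] += 1 + c
//		polyMulReduce(acc, r0, r1) // see the earlier sketch
//		for i := range src {
//			dst[i] = src[i] ^ ks[i]
//		}
//	}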
   552  
   553  // ----------------------------------------------------------------------------
   554  // Special optimization for the last 64 bytes of ciphertext
   555  openSSETail64:
   556  	// Need to decrypt up to 64 bytes - prepare a single block
   557  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
   558  	XORQ itr2, itr2
   559  	MOVQ inl, itr1
   560  	CMPQ itr1, $16
   561  	JB   openSSETail64LoopB
   562  
   563  openSSETail64LoopA:
   564  	// Perform ChaCha rounds, while hashing the remaining input
   565  	polyAdd(0(inp)(itr2*1))
   566  	polyMul
   567  	SUBQ $16, itr1
   568  
   569  openSSETail64LoopB:
   570  	ADDQ          $16, itr2
   571  	chachaQR(A0, B0, C0, D0, T0)
   572  	shiftB0Left;  shiftC0Left; shiftD0Left
   573  	chachaQR(A0, B0, C0, D0, T0)
   574  	shiftB0Right; shiftC0Right; shiftD0Right
   575  
   576  	CMPQ itr1, $16
   577  	JAE  openSSETail64LoopA
   578  
   579  	CMPQ itr2, $160
   580  	JNE  openSSETail64LoopB
   581  
   582  	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
   583  
   584  openSSETail64DecLoop:
   585  	CMPQ  inl, $16
   586  	JB    openSSETail64DecLoopDone
   587  	SUBQ  $16, inl
   588  	MOVOU (inp), T0
   589  	PXOR  T0, A0
   590  	MOVOU A0, (oup)
   591  	LEAQ  16(inp), inp
   592  	LEAQ  16(oup), oup
   593  	MOVO  B0, A0
   594  	MOVO  C0, B0
   595  	MOVO  D0, C0
   596  	JMP   openSSETail64DecLoop
   597  
   598  openSSETail64DecLoopDone:
   599  	MOVO A0, A1
   600  	JMP  openSSETail16
   601  
   602  // ----------------------------------------------------------------------------
   603  // Special optimization for the last 128 bytes of ciphertext
   604  openSSETail128:
   605  	// Need to decrypt up to 128 bytes - prepare two blocks
   606  	MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
   607  	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
   608  	XORQ itr2, itr2
   609  	MOVQ inl, itr1
   610  	ANDQ $-16, itr1
   611  
   612  openSSETail128LoopA:
   613  	// Perform ChaCha rounds, while hashing the remaining input
   614  	polyAdd(0(inp)(itr2*1))
   615  	polyMul
   616  
   617  openSSETail128LoopB:
   618  	ADDQ          $16, itr2
   619  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
   620  	shiftB0Left;  shiftC0Left; shiftD0Left
   621  	shiftB1Left;  shiftC1Left; shiftD1Left
   622  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
   623  	shiftB0Right; shiftC0Right; shiftD0Right
   624  	shiftB1Right; shiftC1Right; shiftD1Right
   625  
   626  	CMPQ itr2, itr1
   627  	JB   openSSETail128LoopA
   628  
   629  	CMPQ itr2, $160
   630  	JNE  openSSETail128LoopB
   631  
   632  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
   633  	PADDL state1Store, B0; PADDL state1Store, B1
   634  	PADDL state2Store, C0; PADDL state2Store, C1
   635  	PADDL ctr1Store, D0; PADDL ctr0Store, D1
   636  
   637  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
   638  	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
   639  	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
   640  
   641  	SUBQ $64, inl
   642  	LEAQ 64(inp), inp
   643  	LEAQ 64(oup), oup
   644  	JMP  openSSETail64DecLoop
   645  
   646  // ----------------------------------------------------------------------------
   647  // Special optimization for the last 192 bytes of ciphertext
   648  openSSETail192:
   649  	// Need to decrypt up to 192 bytes - prepare three blocks
   650  	MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
   651  	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
   652  	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
   653  
   654  	MOVQ    inl, itr1
   655  	MOVQ    $160, itr2
   656  	CMPQ    itr1, $160
   657  	CMOVQGT itr2, itr1
   658  	ANDQ    $-16, itr1
   659  	XORQ    itr2, itr2
   660  
   661  openSSETail192LoopA:
   662  	// Perform ChaCha rounds, while hashing the remaining input
   663  	polyAdd(0(inp)(itr2*1))
   664  	polyMul
   665  
   666  openSSETail192LoopB:
   667  	ADDQ         $16, itr2
   668  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   669  	shiftB0Left; shiftC0Left; shiftD0Left
   670  	shiftB1Left; shiftC1Left; shiftD1Left
   671  	shiftB2Left; shiftC2Left; shiftD2Left
   672  
   673  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   674  	shiftB0Right; shiftC0Right; shiftD0Right
   675  	shiftB1Right; shiftC1Right; shiftD1Right
   676  	shiftB2Right; shiftC2Right; shiftD2Right
   677  
   678  	CMPQ itr2, itr1
   679  	JB   openSSETail192LoopA
   680  
   681  	CMPQ itr2, $160
   682  	JNE  openSSETail192LoopB
   683  
   684  	CMPQ inl, $176
   685  	JB   openSSETail192Store
   686  
   687  	polyAdd(160(inp))
   688  	polyMul
   689  
   690  	CMPQ inl, $192
   691  	JB   openSSETail192Store
   692  
   693  	polyAdd(176(inp))
   694  	polyMul
   695  
   696  openSSETail192Store:
   697  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
   698  	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
   699  	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
   700  	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
   701  
   702  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
   703  	PXOR  T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
   704  	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
   705  
   706  	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
   707  	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
   708  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
   709  
   710  	SUBQ $128, inl
   711  	LEAQ 128(inp), inp
   712  	LEAQ 128(oup), oup
   713  	JMP  openSSETail64DecLoop
   714  
   715  // ----------------------------------------------------------------------------
   716  // Special optimization for the last 256 bytes of ciphertext
   717  openSSETail256:
   718  	// Need to decrypt up to 256 bytes - prepare four blocks
   719  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
   720  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   721  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   722  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
   723  
   724  	// Store counters
   725  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
   726  	XORQ itr2, itr2
   727  
   728  openSSETail256Loop:
   729  	// This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
   730  	polyAdd(0(inp)(itr2*1))
   731  	MOVO          C3, tmpStore
   732  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   733  	MOVO          tmpStore, C3
   734  	MOVO          C1, tmpStore
   735  	chachaQR(A3, B3, C3, D3, C1)
   736  	MOVO          tmpStore, C1
   737  	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
   738  	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
   739  	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
   740  	polyMulStage1
   741  	polyMulStage2
   742  	MOVO          C3, tmpStore
   743  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   744  	MOVO          tmpStore, C3
   745  	MOVO          C1, tmpStore
   746  	chachaQR(A3, B3, C3, D3, C1)
   747  	MOVO          tmpStore, C1
   748  	polyMulStage3
   749  	polyMulReduceStage
   750  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
   751  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
   752  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
   753  	ADDQ          $2*8, itr2
   754  	CMPQ          itr2, $160
   755  	JB            openSSETail256Loop
   756  	MOVQ          inl, itr1
   757  	ANDQ          $-16, itr1
   758  
   759  openSSETail256HashLoop:
   760  	polyAdd(0(inp)(itr2*1))
   761  	polyMul
   762  	ADDQ $2*8, itr2
   763  	CMPQ itr2, itr1
   764  	JB   openSSETail256HashLoop
   765  
   766  	// Add in the state
   767  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
   768  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
   769  	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
   770  	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
   771  	MOVO  D3, tmpStore
   772  
   773  	// Load - xor - store
   774  	MOVOU (0*16)(inp), D3; PXOR D3, A0
   775  	MOVOU (1*16)(inp), D3; PXOR D3, B0
   776  	MOVOU (2*16)(inp), D3; PXOR D3, C0
   777  	MOVOU (3*16)(inp), D3; PXOR D3, D0
   778  	MOVOU A0, (0*16)(oup)
   779  	MOVOU B0, (1*16)(oup)
   780  	MOVOU C0, (2*16)(oup)
   781  	MOVOU D0, (3*16)(oup)
   782  	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
   783  	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
   784  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
   785  	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
   786  	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
   787  	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
   788  	LEAQ  192(inp), inp
   789  	LEAQ  192(oup), oup
   790  	SUBQ  $192, inl
   791  	MOVO  A3, A0
   792  	MOVO  B3, B0
   793  	MOVO  C3, C0
   794  	MOVO  tmpStore, D0
   795  
   796  	JMP openSSETail64DecLoop
   797  
   798  // ----------------------------------------------------------------------------
   799  // ------------------------- AVX2 Code ----------------------------------------
   800  chacha20Poly1305Open_AVX2:
   801  	VZEROUPPER
   802  	VMOVDQU ·chacha20Constants<>(SB), AA0
   803  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
   804  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
   805  	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
   806  	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
   807  
   808  	// Special optimization for very short buffers
   809  	CMPQ inl, $192
   810  	JBE  openAVX2192
   811  	CMPQ inl, $320
   812  	JBE  openAVX2320
   813  
   814  	// For the general case, prepare the poly key first - as a byproduct we get 64 bytes of cipher stream
   815  	VMOVDQA BB0, state1StoreAVX2
   816  	VMOVDQA CC0, state2StoreAVX2
   817  	VMOVDQA DD0, ctr3StoreAVX2
   818  	MOVQ    $10, itr2
   819  
   820  openAVX2PreparePolyKey:
   821  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
   822  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
   823  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
   824  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
   825  	DECQ     itr2
   826  	JNE      openAVX2PreparePolyKey
   827  
   828  	VPADDD ·chacha20Constants<>(SB), AA0, AA0
   829  	VPADDD state1StoreAVX2, BB0, BB0
   830  	VPADDD state2StoreAVX2, CC0, CC0
   831  	VPADDD ctr3StoreAVX2, DD0, DD0
   832  
   833  	VPERM2I128 $0x02, AA0, BB0, TT0
   834  
   835  	// Clamp and store poly key
   836  	VPAND   ·polyClampMask<>(SB), TT0, TT0
   837  	VMOVDQA TT0, rsStoreAVX2
   838  
   839  	// Stream for the first 64 bytes
   840  	VPERM2I128 $0x13, AA0, BB0, AA0
   841  	VPERM2I128 $0x13, CC0, DD0, BB0
   842  
   843  	// Hash AD + first 64 bytes
   844  	MOVQ ad_len+80(FP), itr2
   845  	CALL polyHashADInternal<>(SB)
   846  	XORQ itr1, itr1
   847  
   848  openAVX2InitialHash64:
   849  	polyAdd(0(inp)(itr1*1))
   850  	polyMulAVX2
   851  	ADDQ $16, itr1
   852  	CMPQ itr1, $64
   853  	JNE  openAVX2InitialHash64
   854  
   855  	// Decrypt the first 64 bytes
   856  	VPXOR   (0*32)(inp), AA0, AA0
   857  	VPXOR   (1*32)(inp), BB0, BB0
   858  	VMOVDQU AA0, (0*32)(oup)
   859  	VMOVDQU BB0, (1*32)(oup)
   860  	LEAQ    (2*32)(inp), inp
   861  	LEAQ    (2*32)(oup), oup
   862  	SUBQ    $64, inl
   863  
   864  openAVX2MainLoop:
   865  	CMPQ inl, $512
   866  	JB   openAVX2MainLoopDone
   867  
   868  	// Load state, increment counter blocks, store the incremented counters
   869  	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
   870  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
   871  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
   872  	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
   873  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
   874  	XORQ    itr1, itr1
   875  
   876  openAVX2InternalLoop:
   877  	// Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
   878  	// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
   879  	polyAdd(0*8(inp)(itr1*1))
   880  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   881  	polyMulStage1_AVX2
   882  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   883  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
   884  	polyMulStage2_AVX2
   885  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   886  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   887  	polyMulStage3_AVX2
   888  	VMOVDQA  CC3, tmpStoreAVX2
   889  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   890  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   891  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   892  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   893  	VMOVDQA  tmpStoreAVX2, CC3
   894  	polyMulReduceStage
   895  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   896  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   897  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
   898  	polyAdd(2*8(inp)(itr1*1))
   899  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   900  	polyMulStage1_AVX2
   901  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   902  	VMOVDQA  CC3, tmpStoreAVX2
   903  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   904  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   905  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   906  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   907  	VMOVDQA  tmpStoreAVX2, CC3
   908  	polyMulStage2_AVX2
   909  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
   910  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   911  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
   912  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   913  	polyMulStage3_AVX2
   914  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   915  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
   916  	polyMulReduceStage
   917  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   918  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   919  	polyAdd(4*8(inp)(itr1*1))
   920  	LEAQ     (6*8)(itr1), itr1
   921  	VMOVDQA  CC3, tmpStoreAVX2
   922  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   923  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   924  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   925  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   926  	VMOVDQA  tmpStoreAVX2, CC3
   927  	polyMulStage1_AVX2
   928  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   929  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   930  	polyMulStage2_AVX2
   931  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
   932  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   933  	polyMulStage3_AVX2
   934  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   935  	VMOVDQA  CC3, tmpStoreAVX2
   936  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   937  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   938  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   939  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   940  	VMOVDQA  tmpStoreAVX2, CC3
   941  	polyMulReduceStage
   942  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
   943  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   944  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
   945  	CMPQ     itr1, $480
   946  	JNE      openAVX2InternalLoop
   947  
   948  	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
   949  	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
   950  	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
   951  	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
   952  	VMOVDQA CC3, tmpStoreAVX2
   953  
   954  	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
   955  	polyAdd(480(inp))
   956  	polyMulAVX2
   957  	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
   958  	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
   959  	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
   960  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
   961  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
   962  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
   963  
   964  	// and here
   965  	polyAdd(496(inp))
   966  	polyMulAVX2
   967  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
   968  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
   969  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
   970  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
   971  	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
   972  	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
   973  	LEAQ       (32*16)(inp), inp
   974  	LEAQ       (32*16)(oup), oup
   975  	SUBQ       $(32*16), inl
   976  	JMP        openAVX2MainLoop
   977  
   978  openAVX2MainLoopDone:
   979  	// Handle the various tail sizes efficiently
   980  	TESTQ inl, inl
   981  	JE    openSSEFinalize
   982  	CMPQ  inl, $128
   983  	JBE   openAVX2Tail128
   984  	CMPQ  inl, $256
   985  	JBE   openAVX2Tail256
   986  	CMPQ  inl, $384
   987  	JBE   openAVX2Tail384
   988  	JMP   openAVX2Tail512
   989  
   990  // ----------------------------------------------------------------------------
   991  // Special optimization for buffers smaller than 193 bytes
   992  openAVX2192:
   993  	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
   994  	VMOVDQA AA0, AA1
   995  	VMOVDQA BB0, BB1
   996  	VMOVDQA CC0, CC1
   997  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
   998  	VMOVDQA AA0, AA2
   999  	VMOVDQA BB0, BB2
  1000  	VMOVDQA CC0, CC2
  1001  	VMOVDQA DD0, DD2
  1002  	VMOVDQA DD1, TT3
  1003  	MOVQ    $10, itr2
  1004  
  1005  openAVX2192InnerCipherLoop:
  1006  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1007  	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  1008  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1009  	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  1010  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1011  	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  1012  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1013  	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  1014  	DECQ       itr2
  1015  	JNE        openAVX2192InnerCipherLoop
  1016  	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
  1017  	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
  1018  	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
  1019  	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  1020  	VPERM2I128 $0x02, AA0, BB0, TT0
  1021  
  1022  	// Clamp and store poly key
  1023  	VPAND   ·polyClampMask<>(SB), TT0, TT0
  1024  	VMOVDQA TT0, rsStoreAVX2
  1025  
  1026  	// Stream for up to 192 bytes
  1027  	VPERM2I128 $0x13, AA0, BB0, AA0
  1028  	VPERM2I128 $0x13, CC0, DD0, BB0
  1029  	VPERM2I128 $0x02, AA1, BB1, CC0
  1030  	VPERM2I128 $0x02, CC1, DD1, DD0
  1031  	VPERM2I128 $0x13, AA1, BB1, AA1
  1032  	VPERM2I128 $0x13, CC1, DD1, BB1
  1033  
  1034  openAVX2ShortOpen:
  1035  	// Hash
  1036  	MOVQ ad_len+80(FP), itr2
  1037  	CALL polyHashADInternal<>(SB)
  1038  
  1039  openAVX2ShortOpenLoop:
  1040  	CMPQ inl, $32
  1041  	JB   openAVX2ShortTail32
  1042  	SUBQ $32, inl
  1043  
  1044  	// Load for hashing
  1045  	polyAdd(0*8(inp))
  1046  	polyMulAVX2
  1047  	polyAdd(2*8(inp))
  1048  	polyMulAVX2
  1049  
  1050  	// Load for decryption
  1051  	VPXOR   (inp), AA0, AA0
  1052  	VMOVDQU AA0, (oup)
  1053  	LEAQ    (1*32)(inp), inp
  1054  	LEAQ    (1*32)(oup), oup
  1055  
  1056  	// Shift stream left
  1057  	VMOVDQA BB0, AA0
  1058  	VMOVDQA CC0, BB0
  1059  	VMOVDQA DD0, CC0
  1060  	VMOVDQA AA1, DD0
  1061  	VMOVDQA BB1, AA1
  1062  	VMOVDQA CC1, BB1
  1063  	VMOVDQA DD1, CC1
  1064  	VMOVDQA AA2, DD1
  1065  	VMOVDQA BB2, AA2
  1066  	JMP     openAVX2ShortOpenLoop
  1067  
  1068  openAVX2ShortTail32:
  1069  	CMPQ    inl, $16
  1070  	VMOVDQA A0, A1
  1071  	JB      openAVX2ShortDone
  1072  
  1073  	SUBQ $16, inl
  1074  
  1075  	// Load for hashing
  1076  	polyAdd(0*8(inp))
  1077  	polyMulAVX2
  1078  
  1079  	// Load for decryption
  1080  	VPXOR      (inp), A0, T0
  1081  	VMOVDQU    T0, (oup)
  1082  	LEAQ       (1*16)(inp), inp
  1083  	LEAQ       (1*16)(oup), oup
  1084  	VPERM2I128 $0x11, AA0, AA0, AA0
  1085  	VMOVDQA    A0, A1
  1086  
  1087  openAVX2ShortDone:
  1088  	VZEROUPPER
  1089  	JMP openSSETail16
  1090  
  1091  // ----------------------------------------------------------------------------
  1092  // Special optimization for buffers smaller than 321 bytes
  1093  openAVX2320:
  1094  	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
  1095  	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  1096  	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  1097  	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  1098  	MOVQ    $10, itr2
  1099  
  1100  openAVX2320InnerCipherLoop:
  1101  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1102  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  1103  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1104  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  1105  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1106  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  1107  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1108  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  1109  	DECQ     itr2
  1110  	JNE      openAVX2320InnerCipherLoop
  1111  
  1112  	VMOVDQA ·chacha20Constants<>(SB), TT0
  1113  	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  1114  	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  1115  	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  1116  	VMOVDQA ·avx2IncMask<>(SB), TT0
  1117  	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  1118  	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  1119  	VPADDD  TT3, DD2, DD2
  1120  
  1121  	// Clamp and store poly key
  1122  	VPERM2I128 $0x02, AA0, BB0, TT0
  1123  	VPAND      ·polyClampMask<>(SB), TT0, TT0
  1124  	VMOVDQA    TT0, rsStoreAVX2
  1125  
  1126  	// Stream for up to 320 bytes
  1127  	VPERM2I128 $0x13, AA0, BB0, AA0
  1128  	VPERM2I128 $0x13, CC0, DD0, BB0
  1129  	VPERM2I128 $0x02, AA1, BB1, CC0
  1130  	VPERM2I128 $0x02, CC1, DD1, DD0
  1131  	VPERM2I128 $0x13, AA1, BB1, AA1
  1132  	VPERM2I128 $0x13, CC1, DD1, BB1
  1133  	VPERM2I128 $0x02, AA2, BB2, CC1
  1134  	VPERM2I128 $0x02, CC2, DD2, DD1
  1135  	VPERM2I128 $0x13, AA2, BB2, AA2
  1136  	VPERM2I128 $0x13, CC2, DD2, BB2
  1137  	JMP        openAVX2ShortOpen
  1138  
  1139  // ----------------------------------------------------------------------------
  1140  // Special optimization for the last 128 bytes of ciphertext
  1141  openAVX2Tail128:
  1142  	// Need to decrypt up to 128 bytes - prepare two blocks
  1143  	VMOVDQA ·chacha20Constants<>(SB), AA1
  1144  	VMOVDQA state1StoreAVX2, BB1
  1145  	VMOVDQA state2StoreAVX2, CC1
  1146  	VMOVDQA ctr3StoreAVX2, DD1
  1147  	VPADDD  ·avx2IncMask<>(SB), DD1, DD1
  1148  	VMOVDQA DD1, DD0
  1149  
  1150  	XORQ  itr2, itr2
  1151  	MOVQ  inl, itr1
  1152  	ANDQ  $-16, itr1
  1153  	TESTQ itr1, itr1
  1154  	JE    openAVX2Tail128LoopB
  1155  
  1156  openAVX2Tail128LoopA:
  1157  	// Perform ChaCha rounds, while hashing the remaining input
  1158  	polyAdd(0(inp)(itr2*1))
  1159  	polyMulAVX2
  1160  
  1161  openAVX2Tail128LoopB:
  1162  	ADDQ     $16, itr2
  1163  	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1164  	VPALIGNR $4, BB1, BB1, BB1
  1165  	VPALIGNR $8, CC1, CC1, CC1
  1166  	VPALIGNR $12, DD1, DD1, DD1
  1167  	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1168  	VPALIGNR $12, BB1, BB1, BB1
  1169  	VPALIGNR $8, CC1, CC1, CC1
  1170  	VPALIGNR $4, DD1, DD1, DD1
  1171  	CMPQ     itr2, itr1
  1172  	JB       openAVX2Tail128LoopA
  1173  	CMPQ     itr2, $160
  1174  	JNE      openAVX2Tail128LoopB
  1175  
  1176  	VPADDD     ·chacha20Constants<>(SB), AA1, AA1
  1177  	VPADDD     state1StoreAVX2, BB1, BB1
  1178  	VPADDD     state2StoreAVX2, CC1, CC1
  1179  	VPADDD     DD0, DD1, DD1
  1180  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1181  
  1182  openAVX2TailLoop:
  1183  	CMPQ inl, $32
  1184  	JB   openAVX2Tail
  1185  	SUBQ $32, inl
  1186  
  1187  	// Load for decryption
  1188  	VPXOR   (inp), AA0, AA0
  1189  	VMOVDQU AA0, (oup)
  1190  	LEAQ    (1*32)(inp), inp
  1191  	LEAQ    (1*32)(oup), oup
  1192  	VMOVDQA BB0, AA0
  1193  	VMOVDQA CC0, BB0
  1194  	VMOVDQA DD0, CC0
  1195  	JMP     openAVX2TailLoop
  1196  
  1197  openAVX2Tail:
  1198  	CMPQ    inl, $16
  1199  	VMOVDQA A0, A1
  1200  	JB      openAVX2TailDone
  1201  	SUBQ    $16, inl
  1202  
  1203  	// Load for decryption
  1204  	VPXOR      (inp), A0, T0
  1205  	VMOVDQU    T0, (oup)
  1206  	LEAQ       (1*16)(inp), inp
  1207  	LEAQ       (1*16)(oup), oup
  1208  	VPERM2I128 $0x11, AA0, AA0, AA0
  1209  	VMOVDQA    A0, A1
  1210  
  1211  openAVX2TailDone:
  1212  	VZEROUPPER
  1213  	JMP openSSETail16
  1214  
  1215  // ----------------------------------------------------------------------------
  1216  // Special optimization for the last 256 bytes of ciphertext
  1217  openAVX2Tail256:
  1218  	// Need to decrypt up to 256 bytes - prepare four blocks
  1219  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
  1220  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
  1221  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
  1222  	VMOVDQA ctr3StoreAVX2, DD0
  1223  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  1224  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  1225  	VMOVDQA DD0, TT1
  1226  	VMOVDQA DD1, TT2
  1227  
  1228  	// Compute the number of iterations that will hash data
  1229  	MOVQ    inl, tmpStoreAVX2
  1230  	MOVQ    inl, itr1
  1231  	SUBQ    $128, itr1
  1232  	SHRQ    $4, itr1
  1233  	MOVQ    $10, itr2
  1234  	CMPQ    itr1, $10
  1235  	CMOVQGT itr2, itr1
  1236  	MOVQ    inp, inl
  1237  	XORQ    itr2, itr2
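// The block above computes how many of the 10 double-round iterations will also
// hash a 16-byte chunk of the remaining input, roughly (hypothetical helper):
//
//	func hashIters(inl int) int {
//		iters := (inl - 128) / 16
//		if iters > 10 {
//			iters = 10
//		}
//		return iters
//	}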
  1238  
  1239  openAVX2Tail256LoopA:
  1240  	polyAdd(0(inl))
  1241  	polyMulAVX2
  1242  	LEAQ 16(inl), inl
  1243  
  1244  	// Perform ChaCha rounds, while hashing the remaining input
  1245  openAVX2Tail256LoopB:
  1246  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1247  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  1248  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1249  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  1250  	INCQ     itr2
  1251  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1252  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  1253  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1254  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  1255  	CMPQ     itr2, itr1
  1256  	JB       openAVX2Tail256LoopA
  1257  
  1258  	CMPQ itr2, $10
  1259  	JNE  openAVX2Tail256LoopB
  1260  
  1261  	MOVQ inl, itr2
  1262  	SUBQ inp, inl
  1263  	MOVQ inl, itr1
  1264  	MOVQ tmpStoreAVX2, inl
  1265  
  1266  	// Hash the remainder of data (if any)
  1267  openAVX2Tail256Hash:
  1268  	ADDQ $16, itr1
  1269  	CMPQ itr1, inl
  1270  	JGT  openAVX2Tail256HashEnd
  1271  	polyAdd(0(itr2))
  1272  	polyMulAVX2
  1273  	LEAQ 16(itr2), itr2
  1274  	JMP  openAVX2Tail256Hash
  1275  
  1276  // Store 128 bytes safely, then go to store loop
  1277  openAVX2Tail256HashEnd:
  1278  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  1279  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  1280  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  1281  	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  1282  	VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
  1283  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1284  
  1285  	VPXOR   (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
  1286  	VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
  1287  	LEAQ    (4*32)(inp), inp
  1288  	LEAQ    (4*32)(oup), oup
  1289  	SUBQ    $4*32, inl
  1290  
  1291  	JMP openAVX2TailLoop
  1292  
  1293  // ----------------------------------------------------------------------------
  1294  // Special optimization for the last 384 bytes of ciphertext
  1295  openAVX2Tail384:
  1296  	// Need to decrypt up to 384 bytes - prepare six blocks
  1297  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  1298  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  1299  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  1300  	VMOVDQA ctr3StoreAVX2, DD0
  1301  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  1302  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  1303  	VPADDD  ·avx2IncMask<>(SB), DD1, DD2
  1304  	VMOVDQA DD0, ctr0StoreAVX2
  1305  	VMOVDQA DD1, ctr1StoreAVX2
  1306  	VMOVDQA DD2, ctr2StoreAVX2
  1307  
  1308  	// Compute the number of iterations that will hash two blocks of data
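        	// itr1 = min((inl-256)/16 + 6, 10): iterations below itr1 hash two 16-byte
        	// blocks each, the remaining iterations up to ten hash one.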
  1309  	MOVQ    inl, tmpStoreAVX2
  1310  	MOVQ    inl, itr1
  1311  	SUBQ    $256, itr1
  1312  	SHRQ    $4, itr1
  1313  	ADDQ    $6, itr1
  1314  	MOVQ    $10, itr2
  1315  	CMPQ    itr1, $10
  1316  	CMOVQGT itr2, itr1
  1317  	MOVQ    inp, inl
  1318  	XORQ    itr2, itr2
  1319  
  1320  	// Perform ChaCha rounds, while hashing the remaining input
  1321  openAVX2Tail384LoopB:
  1322  	polyAdd(0(inl))
  1323  	polyMulAVX2
  1324  	LEAQ 16(inl), inl
  1325  
  1326  openAVX2Tail384LoopA:
  1327  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1328  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  1329  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1330  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  1331  	polyAdd(0(inl))
  1332  	polyMulAVX2
  1333  	LEAQ     16(inl), inl
  1334  	INCQ     itr2
  1335  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1336  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  1337  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1338  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  1339  
  1340  	CMPQ itr2, itr1
  1341  	JB   openAVX2Tail384LoopB
  1342  
  1343  	CMPQ itr2, $10
  1344  	JNE  openAVX2Tail384LoopA
  1345  
  1346  	MOVQ inl, itr2
  1347  	SUBQ inp, inl
  1348  	MOVQ inl, itr1
  1349  	MOVQ tmpStoreAVX2, inl
  1350  
  1351  openAVX2Tail384Hash:
  1352  	ADDQ $16, itr1
  1353  	CMPQ itr1, inl
  1354  	JGT  openAVX2Tail384HashEnd
  1355  	polyAdd(0(itr2))
  1356  	polyMulAVX2
  1357  	LEAQ 16(itr2), itr2
  1358  	JMP  openAVX2Tail384Hash
  1359  
  1360  // Store 256 bytes safely, then go to store loop
  1361  openAVX2Tail384HashEnd:
  1362  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  1363  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  1364  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  1365  	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
  1366  	VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
  1367  	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  1368  	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  1369  	VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
  1370  	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  1371  	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  1372  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1373  	LEAQ       (8*32)(inp), inp
  1374  	LEAQ       (8*32)(oup), oup
  1375  	SUBQ       $8*32, inl
  1376  	JMP        openAVX2TailLoop
  1377  
  1378  // ----------------------------------------------------------------------------
  1379  // Special optimization for the last 512 bytes of ciphertext
  1380  openAVX2Tail512:
  1381  	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1382  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  1383  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  1384  	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  1385  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  1386  	XORQ    itr1, itr1
  1387  	MOVQ    inp, itr2
  1388  
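        	// itr1 counts the double-round iterations below; itr2 walks the ciphertext
        	// that is hashed alongside them.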
  1389  openAVX2Tail512LoopB:
  1390  	polyAdd(0(itr2))
  1391  	polyMulAVX2
  1392  	LEAQ (2*8)(itr2), itr2
  1393  
  1394  openAVX2Tail512LoopA:
  1395  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1396  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1397  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1398  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1399  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1400  	VMOVDQA  CC3, tmpStoreAVX2
  1401  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1402  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1403  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1404  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1405  	VMOVDQA  tmpStoreAVX2, CC3
  1406  	polyAdd(0*8(itr2))
  1407  	polyMulAVX2
  1408  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1409  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1410  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1411  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1412  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1413  	VMOVDQA  CC3, tmpStoreAVX2
  1414  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1415  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1416  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1417  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1418  	VMOVDQA  tmpStoreAVX2, CC3
  1419  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  1420  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1421  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  1422  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1423  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1424  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1425  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1426  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1427  	polyAdd(2*8(itr2))
  1428  	polyMulAVX2
  1429  	LEAQ     (4*8)(itr2), itr2
  1430  	VMOVDQA  CC3, tmpStoreAVX2
  1431  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1432  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1433  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1434  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1435  	VMOVDQA  tmpStoreAVX2, CC3
  1436  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1437  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1438  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1439  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1440  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1441  	VMOVDQA  CC3, tmpStoreAVX2
  1442  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1443  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1444  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1445  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1446  	VMOVDQA  tmpStoreAVX2, CC3
  1447  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  1448  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1449  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  1450  	INCQ     itr1
  1451  	CMPQ     itr1, $4
  1452  	JLT      openAVX2Tail512LoopB
  1453  
  1454  	CMPQ itr1, $10
  1455  	JNE  openAVX2Tail512LoopA
  1456  
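        	// The interleaved rounds above already hashed 384 bytes (four passes through
        	// LoopB at 48 bytes, then six through LoopA alone at 32 bytes); hash whatever
        	// full 16-byte blocks remain beyond that.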
  1457  	MOVQ inl, itr1
  1458  	SUBQ $384, itr1
  1459  	ANDQ $-16, itr1
  1460  
  1461  openAVX2Tail512HashLoop:
  1462  	TESTQ itr1, itr1
  1463  	JE    openAVX2Tail512HashEnd
  1464  	polyAdd(0(itr2))
  1465  	polyMulAVX2
  1466  	LEAQ  16(itr2), itr2
  1467  	SUBQ  $16, itr1
  1468  	JMP   openAVX2Tail512HashLoop
  1469  
  1470  openAVX2Tail512HashEnd:
  1471  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  1472  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  1473  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  1474  	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  1475  	VMOVDQA    CC3, tmpStoreAVX2
  1476  	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  1477  	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  1478  	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  1479  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1480  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  1481  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  1482  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1483  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  1484  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  1485  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  1486  
  1487  	LEAQ (12*32)(inp), inp
  1488  	LEAQ (12*32)(oup), oup
  1489  	SUBQ $12*32, inl
  1490  
  1491  	JMP openAVX2TailLoop
  1492  
  1493  // ----------------------------------------------------------------------------
  1494  // ----------------------------------------------------------------------------
  1495  // func chacha20Poly1305Seal(dst, key, src, ad []byte)
  1496  TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
  1497  	// For aligned stack access
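        	// (BP is rounded up to a 32-byte boundary, so the aligned MOVO/VMOVDQA
        	// accesses to the stack scratch area below are safe)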
  1498  	MOVQ SP, BP
  1499  	ADDQ $32, BP
  1500  	ANDQ $-32, BP
  1501  	MOVQ dst+0(FP), oup
  1502  	MOVQ key+24(FP), keyp
  1503  	MOVQ src+48(FP), inp
  1504  	MOVQ src_len+56(FP), inl
  1505  	MOVQ ad+72(FP), adp
  1506  
  1507  	CMPB ·useAVX2(SB), $1
  1508  	JE   chacha20Poly1305Seal_AVX2
  1509  
  1510  	// Special optimization for very short buffers
  1511  	CMPQ inl, $128
  1512  	JBE  sealSSE128 // About 15% faster
  1513  
  1514  	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
  1515  	MOVOU ·chacha20Constants<>(SB), A0
  1516  	MOVOU (1*16)(keyp), B0
  1517  	MOVOU (2*16)(keyp), C0
  1518  	MOVOU (3*16)(keyp), D0
  1519  
  1520  	// Store state on stack for future use
  1521  	MOVO B0, state1Store
  1522  	MOVO C0, state2Store
  1523  
  1524  	// Load state, increment counter blocks
  1525  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1526  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1527  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1528  
  1529  	// Store counters
  1530  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1531  	MOVQ $10, itr2
  1532  
  1533  sealSSEIntroLoop:
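        	// Only 16 XMM registers are available, so C3 (and then C1) is spilled to
        	// tmpStore and doubles as the scratch register for the chachaQR macro.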
  1534  	MOVO         C3, tmpStore
  1535  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1536  	MOVO         tmpStore, C3
  1537  	MOVO         C1, tmpStore
  1538  	chachaQR(A3, B3, C3, D3, C1)
  1539  	MOVO         tmpStore, C1
  1540  	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
  1541  	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
  1542  	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
  1543  
  1544  	MOVO          C3, tmpStore
  1545  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1546  	MOVO          tmpStore, C3
  1547  	MOVO          C1, tmpStore
  1548  	chachaQR(A3, B3, C3, D3, C1)
  1549  	MOVO          tmpStore, C1
  1550  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1551  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1552  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1553  	DECQ          itr2
  1554  	JNE           sealSSEIntroLoop
  1555  
  1556  	// Add in the state
  1557  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1558  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1559  	PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1560  	PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  1561  
  1562  	// Clamp and store the key
  1563  	PAND ·polyClampMask<>(SB), A0
  1564  	MOVO A0, rStore
  1565  	MOVO B0, sStore
  1566  
  1567  	// Hash AAD
  1568  	MOVQ ad_len+80(FP), itr2
  1569  	CALL polyHashADInternal<>(SB)
  1570  
  1571  	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1572  	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1573  	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
  1574  	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1575  	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1576  	MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
  1577  
  1578  	MOVQ $128, itr1
  1579  	SUBQ $128, inl
  1580  	LEAQ 128(inp), inp
  1581  
  1582  	MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
  1583  
  1584  	CMPQ inl, $64
  1585  	JBE  sealSSE128SealHash
  1586  
  1587  	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1588  	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1589  	MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
  1590  
  1591  	ADDQ $64, itr1
  1592  	SUBQ $64, inl
  1593  	LEAQ 64(inp), inp
  1594  
  1595  	MOVQ $2, itr1
  1596  	MOVQ $8, itr2
  1597  
  1598  	CMPQ inl, $64
  1599  	JBE  sealSSETail64
  1600  	CMPQ inl, $128
  1601  	JBE  sealSSETail128
  1602  	CMPQ inl, $192
  1603  	JBE  sealSSETail192
  1604  
  1605  sealSSEMainLoop:
  1606  	// Load state, increment counter blocks
  1607  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
  1608  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1609  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1610  	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1611  
  1612  	// Store counters
  1613  	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1614  
  1615  sealSSEInnerLoop:
  1616  	MOVO          C3, tmpStore
  1617  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1618  	MOVO          tmpStore, C3
  1619  	MOVO          C1, tmpStore
  1620  	chachaQR(A3, B3, C3, D3, C1)
  1621  	MOVO          tmpStore, C1
  1622  	polyAdd(0(oup))
  1623  	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
  1624  	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
  1625  	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
  1626  	polyMulStage1
  1627  	polyMulStage2
  1628  	LEAQ          (2*8)(oup), oup
  1629  	MOVO          C3, tmpStore
  1630  	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1631  	MOVO          tmpStore, C3
  1632  	MOVO          C1, tmpStore
  1633  	polyMulStage3
  1634  	chachaQR(A3, B3, C3, D3, C1)
  1635  	MOVO          tmpStore, C1
  1636  	polyMulReduceStage
  1637  	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1638  	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1639  	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1640  	DECQ          itr2
  1641  	JGE           sealSSEInnerLoop
  1642  	polyAdd(0(oup))
  1643  	polyMul
  1644  	LEAQ          (2*8)(oup), oup
  1645  	DECQ          itr1
  1646  	JG            sealSSEInnerLoop
  1647  
  1648  	// Add in the state
  1649  	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1650  	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1651  	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1652  	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  1653  	MOVO  D3, tmpStore
  1654  
  1655  	// Load - xor - store
  1656  	MOVOU (0*16)(inp), D3; PXOR D3, A0
  1657  	MOVOU (1*16)(inp), D3; PXOR D3, B0
  1658  	MOVOU (2*16)(inp), D3; PXOR D3, C0
  1659  	MOVOU (3*16)(inp), D3; PXOR D3, D0
  1660  	MOVOU A0, (0*16)(oup)
  1661  	MOVOU B0, (1*16)(oup)
  1662  	MOVOU C0, (2*16)(oup)
  1663  	MOVOU D0, (3*16)(oup)
  1664  	MOVO  tmpStore, D3
  1665  
  1666  	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1667  	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1668  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1669  	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
  1670  	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1671  	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
  1672  	ADDQ  $192, inp
  1673  	MOVQ  $192, itr1
  1674  	SUBQ  $192, inl
  1675  	MOVO  A3, A1
  1676  	MOVO  B3, B1
  1677  	MOVO  C3, C1
  1678  	MOVO  D3, D1
  1679  	CMPQ  inl, $64
  1680  	JBE   sealSSE128SealHash
  1681  	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1682  	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1683  	MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
  1684  	LEAQ  64(inp), inp
  1685  	SUBQ  $64, inl
  1686  	MOVQ  $6, itr1
  1687  	MOVQ  $4, itr2
  1688  	CMPQ  inl, $192
  1689  	JG    sealSSEMainLoop
  1690  
  1691  	MOVQ  inl, itr1
  1692  	TESTQ inl, inl
  1693  	JE    sealSSE128SealHash
  1694  	MOVQ  $6, itr1
  1695  	CMPQ  inl, $64
  1696  	JBE   sealSSETail64
  1697  	CMPQ  inl, $128
  1698  	JBE   sealSSETail128
  1699  	JMP   sealSSETail192
  1700  
  1701  // ----------------------------------------------------------------------------
  1702  // Special optimization for the last 64 bytes of plaintext
  1703  sealSSETail64:
  1704  	// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
  1705  	MOVO  ·chacha20Constants<>(SB), A1
  1706  	MOVO  state1Store, B1
  1707  	MOVO  state2Store, C1
  1708  	MOVO  ctr3Store, D1
  1709  	PADDL ·sseIncMask<>(SB), D1
  1710  	MOVO  D1, ctr0Store
  1711  
  1712  sealSSETail64LoopA:
  1713  	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1714  	polyAdd(0(oup))
  1715  	polyMul
  1716  	LEAQ 16(oup), oup
  1717  
  1718  sealSSETail64LoopB:
  1719  	chachaQR(A1, B1, C1, D1, T1)
  1720  	shiftB1Left;  shiftC1Left; shiftD1Left
  1721  	chachaQR(A1, B1, C1, D1, T1)
  1722  	shiftB1Right; shiftC1Right; shiftD1Right
  1723  	polyAdd(0(oup))
  1724  	polyMul
  1725  	LEAQ          16(oup), oup
  1726  
  1727  	DECQ itr1
  1728  	JG   sealSSETail64LoopA
  1729  
  1730  	DECQ  itr2
  1731  	JGE   sealSSETail64LoopB
  1732  	PADDL ·chacha20Constants<>(SB), A1
  1733  	PADDL state1Store, B1
  1734  	PADDL state2Store, C1
  1735  	PADDL ctr0Store, D1
  1736  
  1737  	JMP sealSSE128Seal
  1738  
  1739  // ----------------------------------------------------------------------------
  1740  // Special optimization for the last 128 bytes of plaintext
  1741  sealSSETail128:
  1742  	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
  1743  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1744  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1745  
  1746  sealSSETail128LoopA:
  1747  	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1748  	polyAdd(0(oup))
  1749  	polyMul
  1750  	LEAQ 16(oup), oup
  1751  
  1752  sealSSETail128LoopB:
  1753  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1754  	shiftB0Left;  shiftC0Left; shiftD0Left
  1755  	shiftB1Left;  shiftC1Left; shiftD1Left
  1756  	polyAdd(0(oup))
  1757  	polyMul
  1758  	LEAQ          16(oup), oup
  1759  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1760  	shiftB0Right; shiftC0Right; shiftD0Right
  1761  	shiftB1Right; shiftC1Right; shiftD1Right
  1762  
  1763  	DECQ itr1
  1764  	JG   sealSSETail128LoopA
  1765  
  1766  	DECQ itr2
  1767  	JGE  sealSSETail128LoopB
  1768  
  1769  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
  1770  	PADDL state1Store, B0; PADDL state1Store, B1
  1771  	PADDL state2Store, C0; PADDL state2Store, C1
  1772  	PADDL ctr0Store, D0; PADDL ctr1Store, D1
  1773  
  1774  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1775  	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1776  	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1777  
  1778  	MOVQ $64, itr1
  1779  	LEAQ 64(inp), inp
  1780  	SUBQ $64, inl
  1781  
  1782  	JMP sealSSE128SealHash
  1783  
  1784  // ----------------------------------------------------------------------------
  1785  // Special optimization for the last 192 bytes of plaintext
  1786  sealSSETail192:
  1787  	// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
  1788  	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1789  	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1790  	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
  1791  
  1792  sealSSETail192LoopA:
  1793  	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1794  	polyAdd(0(oup))
  1795  	polyMul
  1796  	LEAQ 16(oup), oup
  1797  
  1798  sealSSETail192LoopB:
  1799  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1800  	shiftB0Left; shiftC0Left; shiftD0Left
  1801  	shiftB1Left; shiftC1Left; shiftD1Left
  1802  	shiftB2Left; shiftC2Left; shiftD2Left
  1803  
  1804  	polyAdd(0(oup))
  1805  	polyMul
  1806  	LEAQ 16(oup), oup
  1807  
  1808  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1809  	shiftB0Right; shiftC0Right; shiftD0Right
  1810  	shiftB1Right; shiftC1Right; shiftD1Right
  1811  	shiftB2Right; shiftC2Right; shiftD2Right
  1812  
  1813  	DECQ itr1
  1814  	JG   sealSSETail192LoopA
  1815  
  1816  	DECQ itr2
  1817  	JGE  sealSSETail192LoopB
  1818  
  1819  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1820  	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
  1821  	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
  1822  	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
  1823  
  1824  	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1825  	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1826  	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1827  	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
  1828  	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
  1829  	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1830  
  1831  	MOVO A2, A1
  1832  	MOVO B2, B1
  1833  	MOVO C2, C1
  1834  	MOVO D2, D1
  1835  	MOVQ $128, itr1
  1836  	LEAQ 128(inp), inp
  1837  	SUBQ $128, inl
  1838  
  1839  	JMP sealSSE128SealHash
  1840  
  1841  // ----------------------------------------------------------------------------
  1842  // Special seal optimization for buffers smaller than 129 bytes
  1843  sealSSE128:
  1844  	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
  1845  	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
  1846  	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1847  	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1848  	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
  1849  	MOVQ  $10, itr2
  1850  
  1851  sealSSE128InnerCipherLoop:
  1852  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1853  	shiftB0Left;  shiftB1Left; shiftB2Left
  1854  	shiftC0Left;  shiftC1Left; shiftC2Left
  1855  	shiftD0Left;  shiftD1Left; shiftD2Left
  1856  	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1857  	shiftB0Right; shiftB1Right; shiftB2Right
  1858  	shiftC0Right; shiftC1Right; shiftC2Right
  1859  	shiftD0Right; shiftD1Right; shiftD2Right
  1860  	DECQ          itr2
  1861  	JNE           sealSSE128InnerCipherLoop
  1862  
  1863  	// A0|B0 hold the Poly1305 32-byte key; C0, D0 can be discarded
  1864  	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1865  	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
  1866  	PADDL T2, C1; PADDL T2, C2
  1867  	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
  1868  	PAND  ·polyClampMask<>(SB), A0
  1869  	MOVOU A0, rStore
  1870  	MOVOU B0, sStore
  1871  
  1872  	// Hash
  1873  	MOVQ ad_len+80(FP), itr2
  1874  	CALL polyHashADInternal<>(SB)
  1875  	XORQ itr1, itr1
  1876  
  1877  sealSSE128SealHash:
  1878  	// itr1 holds the number of bytes encrypted but not yet hashed
  1879  	CMPQ itr1, $16
  1880  	JB   sealSSE128Seal
  1881  	polyAdd(0(oup))
  1882  	polyMul
  1883  
  1884  	SUBQ $16, itr1
  1885  	ADDQ $16, oup
  1886  
  1887  	JMP sealSSE128SealHash
  1888  
  1889  sealSSE128Seal:
  1890  	CMPQ inl, $16
  1891  	JB   sealSSETail
  1892  	SUBQ $16, inl
  1893  
  1894  	// Load for encryption
  1895  	MOVOU (inp), T0
  1896  	PXOR  T0, A1
  1897  	MOVOU A1, (oup)
  1898  	LEAQ  (1*16)(inp), inp
  1899  	LEAQ  (1*16)(oup), oup
  1900  
  1901  	// Extract for hashing
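        	// (the 16-byte ciphertext block enters Poly1305 as two 64-bit limbs, with
        	// ADCQ $1, acc2 supplying the 2^128 padding bit of a full block)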
  1902  	MOVQ   A1, t0
  1903  	PSRLDQ $8, A1
  1904  	MOVQ   A1, t1
  1905  	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1906  	polyMul
  1907  
  1908  	// Shift the stream "left"
  1909  	MOVO B1, A1
  1910  	MOVO C1, B1
  1911  	MOVO D1, C1
  1912  	MOVO A2, D1
  1913  	MOVO B2, A2
  1914  	MOVO C2, B2
  1915  	MOVO D2, C2
  1916  	JMP  sealSSE128Seal
  1917  
  1918  sealSSETail:
  1919  	TESTQ inl, inl
  1920  	JE    sealSSEFinalize
  1921  
  1922  	// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
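        	// The loop below builds the remaining bytes into t3:t2, reading the input
        	// backwards from its last byte. The block is then XORed with the keystream,
        	// stored, and masked with an ·andMask<> entry (selected by itr2 = inl*16) so
        	// that only the inl real ciphertext bytes are hashed.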
  1923  	MOVQ inl, itr2
  1924  	SHLQ $4, itr2
  1925  	LEAQ ·andMask<>(SB), t0
  1926  	MOVQ inl, itr1
  1927  	LEAQ -1(inp)(inl*1), inp
  1928  	XORQ t2, t2
  1929  	XORQ t3, t3
  1930  	XORQ AX, AX
  1931  
  1932  sealSSETailLoadLoop:
  1933  	SHLQ   $8, t2, t3
  1934  	SHLQ   $8, t2
  1935  	MOVB   (inp), AX
  1936  	XORQ   AX, t2
  1937  	LEAQ   -1(inp), inp
  1938  	DECQ   itr1
  1939  	JNE    sealSSETailLoadLoop
  1940  	MOVQ   t2, 0+tmpStore
  1941  	MOVQ   t3, 8+tmpStore
  1942  	PXOR   0+tmpStore, A1
  1943  	MOVOU  A1, (oup)
  1944  	MOVOU  -16(t0)(itr2*1), T0
  1945  	PAND   T0, A1
  1946  	MOVQ   A1, t0
  1947  	PSRLDQ $8, A1
  1948  	MOVQ   A1, t1
  1949  	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1950  	polyMul
  1951  
  1952  	ADDQ inl, oup
  1953  
  1954  sealSSEFinalize:
  1955  	// Hash in the buffer lengths
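        	// (the final Poly1305 block is the AAD length and the ciphertext length as
        	// two little-endian 64-bit values, added straight into the accumulator along
        	// with the 2^128 padding bit)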
  1956  	ADDQ ad_len+80(FP), acc0
  1957  	ADCQ src_len+56(FP), acc1
  1958  	ADCQ $1, acc2
  1959  	polyMul
  1960  
  1961  	// Final reduce
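        	// SUBQ $-5/SBBQ $-1/SBBQ $3 subtract p = 2^130 - 5 limb-wise (its limbs are
        	// 2^64-5, 2^64-1 and 3). If the subtraction borrows, acc was already below p
        	// and the CMOVQCS instructions restore the saved copy.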
  1962  	MOVQ    acc0, t0
  1963  	MOVQ    acc1, t1
  1964  	MOVQ    acc2, t2
  1965  	SUBQ    $-5, acc0
  1966  	SBBQ    $-1, acc1
  1967  	SBBQ    $3, acc2
  1968  	CMOVQCS t0, acc0
  1969  	CMOVQCS t1, acc1
  1970  	CMOVQCS t2, acc2
  1971  
  1972  	// Add in the "s" part of the key
  1973  	ADDQ 0+sStore, acc0
  1974  	ADCQ 8+sStore, acc1
  1975  
  1976  	// Finally store the tag at the end of the message
  1977  	MOVQ acc0, (0*8)(oup)
  1978  	MOVQ acc1, (1*8)(oup)
  1979  	RET
  1980  
  1981  // ----------------------------------------------------------------------------
  1982  // ------------------------- AVX2 Code ----------------------------------------
  1983  chacha20Poly1305Seal_AVX2:
  1984  	VZEROUPPER
  1985  	VMOVDQU ·chacha20Constants<>(SB), AA0
  1986  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
  1987  	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
  1988  	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
  1989  	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
  1990  
  1991  	// Special optimizations for very short buffers
  1992  	CMPQ inl, $192
  1993  	JBE  seal192AVX2 // 33% faster
  1994  	CMPQ inl, $320
  1995  	JBE  seal320AVX2 // 17% faster
  1996  
  1997  	// In the general case, prepare the poly key first - as a byproduct we get 64 bytes of cipher stream
  1998  	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1999  	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
  2000  	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
  2001  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
  2002  	VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
  2003  	VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
  2004  	VMOVDQA DD3, ctr3StoreAVX2
  2005  	MOVQ    $10, itr2
  2006  
  2007  sealAVX2IntroLoop:
  2008  	VMOVDQA CC3, tmpStoreAVX2
  2009  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2010  	VMOVDQA tmpStoreAVX2, CC3
  2011  	VMOVDQA CC1, tmpStoreAVX2
  2012  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2013  	VMOVDQA tmpStoreAVX2, CC1
  2014  
  2015  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  2016  	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  2017  	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  2018  	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  2019  
  2020  	VMOVDQA CC3, tmpStoreAVX2
  2021  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2022  	VMOVDQA tmpStoreAVX2, CC3
  2023  	VMOVDQA CC1, tmpStoreAVX2
  2024  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2025  	VMOVDQA tmpStoreAVX2, CC1
  2026  
  2027  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  2028  	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  2029  	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  2030  	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  2031  	DECQ     itr2
  2032  	JNE      sealAVX2IntroLoop
  2033  
  2034  	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2035  	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2036  	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2037  	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2038  
  2039  	VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
  2040  	VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
  2041  	VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
  2042  
  2043  	// Clamp and store poly key
  2044  	VPAND   ·polyClampMask<>(SB), DD0, DD0
  2045  	VMOVDQA DD0, rsStoreAVX2
  2046  
  2047  	// Hash AD
  2048  	MOVQ ad_len+80(FP), itr2
  2049  	CALL polyHashADInternal<>(SB)
  2050  
  2051  	// Can store at least 320 bytes
  2052  	VPXOR   (0*32)(inp), AA0, AA0
  2053  	VPXOR   (1*32)(inp), CC0, CC0
  2054  	VMOVDQU AA0, (0*32)(oup)
  2055  	VMOVDQU CC0, (1*32)(oup)
  2056  
  2057  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  2058  	VPXOR      (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
  2059  	VMOVDQU    AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
  2060  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  2061  	VPXOR      (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
  2062  	VMOVDQU    AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
  2063  
  2064  	MOVQ $320, itr1
  2065  	SUBQ $320, inl
  2066  	LEAQ 320(inp), inp
  2067  
  2068  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
  2069  	CMPQ       inl, $128
  2070  	JBE        sealAVX2SealHash
  2071  
  2072  	VPXOR   (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
  2073  	VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
  2074  	SUBQ    $128, inl
  2075  	LEAQ    128(inp), inp
  2076  
  2077  	MOVQ $8, itr1
  2078  	MOVQ $2, itr2
  2079  
  2080  	CMPQ inl, $128
  2081  	JBE  sealAVX2Tail128
  2082  	CMPQ inl, $256
  2083  	JBE  sealAVX2Tail256
  2084  	CMPQ inl, $384
  2085  	JBE  sealAVX2Tail384
  2086  	CMPQ inl, $512
  2087  	JBE  sealAVX2Tail512
  2088  
  2089  	// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
  2090  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2091  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2092  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2093  	VMOVDQA ctr3StoreAVX2, DD0
  2094  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2095  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  2096  
  2097  	VMOVDQA CC3, tmpStoreAVX2
  2098  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2099  	VMOVDQA tmpStoreAVX2, CC3
  2100  	VMOVDQA CC1, tmpStoreAVX2
  2101  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2102  	VMOVDQA tmpStoreAVX2, CC1
  2103  
  2104  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  2105  	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  2106  	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  2107  	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  2108  
  2109  	VMOVDQA CC3, tmpStoreAVX2
  2110  	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2111  	VMOVDQA tmpStoreAVX2, CC3
  2112  	VMOVDQA CC1, tmpStoreAVX2
  2113  	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2114  	VMOVDQA tmpStoreAVX2, CC1
  2115  
  2116  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  2117  	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  2118  	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  2119  	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  2120  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2121  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2122  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2123  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2124  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2125  	VMOVDQA  CC3, tmpStoreAVX2
  2126  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2127  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2128  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2129  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2130  	VMOVDQA  tmpStoreAVX2, CC3
  2131  
  2132  	SUBQ $16, oup                  // Adjust the pointer
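        	// (entering at sealAVX2InternalLoopStart skips the polyAdd(0*8(oup)) at the
        	// top of the loop, so oup is backed off by 16 bytes to make the entry point's
        	// polyAdd(2*8(oup)) read the 16 bytes at the old oup)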
  2133  	MOVQ $9, itr1
  2134  	JMP  sealAVX2InternalLoopStart
  2135  
  2136  sealAVX2MainLoop:
  2137  	// Load state, increment counter blocks, store the incremented counters
  2138  	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2139  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2140  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2141  	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2142  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  2143  	MOVQ    $10, itr1
  2144  
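        	// Each iteration hashes 48 bytes of previously written ciphertext (three
        	// polyAdd/polyMul pairs), 480 bytes over the ten iterations; the Poly1305
        	// multiply is split into polyMulStage*_AVX2 steps so that the scalar hash
        	// work interleaves with the vector rounds.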
  2145  sealAVX2InternalLoop:
  2146  	polyAdd(0*8(oup))
  2147  	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2148  	polyMulStage1_AVX2
  2149  	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2150  	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2151  	polyMulStage2_AVX2
  2152  	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2153  	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2154  	polyMulStage3_AVX2
  2155  	VMOVDQA CC3, tmpStoreAVX2
  2156  	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2157  	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2158  	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2159  	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2160  	VMOVDQA tmpStoreAVX2, CC3
  2161  	polyMulReduceStage
  2162  
  2163  sealAVX2InternalLoopStart:
  2164  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2165  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2166  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2167  	polyAdd(2*8(oup))
  2168  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2169  	polyMulStage1_AVX2
  2170  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2171  	VMOVDQA  CC3, tmpStoreAVX2
  2172  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2173  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2174  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2175  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2176  	VMOVDQA  tmpStoreAVX2, CC3
  2177  	polyMulStage2_AVX2
  2178  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  2179  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2180  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  2181  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2182  	polyMulStage3_AVX2
  2183  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2184  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2185  	polyMulReduceStage
  2186  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2187  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2188  	polyAdd(4*8(oup))
  2189  	LEAQ     (6*8)(oup), oup
  2190  	VMOVDQA  CC3, tmpStoreAVX2
  2191  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2192  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2193  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2194  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2195  	VMOVDQA  tmpStoreAVX2, CC3
  2196  	polyMulStage1_AVX2
  2197  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2198  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2199  	polyMulStage2_AVX2
  2200  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2201  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2202  	polyMulStage3_AVX2
  2203  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2204  	VMOVDQA  CC3, tmpStoreAVX2
  2205  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2206  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2207  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2208  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2209  	VMOVDQA  tmpStoreAVX2, CC3
  2210  	polyMulReduceStage
  2211  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  2212  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2213  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  2214  	DECQ     itr1
  2215  	JNE      sealAVX2InternalLoop
  2216  
  2217  	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2218  	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2219  	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2220  	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2221  	VMOVDQA CC3, tmpStoreAVX2
  2222  
  2223  	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
  2224  	polyAdd(0*8(oup))
  2225  	polyMulAVX2
  2226  	LEAQ       (4*8)(oup), oup
  2227  	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  2228  	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  2229  	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  2230  	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  2231  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  2232  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  2233  
  2234  	// and here
  2235  	polyAdd(-2*8(oup))
  2236  	polyMulAVX2
  2237  	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  2238  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  2239  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  2240  	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  2241  	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
  2242  	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
  2243  	LEAQ       (32*16)(inp), inp
  2244  	SUBQ       $(32*16), inl
  2245  	CMPQ       inl, $512
  2246  	JG         sealAVX2MainLoop
  2247  
  2248  	// Tail can only hash 480 bytes
  2249  	polyAdd(0*8(oup))
  2250  	polyMulAVX2
  2251  	polyAdd(2*8(oup))
  2252  	polyMulAVX2
  2253  	LEAQ 32(oup), oup
  2254  
  2255  	MOVQ $10, itr1
  2256  	MOVQ $0, itr2
  2257  	CMPQ inl, $128
  2258  	JBE  sealAVX2Tail128
  2259  	CMPQ inl, $256
  2260  	JBE  sealAVX2Tail256
  2261  	CMPQ inl, $384
  2262  	JBE  sealAVX2Tail384
  2263  	JMP  sealAVX2Tail512
  2264  
  2265  // ----------------------------------------------------------------------------
  2266  // Special optimization for buffers smaller than 193 bytes
  2267  seal192AVX2:
  2268  	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
  2269  	VMOVDQA AA0, AA1
  2270  	VMOVDQA BB0, BB1
  2271  	VMOVDQA CC0, CC1
  2272  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  2273  	VMOVDQA AA0, AA2
  2274  	VMOVDQA BB0, BB2
  2275  	VMOVDQA CC0, CC2
  2276  	VMOVDQA DD0, DD2
  2277  	VMOVDQA DD1, TT3
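        	// AA2/BB2/CC2 hold the initial A/B/C rows and DD2/TT3 the two initial
        	// counters; they are added back in after the rounds.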
  2278  	MOVQ    $10, itr2
  2279  
  2280  sealAVX2192InnerCipherLoop:
  2281  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2282  	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  2283  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2284  	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  2285  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2286  	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  2287  	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2288  	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  2289  	DECQ       itr2
  2290  	JNE        sealAVX2192InnerCipherLoop
  2291  	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
  2292  	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
  2293  	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
  2294  	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  2295  	VPERM2I128 $0x02, AA0, BB0, TT0
  2296  
  2297  	// Clamp and store poly key
  2298  	VPAND   ·polyClampMask<>(SB), TT0, TT0
  2299  	VMOVDQA TT0, rsStoreAVX2
  2300  
  2301  	// Stream for up to 192 bytes
  2302  	VPERM2I128 $0x13, AA0, BB0, AA0
  2303  	VPERM2I128 $0x13, CC0, DD0, BB0
  2304  	VPERM2I128 $0x02, AA1, BB1, CC0
  2305  	VPERM2I128 $0x02, CC1, DD1, DD0
  2306  	VPERM2I128 $0x13, AA1, BB1, AA1
  2307  	VPERM2I128 $0x13, CC1, DD1, BB1
  2308  
  2309  sealAVX2ShortSeal:
  2310  	// Hash aad
  2311  	MOVQ ad_len+80(FP), itr2
  2312  	CALL polyHashADInternal<>(SB)
  2313  	XORQ itr1, itr1
  2314  
  2315  sealAVX2SealHash:
  2316  	// itr1 holds the number of bytes encrypted but not yet hashed
  2317  	CMPQ itr1, $16
  2318  	JB   sealAVX2ShortSealLoop
  2319  	polyAdd(0(oup))
  2320  	polyMul
  2321  	SUBQ $16, itr1
  2322  	ADDQ $16, oup
  2323  	JMP  sealAVX2SealHash
  2324  
  2325  sealAVX2ShortSealLoop:
  2326  	CMPQ inl, $32
  2327  	JB   sealAVX2ShortTail32
  2328  	SUBQ $32, inl
  2329  
  2330  	// Load for encryption
  2331  	VPXOR   (inp), AA0, AA0
  2332  	VMOVDQU AA0, (oup)
  2333  	LEAQ    (1*32)(inp), inp
  2334  
  2335  	// Now we can hash
  2336  	polyAdd(0*8(oup))
  2337  	polyMulAVX2
  2338  	polyAdd(2*8(oup))
  2339  	polyMulAVX2
  2340  	LEAQ (1*32)(oup), oup
  2341  
  2342  	// Shift stream left
  2343  	VMOVDQA BB0, AA0
  2344  	VMOVDQA CC0, BB0
  2345  	VMOVDQA DD0, CC0
  2346  	VMOVDQA AA1, DD0
  2347  	VMOVDQA BB1, AA1
  2348  	VMOVDQA CC1, BB1
  2349  	VMOVDQA DD1, CC1
  2350  	VMOVDQA AA2, DD1
  2351  	VMOVDQA BB2, AA2
  2352  	JMP     sealAVX2ShortSealLoop
  2353  
  2354  sealAVX2ShortTail32:
  2355  	CMPQ    inl, $16
  2356  	VMOVDQA A0, A1
  2357  	JB      sealAVX2ShortDone
  2358  
  2359  	SUBQ $16, inl
  2360  
  2361  	// Load for encryption
  2362  	VPXOR   (inp), A0, T0
  2363  	VMOVDQU T0, (oup)
  2364  	LEAQ    (1*16)(inp), inp
  2365  
  2366  	// Hash
  2367  	polyAdd(0*8(oup))
  2368  	polyMulAVX2
  2369  	LEAQ       (1*16)(oup), oup
  2370  	VPERM2I128 $0x11, AA0, AA0, AA0
  2371  	VMOVDQA    A0, A1
  2372  
  2373  sealAVX2ShortDone:
  2374  	VZEROUPPER
  2375  	JMP sealSSETail
  2376  
  2377  // ----------------------------------------------------------------------------
  2378  // Special optimization for buffers smaller than 321 bytes
  2379  seal320AVX2:
  2380  	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
  2381  	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  2382  	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2383  	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  2384  	MOVQ    $10, itr2
  2385  
  2386  sealAVX2320InnerCipherLoop:
  2387  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2388  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2389  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2390  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2391  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2392  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2393  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2394  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2395  	DECQ     itr2
  2396  	JNE      sealAVX2320InnerCipherLoop
  2397  
  2398  	VMOVDQA ·chacha20Constants<>(SB), TT0
  2399  	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  2400  	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  2401  	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  2402  	VMOVDQA ·avx2IncMask<>(SB), TT0
  2403  	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  2404  	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  2405  	VPADDD  TT3, DD2, DD2
  2406  
  2407  	// Clamp and store poly key
  2408  	VPERM2I128 $0x02, AA0, BB0, TT0
  2409  	VPAND      ·polyClampMask<>(SB), TT0, TT0
  2410  	VMOVDQA    TT0, rsStoreAVX2
  2411  
  2412  	// Stream for up to 320 bytes
  2413  	VPERM2I128 $0x13, AA0, BB0, AA0
  2414  	VPERM2I128 $0x13, CC0, DD0, BB0
  2415  	VPERM2I128 $0x02, AA1, BB1, CC0
  2416  	VPERM2I128 $0x02, CC1, DD1, DD0
  2417  	VPERM2I128 $0x13, AA1, BB1, AA1
  2418  	VPERM2I128 $0x13, CC1, DD1, BB1
  2419  	VPERM2I128 $0x02, AA2, BB2, CC1
  2420  	VPERM2I128 $0x02, CC2, DD2, DD1
  2421  	VPERM2I128 $0x13, AA2, BB2, AA2
  2422  	VPERM2I128 $0x13, CC2, DD2, BB2
  2423  	JMP        sealAVX2ShortSeal
  2424  
  2425  // ----------------------------------------------------------------------------
  2426  // Special optimization for the last 128 bytes of plaintext
  2427  sealAVX2Tail128:
  2428  	// Need to encrypt up to 128 bytes - prepare two blocks
  2429  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2430  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2431  	VMOVDQA ·chacha20Constants<>(SB), AA0
  2432  	VMOVDQA state1StoreAVX2, BB0
  2433  	VMOVDQA state2StoreAVX2, CC0
  2434  	VMOVDQA ctr3StoreAVX2, DD0
  2435  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  2436  	VMOVDQA DD0, DD1
  2437  
  2438  sealAVX2Tail128LoopA:
  2439  	polyAdd(0(oup))
  2440  	polyMul
  2441  	LEAQ 16(oup), oup
  2442  
  2443  sealAVX2Tail128LoopB:
  2444  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2445  	polyAdd(0(oup))
  2446  	polyMul
  2447  	VPALIGNR $4, BB0, BB0, BB0
  2448  	VPALIGNR $8, CC0, CC0, CC0
  2449  	VPALIGNR $12, DD0, DD0, DD0
  2450  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2451  	polyAdd(16(oup))
  2452  	polyMul
  2453  	LEAQ     32(oup), oup
  2454  	VPALIGNR $12, BB0, BB0, BB0
  2455  	VPALIGNR $8, CC0, CC0, CC0
  2456  	VPALIGNR $4, DD0, DD0, DD0
  2457  	DECQ     itr1
  2458  	JG       sealAVX2Tail128LoopA
  2459  	DECQ     itr2
  2460  	JGE      sealAVX2Tail128LoopB
  2461  
  2462  	VPADDD ·chacha20Constants<>(SB), AA0, AA1
  2463  	VPADDD state1StoreAVX2, BB0, BB1
  2464  	VPADDD state2StoreAVX2, CC0, CC1
  2465  	VPADDD DD1, DD0, DD1
  2466  
  2467  	VPERM2I128 $0x02, AA1, BB1, AA0
  2468  	VPERM2I128 $0x02, CC1, DD1, BB0
  2469  	VPERM2I128 $0x13, AA1, BB1, CC0
  2470  	VPERM2I128 $0x13, CC1, DD1, DD0
  2471  	JMP        sealAVX2ShortSealLoop
  2472  
  2473  // ----------------------------------------------------------------------------
  2474  // Special optimization for the last 256 bytes of ciphertext
  2475  sealAVX2Tail256:
  2476  	// Need to encrypt up to 256 bytes - prepare four blocks
  2477  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2478  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2479  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
  2480  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
  2481  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
  2482  	VMOVDQA ctr3StoreAVX2, DD0
  2483  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  2484  	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  2485  	VMOVDQA DD0, TT1
  2486  	VMOVDQA DD1, TT2
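        	// Save the two counter rows in TT1/TT2; they are re-added to DD0/DD1
        	// after the rounds as part of the feed-forward.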
  2487  
  2488  sealAVX2Tail256LoopA:
  2489  	polyAdd(0(oup))
  2490  	polyMul
  2491  	LEAQ 16(oup), oup
  2492  
  2493  sealAVX2Tail256LoopB:
  2494  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2495  	polyAdd(0(oup))
  2496  	polyMul
  2497  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  2498  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2499  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  2500  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2501  	polyAdd(16(oup))
  2502  	polyMul
  2503  	LEAQ     32(oup), oup
  2504  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  2505  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2506  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  2507  	DECQ     itr1
  2508  	JG       sealAVX2Tail256LoopA
  2509  	DECQ     itr2
  2510  	JGE      sealAVX2Tail256LoopB
  2511  
  2512  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  2513  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  2514  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  2515  	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  2516  	VPERM2I128 $0x02, AA0, BB0, TT0
  2517  	VPERM2I128 $0x02, CC0, DD0, TT1
  2518  	VPERM2I128 $0x13, AA0, BB0, TT2
  2519  	VPERM2I128 $0x13, CC0, DD0, TT3
  2520  	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2521  	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
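        	// The first 128 bytes of ciphertext are written; sealAVX2SealHash will
        	// absorb them into Poly1305 (itr1 = 128 below), while the second
        	// 128-byte half of the keystream is staged in AA0..DD0 for the remainder.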
  2522  	MOVQ       $128, itr1
  2523  	LEAQ       128(inp), inp
  2524  	SUBQ       $128, inl
  2525  	VPERM2I128 $0x02, AA1, BB1, AA0
  2526  	VPERM2I128 $0x02, CC1, DD1, BB0
  2527  	VPERM2I128 $0x13, AA1, BB1, CC0
  2528  	VPERM2I128 $0x13, CC1, DD1, DD0
  2529  
  2530  	JMP sealAVX2SealHash
  2531  
  2532  // ----------------------------------------------------------------------------
  2533  // Special optimization for the last 384 bytes of ciphertext
  2534  sealAVX2Tail384:
  2535  	// Need to encrypt up to 384 bytes - prepare six blocks
  2536  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2537  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2538  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  2539  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  2540  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  2541  	VMOVDQA ctr3StoreAVX2, DD0
  2542  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2543  	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
  2544  
  2545  sealAVX2Tail384LoopA:
  2546  	polyAdd(0(oup))
  2547  	polyMul
  2548  	LEAQ 16(oup), oup
  2549  
  2550  sealAVX2Tail384LoopB:
  2551  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2552  	polyAdd(0(oup))
  2553  	polyMul
  2554  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2555  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2556  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2557  	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2558  	polyAdd(16(oup))
  2559  	polyMul
  2560  	LEAQ     32(oup), oup
  2561  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2562  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2563  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2564  	DECQ     itr1
  2565  	JG       sealAVX2Tail384LoopA
  2566  	DECQ     itr2
  2567  	JGE      sealAVX2Tail384LoopB
  2568  
  2569  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  2570  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  2571  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  2572  	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
  2573  	VPERM2I128 $0x02, AA0, BB0, TT0
  2574  	VPERM2I128 $0x02, CC0, DD0, TT1
  2575  	VPERM2I128 $0x13, AA0, BB0, TT2
  2576  	VPERM2I128 $0x13, CC0, DD0, TT3
  2577  	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2578  	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  2579  	VPERM2I128 $0x02, AA1, BB1, TT0
  2580  	VPERM2I128 $0x02, CC1, DD1, TT1
  2581  	VPERM2I128 $0x13, AA1, BB1, TT2
  2582  	VPERM2I128 $0x13, CC1, DD1, TT3
  2583  	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  2584  	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  2585  	MOVQ       $256, itr1
  2586  	LEAQ       256(inp), inp
  2587  	SUBQ       $256, inl
  2588  	VPERM2I128 $0x02, AA2, BB2, AA0
  2589  	VPERM2I128 $0x02, CC2, DD2, BB0
  2590  	VPERM2I128 $0x13, AA2, BB2, CC0
  2591  	VPERM2I128 $0x13, CC2, DD2, DD0
  2592  
  2593  	JMP sealAVX2SealHash
  2594  
  2595  // ----------------------------------------------------------------------------
  2596  // Special optimization for the last 512 bytes of ciphertext
  2597  sealAVX2Tail512:
  2598  	// Need to encrypt up to 512 bytes - prepare eight blocks
  2599  	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2600  	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2601  	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2602  	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2603  	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2604  	VMOVDQA ctr3StoreAVX2, DD0
  2605  	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2606  	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
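        	// Spill the four counter rows to the stack; they are needed again for
        	// the feed-forward additions after the rounds.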
  2607  
  2608  sealAVX2Tail512LoopA:
  2609  	polyAdd(0(oup))
  2610  	polyMul
  2611  	LEAQ 16(oup), oup
  2612  
  2613  sealAVX2Tail512LoopB:
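        	// Fully unrolled double round over all four register sets (eight blocks).
        	// Rotations by 16 and 8 use VPSHUFB with the rol16/rol8 tables; rotations
        	// by 12 and 7 are built from VPSLLD/VPSRLD/VPXOR, borrowing CC3 as
        	// scratch while its value sits in tmpStoreAVX2. Poly1305 updates on
        	// already-written ciphertext are interleaved to hide multiply latency.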
  2614  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2615  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2616  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2617  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2618  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2619  	VMOVDQA  CC3, tmpStoreAVX2
  2620  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2621  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2622  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2623  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2624  	VMOVDQA  tmpStoreAVX2, CC3
  2625  	polyAdd(0*8(oup))
  2626  	polyMulAVX2
  2627  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2628  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2629  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2630  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2631  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2632  	VMOVDQA  CC3, tmpStoreAVX2
  2633  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2634  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2635  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2636  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2637  	VMOVDQA  tmpStoreAVX2, CC3
  2638  	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  2639  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2640  	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  2641  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2642  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2643  	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2644  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2645  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2646  	polyAdd(2*8(oup))
  2647  	polyMulAVX2
  2648  	LEAQ     (4*8)(oup), oup
  2649  	VMOVDQA  CC3, tmpStoreAVX2
  2650  	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2651  	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2652  	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2653  	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2654  	VMOVDQA  tmpStoreAVX2, CC3
  2655  	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2656  	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2657  	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2658  	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2659  	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2660  	VMOVDQA  CC3, tmpStoreAVX2
  2661  	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2662  	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2663  	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2664  	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2665  	VMOVDQA  tmpStoreAVX2, CC3
  2666  	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  2667  	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2668  	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  2669  
  2670  	DECQ itr1
  2671  	JG   sealAVX2Tail512LoopA
  2672  	DECQ itr2
  2673  	JGE  sealAVX2Tail512LoopB
  2674  
  2675  	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2676  	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2677  	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2678  	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
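        	// CC3 is parked in tmpStoreAVX2 so it can be reused as scratch while the
        	// first 128 bytes are regrouped with VPERM2I128, XORed with the input and
        	// stored; the spilled value is read back from the stack for the fourth
        	// register set further down.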
  2679  	VMOVDQA    CC3, tmpStoreAVX2
  2680  	VPERM2I128 $0x02, AA0, BB0, CC3
  2681  	VPXOR      (0*32)(inp), CC3, CC3
  2682  	VMOVDQU    CC3, (0*32)(oup)
  2683  	VPERM2I128 $0x02, CC0, DD0, CC3
  2684  	VPXOR      (1*32)(inp), CC3, CC3
  2685  	VMOVDQU    CC3, (1*32)(oup)
  2686  	VPERM2I128 $0x13, AA0, BB0, CC3
  2687  	VPXOR      (2*32)(inp), CC3, CC3
  2688  	VMOVDQU    CC3, (2*32)(oup)
  2689  	VPERM2I128 $0x13, CC0, DD0, CC3
  2690  	VPXOR      (3*32)(inp), CC3, CC3
  2691  	VMOVDQU    CC3, (3*32)(oup)
  2692  
  2693  	VPERM2I128 $0x02, AA1, BB1, AA0
  2694  	VPERM2I128 $0x02, CC1, DD1, BB0
  2695  	VPERM2I128 $0x13, AA1, BB1, CC0
  2696  	VPERM2I128 $0x13, CC1, DD1, DD0
  2697  	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  2698  	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  2699  
  2700  	VPERM2I128 $0x02, AA2, BB2, AA0
  2701  	VPERM2I128 $0x02, CC2, DD2, BB0
  2702  	VPERM2I128 $0x13, AA2, BB2, CC0
  2703  	VPERM2I128 $0x13, CC2, DD2, DD0
  2704  	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  2705  	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  2706  
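        	// 384 bytes are written; stage the last 128 bytes of keystream (fourth
        	// register set, with CC3 taken from tmpStoreAVX2) in AA0..DD0 and let
        	// sealAVX2SealHash absorb the 384 written bytes (itr1) before sealing
        	// the remainder.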
  2707  	MOVQ       $384, itr1
  2708  	LEAQ       384(inp), inp
  2709  	SUBQ       $384, inl
  2710  	VPERM2I128 $0x02, AA3, BB3, AA0
  2711  	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
  2712  	VPERM2I128 $0x13, AA3, BB3, CC0
  2713  	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  2714  
  2715  	JMP sealAVX2SealHash
  2716  
