Text file src/crypto/aes/gcm_amd64.s

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
     7  // This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI
     8  // The implementation uses some optimization as described in:
     9  // [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
    10  //     Instruction and its Usage for Computing the GCM Mode rev. 2.02
    11  // [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
    12  //     Hardware
    13  
    14  #include "textflag.h"
    15  
        // B0-B7 alias X0-X7 and hold the data blocks being encrypted/hashed.
    16  #define B0 X0
    17  #define B1 X1
    18  #define B2 X2
    19  #define B3 X3
    20  #define B4 X4
    21  #define B5 X5
    22  #define B6 X6
    23  #define B7 X7
    24  
        // Accumulators for the three Karatsuba partial products of the hash:
        // ACC0 collects the PCLMULQDQ $0x00 (low x low) products, ACC1 the
        // PCLMULQDQ $0x11 (high x high) products and ACCM the middle terms.
    25  #define ACC0 X8
    26  #define ACC1 X9
    27  #define ACCM X10
    28  
        // T0-T2 are scratch registers. POLY and BSWAP are loaded by every routine
        // below from gcmPoly<> and bswapMask<> respectively.
    29  #define T0 X11
    30  #define T1 X12
    31  #define T2 X13
    32  #define POLY X14
    33  #define BSWAP X15
    34  
        // PSHUFB control mask: byte i of the mask is 15-i, so shuffling with it
        // reverses the order of all 16 bytes of a register.
    35  DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
    36  DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
    37  
        // Constant used by the reduction steps (see reduceRound and the H*2 step
        // in gcmAesInit).
    38  DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
    39  DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
    40  
        // Fifteen 16-byte masks. The entry at offset 16*(k-1) has its k lowest
        // bytes set to 0xff and the rest zero, for k = 1..15. The tail code of
        // gcmAesEnc/gcmAesDec indexes it as -16(base)(len*16) to keep only the
        // first len bytes of a partial block.
    41  DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
    42  DATA andMask<>+0x08(SB)/8, $0x0000000000000000
    43  DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
    44  DATA andMask<>+0x18(SB)/8, $0x0000000000000000
    45  DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
    46  DATA andMask<>+0x28(SB)/8, $0x0000000000000000
    47  DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
    48  DATA andMask<>+0x38(SB)/8, $0x0000000000000000
    49  DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
    50  DATA andMask<>+0x48(SB)/8, $0x0000000000000000
    51  DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
    52  DATA andMask<>+0x58(SB)/8, $0x0000000000000000
    53  DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
    54  DATA andMask<>+0x68(SB)/8, $0x0000000000000000
    55  DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
    56  DATA andMask<>+0x78(SB)/8, $0x0000000000000000
    57  DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
    58  DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
    59  DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
    60  DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
    61  DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
    62  DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
    63  DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
    64  DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
    65  DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
    66  DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
    67  DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
    68  DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
    69  DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
    70  DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
    71  
        // All three tables are read-only and pointer-free; sizes are in bytes
        // (andMask: 15 entries * 16 bytes = 240).
    72  GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
    73  GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
    74  GLOBL andMask<>(SB), (NOPTR+RODATA), $240
    75  
    76  // func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
        //
        // gcmAesFinish finalizes the tag stored at T, in place. It builds a block
        // from pLen*8 (low quadword) and dLen*8 (high quadword), XORs it into the
        // running hash read from T, multiplies the result by the table entry at
        // (16*14)(productTable), reduces it, byte-reverses it, XORs in tagMask and
        // writes the 16-byte result back to T.
    77  TEXT ·gcmAesFinish(SB),NOSPLIT,$0
    78  #define pTbl DI
    79  #define tMsk SI
    80  #define tPtr DX
    81  #define plen AX
    82  #define dlen CX
    83  
    84  	MOVQ productTable+0(FP), pTbl
    85  	MOVQ tagMask+8(FP), tMsk
    86  	MOVQ T+16(FP), tPtr
    87  	MOVQ pLen+24(FP), plen
    88  	MOVQ dLen+32(FP), dlen
    89  
        	// ACC0 = running hash, T2 = tag mask (kept until the final XOR).
    90  	MOVOU (tPtr), ACC0
    91  	MOVOU (tMsk), T2
    92  
    93  	MOVOU bswapMask<>(SB), BSWAP
    94  	MOVOU gcmPoly<>(SB), POLY
    95  
        	// Multiply both lengths by 8 (byte counts -> bit counts).
    96  	SHLQ $3, plen
    97  	SHLQ $3, dlen
    98  
        	// B0 = { low: plen*8, high: dlen*8 }, then fold in the running hash.
    99  	MOVQ plen, B0
   100  	PINSRQ $1, dlen, B0
   101  
   102  	PXOR ACC0, B0
   103  
        	// Karatsuba multiply of B0 by the table entry at 16*14 (with its
        	// precomputed half-XOR at 16*15): low, high and middle products.
   104  	MOVOU (16*14)(pTbl), ACC0
   105  	MOVOU (16*15)(pTbl), ACCM
   106  	MOVOU ACC0, ACC1
   107  
   108  	PCLMULQDQ $0x00, B0, ACC0
   109  	PCLMULQDQ $0x11, B0, ACC1
        	// PSHUFD $78 swaps the two 64-bit halves, so T0 = hi(B0) ^ lo(B0) in both halves.
   110  	PSHUFD $78, B0, T0
   111  	PXOR B0, T0
   112  	PCLMULQDQ $0x00, T0, ACCM
   113  
        	// Recover the true middle term and split it across ACC1:ACC0.
   114  	PXOR ACC0, ACCM
   115  	PXOR ACC1, ACCM
   116  	MOVOU ACCM, T0
   117  	PSRLDQ $8, ACCM
   118  	PSLLDQ $8, T0
   119  	PXOR ACCM, ACC1
   120  	PXOR T0, ACC0
   121  
        	// First reduction round of ACC0 using POLY.
   122  	MOVOU POLY, T0
   123  	PCLMULQDQ $0x01, ACC0, T0
   124  	PSHUFD $78, ACC0, ACC0
   125  	PXOR T0, ACC0
   126  
        	// Second reduction round.
   127  	MOVOU POLY, T0
   128  	PCLMULQDQ $0x01, ACC0, T0
   129  	PSHUFD $78, ACC0, ACC0
   130  	PXOR T0, ACC0
   131  
        	// Combine with the high half to obtain the reduced 128-bit product.
   132  	PXOR ACC1, ACC0
   133  
        	// Reverse the byte order, apply the tag mask and store the tag.
   134  	PSHUFB BSWAP, ACC0
   135  	PXOR T2, ACC0
   136  	MOVOU ACC0, (tPtr)
   137  
   138  	RET
   139  #undef pTbl
   140  #undef tMsk
   141  #undef tPtr
   142  #undef plen
   143  #undef dlen
   144  
   145  // func gcmAesInit(productTable *[256]byte, ks []uint32)
        //
        // gcmAesInit fills productTable with eight pairs of 16-byte entries.
        // Each pair is a value V followed by hi(V)^lo(V) (replicated in both
        // halves) for the Karatsuba middle product. The pair at 16*14/16*15 holds
        // the doubled hash key; each earlier pair (16*12/16*13 down to 16*0/16*1)
        // holds the previous value multiplied once more by that key.
   146  TEXT ·gcmAesInit(SB),NOSPLIT,$0
   147  #define dst DI
   148  #define KS SI
   149  #define NR DX
   150  
   151  	MOVQ productTable+0(FP), dst
   152  	MOVQ ks_base+8(FP), KS
   153  	MOVQ ks_len+16(FP), NR
   154  
        	// NR = len(ks)/4 - 1: ks is counted in uint32 words, four per 16-byte
        	// round key, so this is the number of round keys minus one.
   155  	SHRQ $2, NR
   156  	DECQ NR
   157  
   158  	MOVOU bswapMask<>(SB), BSWAP
   159  	MOVOU gcmPoly<>(SB), POLY
   160  
   161  	// Encrypt block 0, with the AES key to generate the hash key H
        	// (loading round key 0 directly equals XORing it into an all-zero block)
   162  	MOVOU (16*0)(KS), B0
   163  	MOVOU (16*1)(KS), T0
   164  	AESENC T0, B0
   165  	MOVOU (16*2)(KS), T0
   166  	AESENC T0, B0
   167  	MOVOU (16*3)(KS), T0
   168  	AESENC T0, B0
   169  	MOVOU (16*4)(KS), T0
   170  	AESENC T0, B0
   171  	MOVOU (16*5)(KS), T0
   172  	AESENC T0, B0
   173  	MOVOU (16*6)(KS), T0
   174  	AESENC T0, B0
   175  	MOVOU (16*7)(KS), T0
   176  	AESENC T0, B0
   177  	MOVOU (16*8)(KS), T0
   178  	AESENC T0, B0
   179  	MOVOU (16*9)(KS), T0
   180  	AESENC T0, B0
   181  	MOVOU (16*10)(KS), T0
        	// NR < 12: round key 10 is the last one. The flags set by this CMPQ
        	// are also consumed by the JE below; the MOVOU/AESENC instructions in
        	// between do not modify flags.
   182  	CMPQ NR, $12
   183  	JB initEncLast
   184  	AESENC T0, B0
   185  	MOVOU (16*11)(KS), T0
   186  	AESENC T0, B0
   187  	MOVOU (16*12)(KS), T0
        	// NR == 12: round key 12 is the last one; otherwise continue to 14.
   188  	JE initEncLast
   189  	AESENC T0, B0
   190  	MOVOU (16*13)(KS), T0
   191  	AESENC T0, B0
   192  	MOVOU (16*14)(KS), T0
   193  initEncLast:
   194  	AESENCLAST T0, B0
   195  
   196  	PSHUFB BSWAP, B0
   197  	// H * 2
        	// T0 = POLY if the top bit of B0 is set, else 0 (sign-extended top dword
        	// used as a mask). T1 carries each dword's top bit into the next dword so
        	// that the per-dword PSLLL acts as a 128-bit shift left by one.
   198  	PSHUFD $0xff, B0, T0
   199  	MOVOU B0, T1
   200  	PSRAL $31, T0
   201  	PAND POLY, T0
   202  	PSRLL $31, T1
   203  	PSLLDQ $4, T1
   204  	PSLLL $1, B0
   205  	PXOR T0, B0
   206  	PXOR T1, B0
   207  	// Karatsuba pre-computations
        	// B0 = doubled key, B1 = hi(B0)^lo(B0); both stay constant from here on.
   208  	MOVOU B0, (16*14)(dst)
   209  	PSHUFD $78, B0, B1
   210  	PXOR B0, B1
   211  	MOVOU B1, (16*15)(dst)
   212  
        	// B2/B3 hold the most recently computed power and its half-XOR.
   213  	MOVOU B0, B2
   214  	MOVOU B1, B3
   215  	// Now prepare powers of H and pre-computations for them
        	// Seven iterations; dst moves down by 32 bytes each time, so the stores
        	// at 16*12/16*13 land on successively lower table pairs.
   216  	MOVQ $7, AX
   217  
   218  initLoop:
        		// Karatsuba multiply: (T1:T0, middle T2) = B2 * B0.
   219  		MOVOU B2, T0
   220  		MOVOU B2, T1
   221  		MOVOU B3, T2
   222  		PCLMULQDQ $0x00, B0, T0
   223  		PCLMULQDQ $0x11, B0, T1
   224  		PCLMULQDQ $0x00, B1, T2
   225  
   226  		PXOR T0, T2
   227  		PXOR T1, T2
   228  		MOVOU T2, B4
   229  		PSLLDQ $8, B4
   230  		PSRLDQ $8, T2
   231  		PXOR B4, T0
   232  		PXOR T2, T1
   233  
        		// Two reduction rounds using POLY; the reduced product ends up in B2.
   234  		MOVOU POLY, B2
   235  		PCLMULQDQ $0x01, T0, B2
   236  		PSHUFD $78, T0, T0
   237  		PXOR B2, T0
   238  		MOVOU POLY, B2
   239  		PCLMULQDQ $0x01, T0, B2
   240  		PSHUFD $78, T0, T0
   241  		PXOR T0, B2
   242  		PXOR T1, B2
   243  
        		// Store the new power and its half-XOR.
   244  		MOVOU B2, (16*12)(dst)
   245  		PSHUFD $78, B2, B3
   246  		PXOR B2, B3
   247  		MOVOU B3, (16*13)(dst)
   248  
        		// LEAQ does not touch flags, so JNE tests the result of DECQ.
   249  		DECQ AX
   250  		LEAQ (-16*2)(dst), dst
   251  	JNE initLoop
   252  
   253  	RET
   254  #undef NR
   255  #undef KS
   256  #undef dst
   257  
   258  // func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
        //
        // gcmAesData hashes data with the multipliers in productTable and stores
        // the 16-byte result at T. The hash starts from zero (T is only written,
        // never read). Input is consumed 128 bytes at a time while possible, then
        // 16 bytes at a time; a final partial block is zero-padded. A length of
        // exactly 13 bytes takes a dedicated short path.
   259  TEXT ·gcmAesData(SB),NOSPLIT,$0
   260  #define pTbl DI
   261  #define aut SI
   262  #define tPtr CX
   263  #define autLen DX
   264  
        // reduceRound(a): one reduction round of register a using POLY; clobbers T0.
        // mulRoundAAD(X, i): multiply block X by table pair i (offsets 16*(2i) and
        // 16*(2i+1)) and XOR the three partial products into ACC0/ACC1/ACCM.
        // Clobbers T1, T2 and X itself. Neither macro is #undef'd in this function.
   265  #define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
   266  #define mulRoundAAD(X ,i) \
   267  	MOVOU (16*(i*2))(pTbl), T1;\
   268  	MOVOU T1, T2;\
   269  	PCLMULQDQ $0x00, X, T1;\
   270  	PXOR T1, ACC0;\
   271  	PCLMULQDQ $0x11, X, T2;\
   272  	PXOR T2, ACC1;\
   273  	PSHUFD $78, X, T1;\
   274  	PXOR T1, X;\
   275  	MOVOU (16*(i*2+1))(pTbl), T1;\
   276  	PCLMULQDQ $0x00, X, T1;\
   277  	PXOR T1, ACCM
   278  
        	// data is a slice (base, len, cap = 24 bytes), hence T at offset 32.
   279  	MOVQ productTable+0(FP), pTbl
   280  	MOVQ data_base+8(FP), aut
   281  	MOVQ data_len+16(FP), autLen
   282  	MOVQ T+32(FP), tPtr
   283  
        	// ACC0 is the running hash; start from zero.
   284  	PXOR ACC0, ACC0
   285  	MOVOU bswapMask<>(SB), BSWAP
   286  	MOVOU gcmPoly<>(SB), POLY
   287  
        	// Empty input: store the zero hash and return.
   288  	TESTQ autLen, autLen
   289  	JEQ dataBail
   290  
   291  	CMPQ autLen, $13	// optimize the TLS case
   292  	JE dataTLS
   293  	CMPQ autLen, $128
   294  	JB startSinglesLoop
   295  	JMP dataOctaLoop
   296  
   297  dataTLS:
        	// Load exactly 13 bytes (8 + 4 + 1) into a zeroed B0, set the remaining
        	// length to 0 and reuse the single-block multiply at dataMul.
   298  	MOVOU (16*14)(pTbl), T1
   299  	MOVOU (16*15)(pTbl), T2
   300  	PXOR B0, B0
   301  	MOVQ (aut), B0
   302  	PINSRD $2, 8(aut), B0
   303  	PINSRB $12, 12(aut), B0
   304  	XORQ autLen, autLen
   305  	JMP dataMul
   306  
   307  dataOctaLoop:
        		// Process 8 blocks (128 bytes) per iteration with one shared reduction.
   308  		CMPQ autLen, $128
   309  		JB startSinglesLoop
   310  		SUBQ $128, autLen
   311  
   312  		MOVOU (16*0)(aut), X0
   313  		MOVOU (16*1)(aut), X1
   314  		MOVOU (16*2)(aut), X2
   315  		MOVOU (16*3)(aut), X3
   316  		MOVOU (16*4)(aut), X4
   317  		MOVOU (16*5)(aut), X5
   318  		MOVOU (16*6)(aut), X6
   319  		MOVOU (16*7)(aut), X7
   320  		LEAQ (16*8)(aut), aut
   321  		PSHUFB BSWAP, X0
   322  		PSHUFB BSWAP, X1
   323  		PSHUFB BSWAP, X2
   324  		PSHUFB BSWAP, X3
   325  		PSHUFB BSWAP, X4
   326  		PSHUFB BSWAP, X5
   327  		PSHUFB BSWAP, X6
   328  		PSHUFB BSWAP, X7
        		// The running hash is folded into the first block only.
   329  		PXOR ACC0, X0
   330  
        		// First block times table pair 0 initializes the accumulators;
        		// blocks 1..7 are accumulated with table pairs 1..7.
   331  		MOVOU (16*0)(pTbl), ACC0
   332  		MOVOU (16*1)(pTbl), ACCM
   333  		MOVOU ACC0, ACC1
   334  		PSHUFD $78, X0, T1
   335  		PXOR X0, T1
   336  		PCLMULQDQ $0x00, X0, ACC0
   337  		PCLMULQDQ $0x11, X0, ACC1
   338  		PCLMULQDQ $0x00, T1, ACCM
   339  
   340  		mulRoundAAD(X1, 1)
   341  		mulRoundAAD(X2, 2)
   342  		mulRoundAAD(X3, 3)
   343  		mulRoundAAD(X4, 4)
   344  		mulRoundAAD(X5, 5)
   345  		mulRoundAAD(X6, 6)
   346  		mulRoundAAD(X7, 7)
   347  
        		// Finish Karatsuba (fix up the middle term, split it over ACC1:ACC0)
        		// and reduce; the new running hash is left in ACC0.
   348  		PXOR ACC0, ACCM
   349  		PXOR ACC1, ACCM
   350  		MOVOU ACCM, T0
   351  		PSRLDQ $8, ACCM
   352  		PSLLDQ $8, T0
   353  		PXOR ACCM, ACC1
   354  		PXOR T0, ACC0
   355  		reduceRound(ACC0)
   356  		reduceRound(ACC0)
   357  		PXOR ACC1, ACC0
   358  	JMP dataOctaLoop
   359  
   360  startSinglesLoop:
        	// T1/T2 keep table pair 7 (offsets 16*14, 16*15) for the per-block loop.
   361  	MOVOU (16*14)(pTbl), T1
   362  	MOVOU (16*15)(pTbl), T2
   363  
   364  dataSinglesLoop:
   365  
   366  		CMPQ autLen, $16
   367  		JB dataEnd
   368  		SUBQ $16, autLen
   369  
   370  		MOVOU (aut), B0
        // dataMul is also entered from dataTLS and from the partial-block path with
        // B0 already loaded, T1/T2 set and autLen == 0.
   371  dataMul:
   372  		PSHUFB BSWAP, B0
   373  		PXOR ACC0, B0
   374  
   375  		MOVOU T1, ACC0
   376  		MOVOU T2, ACCM
   377  		MOVOU T1, ACC1
   378  
   379  		PSHUFD $78, B0, T0
   380  		PXOR B0, T0
   381  		PCLMULQDQ $0x00, B0, ACC0
   382  		PCLMULQDQ $0x11, B0, ACC1
   383  		PCLMULQDQ $0x00, T0, ACCM
   384  
   385  		PXOR ACC0, ACCM
   386  		PXOR ACC1, ACCM
   387  		MOVOU ACCM, T0
   388  		PSRLDQ $8, ACCM
   389  		PSLLDQ $8, T0
   390  		PXOR ACCM, ACC1
   391  		PXOR T0, ACC0
   392  
   393  		MOVOU POLY, T0
   394  		PCLMULQDQ $0x01, ACC0, T0
   395  		PSHUFD $78, ACC0, ACC0
   396  		PXOR T0, ACC0
   397  
   398  		MOVOU POLY, T0
   399  		PCLMULQDQ $0x01, ACC0, T0
   400  		PSHUFD $78, ACC0, ACC0
   401  		PXOR T0, ACC0
   402  		PXOR ACC1, ACC0
   403  
   404  		LEAQ 16(aut), aut
   405  
   406  	JMP dataSinglesLoop
   407  
   408  dataEnd:
   409  
        	// Fewer than 16 bytes remain; nothing left means we are done.
   410  	TESTQ autLen, autLen
   411  	JEQ dataBail
   412  
        	// Build a zero-padded block: start at the last remaining byte and walk
        	// backwards, shifting B0 left one byte each time, so the first remaining
        	// byte ends up in byte 0 of B0.
   413  	PXOR B0, B0
   414  	LEAQ -1(aut)(autLen*1), aut
   415  
   416  dataLoadLoop:
   417  
   418  		PSLLDQ $1, B0
   419  		PINSRB $0, (aut), B0
   420  
   421  		LEAQ -1(aut), aut
   422  		DECQ autLen
   423  		JNE dataLoadLoop
   424  
        	// autLen is now 0, so after this multiply the loop exits via dataEnd.
   425  	JMP dataMul
   426  
   427  dataBail:
   428  	MOVOU ACC0, (tPtr)
   429  	RET
   430  #undef pTbl
   431  #undef aut
   432  #undef tPtr
   433  #undef autLen
   434  
   435  // func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
        //
        // gcmAesEnc encrypts src into dst in counter mode and folds the produced
        // ciphertext into the hash state at T (read at entry, written at exit).
        // The counter block is read from ctr; this routine never writes it back.
        //
        // Stack frame (256 bytes of locals, 96 bytes of arguments):
        //   (0..7)*16(SP)    byte-reversed ciphertext blocks waiting to be hashed
        //   (8..15)*16(SP)   eight counter blocks, already XORed with round key 0
        //
        // Data is processed 128 bytes at a time (hashing the previous eight
        // ciphertext blocks while encrypting the next eight), then 16 bytes at a
        // time, then a final partial block.
   436  TEXT ·gcmAesEnc(SB),0,$256-96
   437  #define pTbl DI
   438  #define ctx DX
   439  #define ctrPtr CX
   440  #define ptx SI
   441  #define ks AX
   442  #define tPtr R8
   443  #define ptxLen R9
   444  #define aluCTR R10
   445  #define aluTMP R11
   446  #define aluK R12
   447  #define NR R13
   448  
        // increment(i): bump the 32-bit counter in aluCTR, XOR it with aluK (the
        //   byte-swapped last word of round key 0), swap back and store it as the
        //   last dword of counter slot i, keeping the slot equal to counter ^ rk0.
        // aesRnd(k) / aesRndLast(k): one AES round / final round on B0-B7 with the
        //   round key in register k.
        // aesRound(i): load round key i into T0 and apply one round to B0-B7.
        // combinedRound(i): AES round i on B0-B7 interleaved with multiplying the
        //   saved block at (16*i)(SP) by table pair i into ACC0/ACC1/ACCM.
        // mulRound(i): only the multiply-accumulate part, without AES.
   449  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
   450  #define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
   451  #define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
   452  #define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
   453  #define combinedRound(i) \
   454  	MOVOU (16*i)(ks), T0;\
   455  	AESENC T0, B0;\
   456  	AESENC T0, B1;\
   457  	AESENC T0, B2;\
   458  	AESENC T0, B3;\
   459  	 MOVOU (16*(i*2))(pTbl), T1;\
   460  	 MOVOU T1, T2;\
   461  	AESENC T0, B4;\
   462  	AESENC T0, B5;\
   463  	AESENC T0, B6;\
   464  	AESENC T0, B7;\
   465  	 MOVOU (16*i)(SP), T0;\
   466  	 PCLMULQDQ $0x00, T0, T1;\
   467  	 PXOR T1, ACC0;\
   468  	 PSHUFD $78, T0, T1;\
   469  	 PCLMULQDQ $0x11, T0, T2;\
   470  	 PXOR T1, T0;\
   471  	 PXOR T2, ACC1;\
   472  	 MOVOU (16*(i*2+1))(pTbl), T2;\
   473  	 PCLMULQDQ $0x00, T2, T0;\
   474  	 PXOR T0, ACCM
   475  #define mulRound(i) \
   476  	MOVOU (16*i)(SP), T0;\
   477  	MOVOU (16*(i*2))(pTbl), T1;\
   478  	MOVOU T1, T2;\
   479  	PCLMULQDQ $0x00, T0, T1;\
   480  	PXOR T1, ACC0;\
   481  	PCLMULQDQ $0x11, T0, T2;\
   482  	PXOR T2, ACC1;\
   483  	PSHUFD $78, T0, T1;\
   484  	PXOR T1, T0;\
   485  	MOVOU (16*(i*2+1))(pTbl), T1;\
   486  	PCLMULQDQ $0x00, T0, T1;\
   487  	PXOR T1, ACCM
   488  
        	// Only the base pointer of dst is loaded; the length comes from src.
   489  	MOVQ productTable+0(FP), pTbl
   490  	MOVQ dst+8(FP), ctx
   491  	MOVQ src_base+32(FP), ptx
   492  	MOVQ src_len+40(FP), ptxLen
   493  	MOVQ ctr+56(FP), ctrPtr
   494  	MOVQ T+64(FP), tPtr
   495  	MOVQ ks_base+72(FP), ks
   496  	MOVQ ks_len+80(FP), NR
   497  
        	// NR = len(ks)/4 - 1 (number of 16-byte round keys minus one).
   498  	SHRQ $2, NR
   499  	DECQ NR
   500  
   501  	MOVOU bswapMask<>(SB), BSWAP
   502  	MOVOU gcmPoly<>(SB), POLY
   503  
        	// ACC0 = running hash. aluCTR = last counter word and aluK = last word
        	// of round key 0, both byte-swapped so ADDL can increment the counter.
   504  	MOVOU (tPtr), ACC0
   505  	PXOR ACC1, ACC1
   506  	PXOR ACCM, ACCM
   507  	MOVOU (ctrPtr), B0
   508  	MOVL (3*4)(ctrPtr), aluCTR
   509  	MOVOU (ks), T0
   510  	MOVL (3*4)(ks), aluK
   511  	BSWAPL aluCTR
   512  	BSWAPL aluK
   513  
        	// T0 = counter ^ round key 0 (AES round 0 folded in). Save it as counter
        	// slot 0, then patch the slot's last dword with the next counter value.
   514  	PXOR B0, T0
   515  	MOVOU T0, (8*16 + 0*16)(SP)
   516  	increment(0)
   517  
   518  	CMPQ ptxLen, $128
   519  	JB gcmAesEncSingles
   520  	SUBQ $128, ptxLen
   521  
   522  	// We have at least 8 blocks to encrypt, prepare the rest of the counters
   523  	MOVOU T0, (8*16 + 1*16)(SP)
   524  	increment(1)
   525  	MOVOU T0, (8*16 + 2*16)(SP)
   526  	increment(2)
   527  	MOVOU T0, (8*16 + 3*16)(SP)
   528  	increment(3)
   529  	MOVOU T0, (8*16 + 4*16)(SP)
   530  	increment(4)
   531  	MOVOU T0, (8*16 + 5*16)(SP)
   532  	increment(5)
   533  	MOVOU T0, (8*16 + 6*16)(SP)
   534  	increment(6)
   535  	MOVOU T0, (8*16 + 7*16)(SP)
   536  	increment(7)
   537  
   538  	MOVOU (8*16 + 0*16)(SP), B0
   539  	MOVOU (8*16 + 1*16)(SP), B1
   540  	MOVOU (8*16 + 2*16)(SP), B2
   541  	MOVOU (8*16 + 3*16)(SP), B3
   542  	MOVOU (8*16 + 4*16)(SP), B4
   543  	MOVOU (8*16 + 5*16)(SP), B5
   544  	MOVOU (8*16 + 6*16)(SP), B6
   545  	MOVOU (8*16 + 7*16)(SP), B7
   546  
        	// Encrypt the first eight counter blocks (no ciphertext to hash yet),
        	// refreshing each counter slot for the next batch between rounds.
   547  	aesRound(1)
   548  	increment(0)
   549  	aesRound(2)
   550  	increment(1)
   551  	aesRound(3)
   552  	increment(2)
   553  	aesRound(4)
   554  	increment(3)
   555  	aesRound(5)
   556  	increment(4)
   557  	aesRound(6)
   558  	increment(5)
   559  	aesRound(7)
   560  	increment(6)
   561  	aesRound(8)
   562  	increment(7)
   563  	aesRound(9)
   564  	MOVOU (16*10)(ks), T0
        	// The flags from this CMPQ also feed the JE below; the SSE/AES
        	// instructions in between do not modify flags.
   565  	CMPQ NR, $12
   566  	JB encLast1
   567  	aesRnd(T0)
   568  	aesRound(11)
   569  	MOVOU (16*12)(ks), T0
   570  	JE encLast1
   571  	aesRnd(T0)
   572  	aesRound(13)
   573  	MOVOU (16*14)(ks), T0
   574  encLast1:
   575  	aesRndLast(T0)
   576  
        	// XOR the keystream with the plaintext.
   577  	MOVOU (16*0)(ptx), T0
   578  	PXOR T0, B0
   579  	MOVOU (16*1)(ptx), T0
   580  	PXOR T0, B1
   581  	MOVOU (16*2)(ptx), T0
   582  	PXOR T0, B2
   583  	MOVOU (16*3)(ptx), T0
   584  	PXOR T0, B3
   585  	MOVOU (16*4)(ptx), T0
   586  	PXOR T0, B4
   587  	MOVOU (16*5)(ptx), T0
   588  	PXOR T0, B5
   589  	MOVOU (16*6)(ptx), T0
   590  	PXOR T0, B6
   591  	MOVOU (16*7)(ptx), T0
   592  	PXOR T0, B7
   593  
        	// Write the ciphertext, then byte-reverse each block for hashing. The
        	// running hash is folded into the first block only.
   594  	MOVOU B0, (16*0)(ctx)
   595  	PSHUFB BSWAP, B0
   596  	PXOR ACC0, B0
   597  	MOVOU B1, (16*1)(ctx)
   598  	PSHUFB BSWAP, B1
   599  	MOVOU B2, (16*2)(ctx)
   600  	PSHUFB BSWAP, B2
   601  	MOVOU B3, (16*3)(ctx)
   602  	PSHUFB BSWAP, B3
   603  	MOVOU B4, (16*4)(ctx)
   604  	PSHUFB BSWAP, B4
   605  	MOVOU B5, (16*5)(ctx)
   606  	PSHUFB BSWAP, B5
   607  	MOVOU B6, (16*6)(ctx)
   608  	PSHUFB BSWAP, B6
   609  	MOVOU B7, (16*7)(ctx)
   610  	PSHUFB BSWAP, B7
   611  
        	// Park the blocks on the stack; they are hashed during the next batch
        	// (or at gcmAesEncOctetsEnd).
   612  	MOVOU B0, (16*0)(SP)
   613  	MOVOU B1, (16*1)(SP)
   614  	MOVOU B2, (16*2)(SP)
   615  	MOVOU B3, (16*3)(SP)
   616  	MOVOU B4, (16*4)(SP)
   617  	MOVOU B5, (16*5)(SP)
   618  	MOVOU B6, (16*6)(SP)
   619  	MOVOU B7, (16*7)(SP)
   620  
   621  	LEAQ 128(ptx), ptx
   622  	LEAQ 128(ctx), ctx
   623  
   624  gcmAesEncOctetsLoop:
   625  
   626  		CMPQ ptxLen, $128
   627  		JB gcmAesEncOctetsEnd
   628  		SUBQ $128, ptxLen
   629  
   630  		MOVOU (8*16 + 0*16)(SP), B0
   631  		MOVOU (8*16 + 1*16)(SP), B1
   632  		MOVOU (8*16 + 2*16)(SP), B2
   633  		MOVOU (8*16 + 3*16)(SP), B3
   634  		MOVOU (8*16 + 4*16)(SP), B4
   635  		MOVOU (8*16 + 5*16)(SP), B5
   636  		MOVOU (8*16 + 6*16)(SP), B6
   637  		MOVOU (8*16 + 7*16)(SP), B7
   638  
        		// Start hashing the previous batch: saved block 0 times table pair 0
        		// initializes ACC0/ACC1/ACCM.
   639  		MOVOU (16*0)(SP), T0
   640  		PSHUFD $78, T0, T1
   641  		PXOR T0, T1
   642  
   643  		MOVOU (16*0)(pTbl), ACC0
   644  		MOVOU (16*1)(pTbl), ACCM
   645  		MOVOU ACC0, ACC1
   646  
   647  		PCLMULQDQ $0x00, T1, ACCM
   648  		PCLMULQDQ $0x00, T0, ACC0
   649  		PCLMULQDQ $0x11, T0, ACC1
   650  
        		// AES rounds 1..7 of the new batch interleaved with hashing saved
        		// blocks 1..7, plus counter preparation for the batch after this one.
   651  		combinedRound(1)
   652  		increment(0)
   653  		combinedRound(2)
   654  		increment(1)
   655  		combinedRound(3)
   656  		increment(2)
   657  		combinedRound(4)
   658  		increment(3)
   659  		combinedRound(5)
   660  		increment(4)
   661  		combinedRound(6)
   662  		increment(5)
   663  		combinedRound(7)
   664  		increment(6)
   665  
   666  		aesRound(8)
   667  		increment(7)
   668  
        		// Finish Karatsuba and reduce (interleaved with AES round 9); the
        		// new running hash is left in ACC0.
   669  		PXOR ACC0, ACCM
   670  		PXOR ACC1, ACCM
   671  		MOVOU ACCM, T0
   672  		PSRLDQ $8, ACCM
   673  		PSLLDQ $8, T0
   674  		PXOR ACCM, ACC1
   675  		PXOR T0, ACC0
   676  
   677  		reduceRound(ACC0)
   678  		aesRound(9)
   679  
   680  		reduceRound(ACC0)
   681  		PXOR ACC1, ACC0
   682  
   683  		MOVOU (16*10)(ks), T0
   684  		CMPQ NR, $12
   685  		JB encLast2
   686  		aesRnd(T0)
   687  		aesRound(11)
   688  		MOVOU (16*12)(ks), T0
   689  		JE encLast2
   690  		aesRnd(T0)
   691  		aesRound(13)
   692  		MOVOU (16*14)(ks), T0
   693  encLast2:
   694  		aesRndLast(T0)
   695  
   696  		MOVOU (16*0)(ptx), T0
   697  		PXOR T0, B0
   698  		MOVOU (16*1)(ptx), T0
   699  		PXOR T0, B1
   700  		MOVOU (16*2)(ptx), T0
   701  		PXOR T0, B2
   702  		MOVOU (16*3)(ptx), T0
   703  		PXOR T0, B3
   704  		MOVOU (16*4)(ptx), T0
   705  		PXOR T0, B4
   706  		MOVOU (16*5)(ptx), T0
   707  		PXOR T0, B5
   708  		MOVOU (16*6)(ptx), T0
   709  		PXOR T0, B6
   710  		MOVOU (16*7)(ptx), T0
   711  		PXOR T0, B7
   712  
   713  		MOVOU B0, (16*0)(ctx)
   714  		PSHUFB BSWAP, B0
   715  		PXOR ACC0, B0
   716  		MOVOU B1, (16*1)(ctx)
   717  		PSHUFB BSWAP, B1
   718  		MOVOU B2, (16*2)(ctx)
   719  		PSHUFB BSWAP, B2
   720  		MOVOU B3, (16*3)(ctx)
   721  		PSHUFB BSWAP, B3
   722  		MOVOU B4, (16*4)(ctx)
   723  		PSHUFB BSWAP, B4
   724  		MOVOU B5, (16*5)(ctx)
   725  		PSHUFB BSWAP, B5
   726  		MOVOU B6, (16*6)(ctx)
   727  		PSHUFB BSWAP, B6
   728  		MOVOU B7, (16*7)(ctx)
   729  		PSHUFB BSWAP, B7
   730  
   731  		MOVOU B0, (16*0)(SP)
   732  		MOVOU B1, (16*1)(SP)
   733  		MOVOU B2, (16*2)(SP)
   734  		MOVOU B3, (16*3)(SP)
   735  		MOVOU B4, (16*4)(SP)
   736  		MOVOU B5, (16*5)(SP)
   737  		MOVOU B6, (16*6)(SP)
   738  		MOVOU B7, (16*7)(SP)
   739  
   740  		LEAQ 128(ptx), ptx
   741  		LEAQ 128(ctx), ctx
   742  
   743  		JMP gcmAesEncOctetsLoop
   744  
   745  gcmAesEncOctetsEnd:
   746  
        	// Hash the last eight ciphertext blocks still parked on the stack.
   747  	MOVOU (16*0)(SP), T0
   748  	MOVOU (16*0)(pTbl), ACC0
   749  	MOVOU (16*1)(pTbl), ACCM
   750  	MOVOU ACC0, ACC1
   751  	PSHUFD $78, T0, T1
   752  	PXOR T0, T1
   753  	PCLMULQDQ $0x00, T0, ACC0
   754  	PCLMULQDQ $0x11, T0, ACC1
   755  	PCLMULQDQ $0x00, T1, ACCM
   756  
   757  	mulRound(1)
   758  	mulRound(2)
   759  	mulRound(3)
   760  	mulRound(4)
   761  	mulRound(5)
   762  	mulRound(6)
   763  	mulRound(7)
   764  
   765  	PXOR ACC0, ACCM
   766  	PXOR ACC1, ACCM
   767  	MOVOU ACCM, T0
   768  	PSRLDQ $8, ACCM
   769  	PSLLDQ $8, T0
   770  	PXOR ACCM, ACC1
   771  	PXOR T0, ACC0
   772  
   773  	reduceRound(ACC0)
   774  	reduceRound(ACC0)
   775  	PXOR ACC1, ACC0
   776  
   777  	TESTQ ptxLen, ptxLen
   778  	JE gcmAesEncDone
   779  
        	// Eight counter slots were prepared ahead, but from here on only slot 0
        	// is used: rewind the counter by 7 so the next increment(0) produces the
        	// value that follows the one currently in slot 0.
   780  	SUBQ $7, aluCTR
   781  
   782  gcmAesEncSingles:
   783  
        	// Keep round keys 1..7 in B1-B7 and the multiplier from table offset
        	// 16*14 in T2 for the single-block paths.
   784  	MOVOU (16*1)(ks), B1
   785  	MOVOU (16*2)(ks), B2
   786  	MOVOU (16*3)(ks), B3
   787  	MOVOU (16*4)(ks), B4
   788  	MOVOU (16*5)(ks), B5
   789  	MOVOU (16*6)(ks), B6
   790  	MOVOU (16*7)(ks), B7
   791  
   792  	MOVOU (16*14)(pTbl), T2
   793  
   794  gcmAesEncSinglesLoop:
   795  
   796  		CMPQ ptxLen, $16
   797  		JB gcmAesEncTail
   798  		SUBQ $16, ptxLen
   799  
        		// Take the counter block from slot 0 and prepare the next one.
   800  		MOVOU (8*16 + 0*16)(SP), B0
   801  		increment(0)
   802  
   803  		AESENC B1, B0
   804  		AESENC B2, B0
   805  		AESENC B3, B0
   806  		AESENC B4, B0
   807  		AESENC B5, B0
   808  		AESENC B6, B0
   809  		AESENC B7, B0
   810  		MOVOU (16*8)(ks), T0
   811  		AESENC T0, B0
   812  		MOVOU (16*9)(ks), T0
   813  		AESENC T0, B0
   814  		MOVOU (16*10)(ks), T0
   815  		CMPQ NR, $12
   816  		JB encLast3
   817  		AESENC T0, B0
   818  		MOVOU (16*11)(ks), T0
   819  		AESENC T0, B0
   820  		MOVOU (16*12)(ks), T0
   821  		JE encLast3
   822  		AESENC T0, B0
   823  		MOVOU (16*13)(ks), T0
   824  		AESENC T0, B0
   825  		MOVOU (16*14)(ks), T0
   826  encLast3:
   827  		AESENCLAST T0, B0
   828  
   829  		MOVOU (ptx), T0
   830  		PXOR T0, B0
   831  		MOVOU B0, (ctx)
   832  
        		// Hash the fresh ciphertext block: (block ^ hash) times T2, reduced.
   833  		PSHUFB BSWAP, B0
   834  		PXOR ACC0, B0
   835  
   836  		MOVOU T2, ACC0
   837  		MOVOU T2, ACC1
   838  		MOVOU (16*15)(pTbl), ACCM
   839  
   840  		PSHUFD $78, B0, T0
   841  		PXOR B0, T0
   842  		PCLMULQDQ $0x00, B0, ACC0
   843  		PCLMULQDQ $0x11, B0, ACC1
   844  		PCLMULQDQ $0x00, T0, ACCM
   845  
   846  		PXOR ACC0, ACCM
   847  		PXOR ACC1, ACCM
   848  		MOVOU ACCM, T0
   849  		PSRLDQ $8, ACCM
   850  		PSLLDQ $8, T0
   851  		PXOR ACCM, ACC1
   852  		PXOR T0, ACC0
   853  
   854  		reduceRound(ACC0)
   855  		reduceRound(ACC0)
   856  		PXOR ACC1, ACC0
   857  
   858  		LEAQ (16*1)(ptx), ptx
   859  		LEAQ (16*1)(ctx), ctx
   860  
   861  	JMP gcmAesEncSinglesLoop
   862  
   863  gcmAesEncTail:
        	// 0..15 bytes remain.
   864  	TESTQ ptxLen, ptxLen
   865  	JE gcmAesEncDone
   866  
        	// Generate one more keystream block from counter slot 0.
   867  	MOVOU (8*16 + 0*16)(SP), B0
   868  	AESENC B1, B0
   869  	AESENC B2, B0
   870  	AESENC B3, B0
   871  	AESENC B4, B0
   872  	AESENC B5, B0
   873  	AESENC B6, B0
   874  	AESENC B7, B0
   875  	MOVOU (16*8)(ks), T0
   876  	AESENC T0, B0
   877  	MOVOU (16*9)(ks), T0
   878  	AESENC T0, B0
   879  	MOVOU (16*10)(ks), T0
   880  	CMPQ NR, $12
   881  	JB encLast4
   882  	AESENC T0, B0
   883  	MOVOU (16*11)(ks), T0
   884  	AESENC T0, B0
   885  	MOVOU (16*12)(ks), T0
   886  	JE encLast4
   887  	AESENC T0, B0
   888  	MOVOU (16*13)(ks), T0
   889  	AESENC T0, B0
   890  	MOVOU (16*14)(ks), T0
   891  encLast4:
   892  	AESENCLAST T0, B0
        	// T0 = keystream block.
   893  	MOVOU B0, T0
   894  
        	// Point ptx at the last remaining plaintext byte.
   895  	LEAQ -1(ptx)(ptxLen*1), ptx
   896  
        	// T1 = andMask entry that keeps the low ptxLen bytes. aluCTR is reused
        	// as the table base pointer; the counter is no longer needed.
   897  	MOVQ ptxLen, aluTMP
   898  	SHLQ $4, aluTMP
   899  
   900  	LEAQ andMask<>(SB), aluCTR
   901  	MOVOU -16(aluCTR)(aluTMP*1), T1
   902  
        	// Load the remaining bytes one at a time, last byte first, so the first
        	// remaining byte ends up in byte 0 of B0 and the rest of B0 is zero.
   903  	PXOR B0, B0
   904  ptxLoadLoop:
   905  		PSLLDQ $1, B0
   906  		PINSRB $0, (ptx), B0
   907  		LEAQ -1(ptx), ptx
   908  		DECQ ptxLen
   909  	JNE ptxLoadLoop
   910  
        	// Encrypt, clear the keystream bytes beyond the message, and store a
        	// full 16 bytes to the output.
   911  	PXOR T0, B0
   912  	PAND T1, B0
   913  	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT
   914  
        	// Hash the zero-padded final ciphertext block.
   915  	PSHUFB BSWAP, B0
   916  	PXOR ACC0, B0
   917  
   918  	MOVOU T2, ACC0
   919  	MOVOU T2, ACC1
   920  	MOVOU (16*15)(pTbl), ACCM
   921  
   922  	PSHUFD $78, B0, T0
   923  	PXOR B0, T0
   924  	PCLMULQDQ $0x00, B0, ACC0
   925  	PCLMULQDQ $0x11, B0, ACC1
   926  	PCLMULQDQ $0x00, T0, ACCM
   927  
   928  	PXOR ACC0, ACCM
   929  	PXOR ACC1, ACCM
   930  	MOVOU ACCM, T0
   931  	PSRLDQ $8, ACCM
   932  	PSLLDQ $8, T0
   933  	PXOR ACCM, ACC1
   934  	PXOR T0, ACC0
   935  
   936  	reduceRound(ACC0)
   937  	reduceRound(ACC0)
   938  	PXOR ACC1, ACC0
   939  
   940  gcmAesEncDone:
        	// Write the updated hash state back to T.
   941  	MOVOU ACC0, (tPtr)
   942  	RET
        // Only increment is undefined here; the register aliases and the other
        // macros stay defined.
   943  #undef increment
   944  
   945  // func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
        //
        // gcmAesDec decrypts src into dst in counter mode and folds the ciphertext
        // (src) into the hash state at T (read at entry, written at exit). The
        // counter block is read from ctr; this routine never writes it back.
        //
        // Here ptx points at dst and ctx at src. The register aliases and the
        // aesRound/aesRnd/aesRndLast/reduceRound macros are the ones still
        // defined earlier in this file; increment is redefined for this frame.
        //
        // Stack frame (128 bytes of locals): (0..7)*16(SP) hold eight counter
        // blocks, already XORed with round key 0. The ciphertext is hashed
        // straight from src, so no staging area for hashed blocks is needed.
   946  TEXT ·gcmAesDec(SB),0,$128-96
        // increment(i): bump the counter in aluCTR, XOR it with aluK, swap it back
        //   and store it as the last dword of counter slot i at (i*16)(SP).
        // combinedDecRound(i): AES round i on B0-B7 interleaved with multiplying
        //   the byte-reversed ciphertext block at (16*i)(ctx) by table pair i into
        //   ACC0/ACC1/ACCM.
   947  #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
   948  #define combinedDecRound(i) \
   949  	MOVOU (16*i)(ks), T0;\
   950  	AESENC T0, B0;\
   951  	AESENC T0, B1;\
   952  	AESENC T0, B2;\
   953  	AESENC T0, B3;\
   954  	MOVOU (16*(i*2))(pTbl), T1;\
   955  	MOVOU T1, T2;\
   956  	AESENC T0, B4;\
   957  	AESENC T0, B5;\
   958  	AESENC T0, B6;\
   959  	AESENC T0, B7;\
   960  	MOVOU (16*i)(ctx), T0;\
   961  	PSHUFB BSWAP, T0;\
   962  	PCLMULQDQ $0x00, T0, T1;\
   963  	PXOR T1, ACC0;\
   964  	PSHUFD $78, T0, T1;\
   965  	PCLMULQDQ $0x11, T0, T2;\
   966  	PXOR T1, T0;\
   967  	PXOR T2, ACC1;\
   968  	MOVOU (16*(i*2+1))(pTbl), T2;\
   969  	PCLMULQDQ $0x00, T2, T0;\
   970  	PXOR T0, ACCM
   971  
        	// Note the swapped roles compared with gcmAesEnc: dst -> ptx, src -> ctx.
   972  	MOVQ productTable+0(FP), pTbl
   973  	MOVQ dst+8(FP), ptx
   974  	MOVQ src_base+32(FP), ctx
   975  	MOVQ src_len+40(FP), ptxLen
   976  	MOVQ ctr+56(FP), ctrPtr
   977  	MOVQ T+64(FP), tPtr
   978  	MOVQ ks_base+72(FP), ks
   979  	MOVQ ks_len+80(FP), NR
   980  
        	// NR = len(ks)/4 - 1 (number of 16-byte round keys minus one).
   981  	SHRQ $2, NR
   982  	DECQ NR
   983  
   984  	MOVOU bswapMask<>(SB), BSWAP
   985  	MOVOU gcmPoly<>(SB), POLY
   986  
        	// ACC0 = running hash. aluCTR = last counter word and aluK = last word
        	// of round key 0, both byte-swapped so ADDL can increment the counter.
   987  	MOVOU (tPtr), ACC0
   988  	PXOR ACC1, ACC1
   989  	PXOR ACCM, ACCM
   990  	MOVOU (ctrPtr), B0
   991  	MOVL (3*4)(ctrPtr), aluCTR
   992  	MOVOU (ks), T0
   993  	MOVL (3*4)(ks), aluK
   994  	BSWAPL aluCTR
   995  	BSWAPL aluK
   996  
        	// T0 = counter ^ round key 0; save it as counter slot 0 and patch in the
        	// next counter value.
   997  	PXOR B0, T0
   998  	MOVOU T0, (0*16)(SP)
   999  	increment(0)
  1000  
  1001  	CMPQ ptxLen, $128
  1002  	JB gcmAesDecSingles
  1003  
        	// At least eight blocks: prepare the remaining seven counter slots.
  1004  	MOVOU T0, (1*16)(SP)
  1005  	increment(1)
  1006  	MOVOU T0, (2*16)(SP)
  1007  	increment(2)
  1008  	MOVOU T0, (3*16)(SP)
  1009  	increment(3)
  1010  	MOVOU T0, (4*16)(SP)
  1011  	increment(4)
  1012  	MOVOU T0, (5*16)(SP)
  1013  	increment(5)
  1014  	MOVOU T0, (6*16)(SP)
  1015  	increment(6)
  1016  	MOVOU T0, (7*16)(SP)
  1017  	increment(7)
  1018  
  1019  gcmAesDecOctetsLoop:
  1020  
  1021  		CMPQ ptxLen, $128
  1022  		JB gcmAesDecEndOctets
  1023  		SUBQ $128, ptxLen
  1024  
  1025  		MOVOU (0*16)(SP), B0
  1026  		MOVOU (1*16)(SP), B1
  1027  		MOVOU (2*16)(SP), B2
  1028  		MOVOU (3*16)(SP), B3
  1029  		MOVOU (4*16)(SP), B4
  1030  		MOVOU (5*16)(SP), B5
  1031  		MOVOU (6*16)(SP), B6
  1032  		MOVOU (7*16)(SP), B7
  1033  
        		// Hash ciphertext block 0 of this batch (with the running hash folded
        		// in) against table pair 0 to initialize the accumulators.
  1034  		MOVOU (16*0)(ctx), T0
  1035  		PSHUFB BSWAP, T0
  1036  		PXOR ACC0, T0
  1037  		PSHUFD $78, T0, T1
  1038  		PXOR T0, T1
  1039  
  1040  		MOVOU (16*0)(pTbl), ACC0
  1041  		MOVOU (16*1)(pTbl), ACCM
  1042  		MOVOU ACC0, ACC1
  1043  
  1044  		PCLMULQDQ $0x00, T1, ACCM
  1045  		PCLMULQDQ $0x00, T0, ACC0
  1046  		PCLMULQDQ $0x11, T0, ACC1
  1047  
        		// AES rounds 1..7 interleaved with hashing ciphertext blocks 1..7 of
        		// the same batch, plus counter preparation for the next batch.
  1048  		combinedDecRound(1)
  1049  		increment(0)
  1050  		combinedDecRound(2)
  1051  		increment(1)
  1052  		combinedDecRound(3)
  1053  		increment(2)
  1054  		combinedDecRound(4)
  1055  		increment(3)
  1056  		combinedDecRound(5)
  1057  		increment(4)
  1058  		combinedDecRound(6)
  1059  		increment(5)
  1060  		combinedDecRound(7)
  1061  		increment(6)
  1062  
  1063  		aesRound(8)
  1064  		increment(7)
  1065  
        		// Finish Karatsuba and reduce (interleaved with AES round 9); the
        		// new running hash is left in ACC0.
  1066  		PXOR ACC0, ACCM
  1067  		PXOR ACC1, ACCM
  1068  		MOVOU ACCM, T0
  1069  		PSRLDQ $8, ACCM
  1070  		PSLLDQ $8, T0
  1071  		PXOR ACCM, ACC1
  1072  		PXOR T0, ACC0
  1073  
  1074  		reduceRound(ACC0)
  1075  		aesRound(9)
  1076  
  1077  		reduceRound(ACC0)
  1078  		PXOR ACC1, ACC0
  1079  
  1080  		MOVOU (16*10)(ks), T0
        		// The flags from this CMPQ also feed the JE below; the SSE/AES
        		// instructions in between do not modify flags.
  1081  		CMPQ NR, $12
  1082  		JB decLast1
  1083  		aesRnd(T0)
  1084  		aesRound(11)
  1085  		MOVOU (16*12)(ks), T0
  1086  		JE decLast1
  1087  		aesRnd(T0)
  1088  		aesRound(13)
  1089  		MOVOU (16*14)(ks), T0
  1090  decLast1:
  1091  		aesRndLast(T0)
  1092  
        		// XOR the keystream with the ciphertext and store the plaintext.
  1093  		MOVOU (16*0)(ctx), T0
  1094  		PXOR T0, B0
  1095  		MOVOU (16*1)(ctx), T0
  1096  		PXOR T0, B1
  1097  		MOVOU (16*2)(ctx), T0
  1098  		PXOR T0, B2
  1099  		MOVOU (16*3)(ctx), T0
  1100  		PXOR T0, B3
  1101  		MOVOU (16*4)(ctx), T0
  1102  		PXOR T0, B4
  1103  		MOVOU (16*5)(ctx), T0
  1104  		PXOR T0, B5
  1105  		MOVOU (16*6)(ctx), T0
  1106  		PXOR T0, B6
  1107  		MOVOU (16*7)(ctx), T0
  1108  		PXOR T0, B7
  1109  
  1110  		MOVOU B0, (16*0)(ptx)
  1111  		MOVOU B1, (16*1)(ptx)
  1112  		MOVOU B2, (16*2)(ptx)
  1113  		MOVOU B3, (16*3)(ptx)
  1114  		MOVOU B4, (16*4)(ptx)
  1115  		MOVOU B5, (16*5)(ptx)
  1116  		MOVOU B6, (16*6)(ptx)
  1117  		MOVOU B7, (16*7)(ptx)
  1118  
  1119  		LEAQ 128(ptx), ptx
  1120  		LEAQ 128(ctx), ctx
  1121  
  1122  		JMP gcmAesDecOctetsLoop
  1123  
  1124  gcmAesDecEndOctets:
  1125  
        	// Eight counter slots were prepared ahead, but from here on only slot 0
        	// is used: rewind the counter by 7 so the next increment(0) produces the
        	// value that follows the one currently in slot 0.
  1126  	SUBQ $7, aluCTR
  1127  
  1128  gcmAesDecSingles:
  1129  
        	// Keep round keys 1..7 in B1-B7 and the multiplier from table offset
        	// 16*14 in T2 for the single-block loop.
  1130  	MOVOU (16*1)(ks), B1
  1131  	MOVOU (16*2)(ks), B2
  1132  	MOVOU (16*3)(ks), B3
  1133  	MOVOU (16*4)(ks), B4
  1134  	MOVOU (16*5)(ks), B5
  1135  	MOVOU (16*6)(ks), B6
  1136  	MOVOU (16*7)(ks), B7
  1137  
  1138  	MOVOU (16*14)(pTbl), T2
  1139  
  1140  gcmAesDecSinglesLoop:
  1141  
  1142  		CMPQ ptxLen, $16
  1143  		JB gcmAesDecTail
  1144  		SUBQ $16, ptxLen
  1145  
        		// T1 keeps the raw ciphertext block for the final XOR; B0 is the
        		// byte-reversed copy that gets hashed first.
  1146  		MOVOU (ctx), B0
  1147  		MOVOU B0, T1
  1148  		PSHUFB BSWAP, B0
  1149  		PXOR ACC0, B0
  1150  
  1151  		MOVOU T2, ACC0
  1152  		MOVOU T2, ACC1
  1153  		MOVOU (16*15)(pTbl), ACCM
  1154  
  1155  		PCLMULQDQ $0x00, B0, ACC0
  1156  		PCLMULQDQ $0x11, B0, ACC1
  1157  		PSHUFD $78, B0, T0
  1158  		PXOR B0, T0
  1159  		PCLMULQDQ $0x00, T0, ACCM
  1160  
  1161  		PXOR ACC0, ACCM
  1162  		PXOR ACC1, ACCM
  1163  		MOVOU ACCM, T0
  1164  		PSRLDQ $8, ACCM
  1165  		PSLLDQ $8, T0
  1166  		PXOR ACCM, ACC1
  1167  		PXOR T0, ACC0
  1168  
  1169  		reduceRound(ACC0)
  1170  		reduceRound(ACC0)
  1171  		PXOR ACC1, ACC0
  1172  
        		// Generate the keystream block from counter slot 0.
  1173  		MOVOU (0*16)(SP), B0
  1174  		increment(0)
  1175  		AESENC B1, B0
  1176  		AESENC B2, B0
  1177  		AESENC B3, B0
  1178  		AESENC B4, B0
  1179  		AESENC B5, B0
  1180  		AESENC B6, B0
  1181  		AESENC B7, B0
  1182  		MOVOU (16*8)(ks), T0
  1183  		AESENC T0, B0
  1184  		MOVOU (16*9)(ks), T0
  1185  		AESENC T0, B0
  1186  		MOVOU (16*10)(ks), T0
  1187  		CMPQ NR, $12
  1188  		JB decLast2
  1189  		AESENC T0, B0
  1190  		MOVOU (16*11)(ks), T0
  1191  		AESENC T0, B0
  1192  		MOVOU (16*12)(ks), T0
  1193  		JE decLast2
  1194  		AESENC T0, B0
  1195  		MOVOU (16*13)(ks), T0
  1196  		AESENC T0, B0
  1197  		MOVOU (16*14)(ks), T0
  1198  decLast2:
  1199  		AESENCLAST T0, B0
  1200  
  1201  		PXOR T1, B0
  1202  		MOVOU B0, (ptx)
  1203  
  1204  		LEAQ (16*1)(ptx), ptx
  1205  		LEAQ (16*1)(ctx), ctx
  1206  
  1207  	JMP gcmAesDecSinglesLoop
  1208  
  1209  gcmAesDecTail:
  1210  
        	// 0..15 bytes remain.
  1211  	TESTQ ptxLen, ptxLen
  1212  	JE gcmAesDecDone
  1213  
        	// T1 = andMask entry that keeps the low ptxLen bytes. aluCTR is reused
        	// as the table base pointer; increment(0) below still executes on it,
        	// but the resulting counter slot is not read again.
  1214  	MOVQ ptxLen, aluTMP
  1215  	SHLQ $4, aluTMP
  1216  	LEAQ andMask<>(SB), aluCTR
  1217  	MOVOU -16(aluCTR)(aluTMP*1), T1
  1218  
  1219  	MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
  1220  	PAND T1, B0
  1221  
        	// T1 now keeps the masked ciphertext for the final XOR; B0 is hashed.
  1222  	MOVOU B0, T1
  1223  	PSHUFB BSWAP, B0
  1224  	PXOR ACC0, B0
  1225  
  1226  	MOVOU (16*14)(pTbl), ACC0
  1227  	MOVOU (16*15)(pTbl), ACCM
  1228  	MOVOU ACC0, ACC1
  1229  
  1230  	PCLMULQDQ $0x00, B0, ACC0
  1231  	PCLMULQDQ $0x11, B0, ACC1
  1232  	PSHUFD $78, B0, T0
  1233  	PXOR B0, T0
  1234  	PCLMULQDQ $0x00, T0, ACCM
  1235  
  1236  	PXOR ACC0, ACCM
  1237  	PXOR ACC1, ACCM
  1238  	MOVOU ACCM, T0
  1239  	PSRLDQ $8, ACCM
  1240  	PSLLDQ $8, T0
  1241  	PXOR ACCM, ACC1
  1242  	PXOR T0, ACC0
  1243  
  1244  	reduceRound(ACC0)
  1245  	reduceRound(ACC0)
  1246  	PXOR ACC1, ACC0
  1247  
        	// Generate the last keystream block from counter slot 0.
  1248  	MOVOU (0*16)(SP), B0
  1249  	increment(0)
  1250  	AESENC B1, B0
  1251  	AESENC B2, B0
  1252  	AESENC B3, B0
  1253  	AESENC B4, B0
  1254  	AESENC B5, B0
  1255  	AESENC B6, B0
  1256  	AESENC B7, B0
  1257  	MOVOU (16*8)(ks), T0
  1258  	AESENC T0, B0
  1259  	MOVOU (16*9)(ks), T0
  1260  	AESENC T0, B0
  1261  	MOVOU (16*10)(ks), T0
  1262  	CMPQ NR, $12
  1263  	JB decLast3
  1264  	AESENC T0, B0
  1265  	MOVOU (16*11)(ks), T0
  1266  	AESENC T0, B0
  1267  	MOVOU (16*12)(ks), T0
  1268  	JE decLast3
  1269  	AESENC T0, B0
  1270  	MOVOU (16*13)(ks), T0
  1271  	AESENC T0, B0
  1272  	MOVOU (16*14)(ks), T0
  1273  decLast3:
  1274  	AESENCLAST T0, B0
  1275  	PXOR T1, B0
  1276  
        // Store exactly ptxLen plaintext bytes, one at a time, lowest byte first,
        // so nothing is written past the end of the output.
  1277  ptxStoreLoop:
  1278  		PEXTRB $0, B0, (ptx)
  1279  		PSRLDQ $1, B0
  1280  		LEAQ 1(ptx), ptx
  1281  		DECQ ptxLen
  1282  
  1283  	JNE ptxStoreLoop
  1284  
  1285  gcmAesDecDone:
  1286  
        	// Write the updated hash state back to T.
  1287  	MOVOU ACC0, (tPtr)
  1288  	RET
  1289  

View as plain text