Source file src/crypto/internal/fips/sha256/_asm/sha256block_amd64_avx2.go

// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
	. "github.com/mmcloughlin/avo/build"
	. "github.com/mmcloughlin/avo/operand"
	. "github.com/mmcloughlin/avo/reg"
)

// The AVX2 version is described in the Intel white paper
// "Fast SHA-256 Implementations on Intel Architecture Processors";
// to find it, go to http://www.intel.com/p/en_US/embedded
// and search for that title.
// The AVX2 version is by Intel, using the same algorithm as the code in
// the Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
// by
//     James Guilford <james.guilford@intel.com>
//     Kirk Yap <kirk.s.yap@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>

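// blockAVX2 generates an AVX2 implementation that hashes two 512-bit
// blocks per pass: the low 128-bit lane of each Y register carries the
// first block's message words and the high lane the second's. The first
// block's rounds run interleaved with the shared message schedule
// (avx2_loop1, avx2_loop2), while the second block's rounds (avx2_loop3)
// replay the schedule saved in the _XFER stack area.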
func blockAVX2() {
	Implement("blockAVX2")
	AllocLocal(536)

	Load(Param("dig"), CTX) // d.h[8]
	Load(Param("p").Base(), INP)
	Load(Param("p").Len(), NUM_BYTES)

	LEAQ(Mem{Base: INP, Index: NUM_BYTES, Scale: 1, Disp: -64}, NUM_BYTES) // Pointer to the last block
	MOVQ(NUM_BYTES, Mem{Base: SP}.Offset(_INP_END))

	CMPQ(NUM_BYTES, INP)
	JE(LabelRef("avx2_only_one_block"))

	Comment("Load initial digest")
	CTX := Mem{Base: CTX}
	MOVL(CTX.Offset(0), a)  //  a = H0
	MOVL(CTX.Offset(4), b)  //  b = H1
	MOVL(CTX.Offset(8), c)  //  c = H2
	MOVL(CTX.Offset(12), d) //  d = H3
	MOVL(CTX.Offset(16), e) //  e = H4
	MOVL(CTX.Offset(20), f) //  f = H5
	MOVL(CTX.Offset(24), g) //  g = H6
	MOVL(CTX.Offset(28), h) //  h = H7

	avx2_loop0()
	avx2_last_block_enter()
	avx2_loop1()
	avx2_loop2()
	avx2_loop3()
	avx2_do_last_block()
	avx2_only_one_block()
	done_hash()
}

func avx2_loop0() {
	Label("avx2_loop0")
	Comment("each iteration processes two blocks (2 x 512 bits)")
	VMOVDQU(Mem{Base: INP}.Offset(0*32), XTMP0)
	VMOVDQU(Mem{Base: INP}.Offset(1*32), XTMP1)
	VMOVDQU(Mem{Base: INP}.Offset(2*32), XTMP2)
	VMOVDQU(Mem{Base: INP}.Offset(3*32), XTMP3)

	flip_mask := flip_mask_DATA()

	VMOVDQU(flip_mask, BYTE_FLIP_MASK)

	Comment("Apply Byte Flip Mask: LE -> BE")
	VPSHUFB(BYTE_FLIP_MASK, XTMP0, XTMP0)
	VPSHUFB(BYTE_FLIP_MASK, XTMP1, XTMP1)
	VPSHUFB(BYTE_FLIP_MASK, XTMP2, XTMP2)
	VPSHUFB(BYTE_FLIP_MASK, XTMP3, XTMP3)

	Comment("Transpose data into high/low parts")
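	// VPERM2I128 with immediate 0x20 concatenates the low 128-bit lanes
	// of its two sources and 0x31 the high lanes, so each XDWORDn ends
	// up holding the same four message words for both blocks: the first
	// block in its low lane, the second in its high lane.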
	VPERM2I128(Imm(0x20), XTMP2, XTMP0, XDWORD0) //  w3,  w2,  w1,  w0
	VPERM2I128(Imm(0x31), XTMP2, XTMP0, XDWORD1) //  w7,  w6,  w5,  w4
	VPERM2I128(Imm(0x20), XTMP3, XTMP1, XDWORD2) // w11, w10,  w9,  w8
	VPERM2I128(Imm(0x31), XTMP3, XTMP1, XDWORD3) // w15, w14, w13, w12

	K256 := K256_DATA()
	LEAQ(K256, TBL) // Loading address of table with round-specific constants
}

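// avx2_last_block_enter advances INP past the first of the two loaded
// blocks, saves it for the end-of-input checks, and zeroes SRND, the
// byte offset used to index both the K256 table and the _XFER area.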
func avx2_last_block_enter() {
	Label("avx2_last_block_enter")
	ADDQ(Imm(64), INP)
	MOVQ(INP, Mem{Base: SP}.Offset(_INP))
	XORQ(SRND, SRND)
}

// for w0 - w47
func avx2_loop1() {
	Label("avx2_loop1")

	Comment("Do 4 rounds and scheduling")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((0 * 32)), XDWORD0, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
	roundAndSchedN0(_XFER+0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	roundAndSchedN1(_XFER+0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	roundAndSchedN2(_XFER+0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	roundAndSchedN3(_XFER+0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	Comment("Do 4 rounds and scheduling")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
	roundAndSchedN0(_XFER+1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	roundAndSchedN1(_XFER+1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	roundAndSchedN2(_XFER+1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	roundAndSchedN3(_XFER+1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	Comment("Do 4 rounds and scheduling")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((2 * 32)), XDWORD2, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+2*32))
	roundAndSchedN0(_XFER+2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	roundAndSchedN1(_XFER+2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	roundAndSchedN2(_XFER+2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	roundAndSchedN3(_XFER+2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	Comment("Do 4 rounds and scheduling")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((3 * 32)), XDWORD3, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+3*32))
	roundAndSchedN0(_XFER+3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	roundAndSchedN1(_XFER+3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	roundAndSchedN2(_XFER+3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	roundAndSchedN3(_XFER+3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

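	// Each iteration covers 16 rounds (four groups of four) and advances
	// SRND by 4*32 bytes; three iterations complete rounds 0-47.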
	ADDQ(Imm(4*32), SRND)
	CMPQ(SRND, U32(3*4*32))
	JB(LabelRef("avx2_loop1"))
}

// w48 - w63 processed with no scheduling (last 16 rounds)
func avx2_loop2() {
	Label("avx2_loop2")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(0*32), XDWORD0, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
	doRoundN0(_XFER+0*32, a, b, c, d, e, f, g, h, h)
	doRoundN1(_XFER+0*32, h, a, b, c, d, e, f, g, h)
	doRoundN2(_XFER+0*32, g, h, a, b, c, d, e, f, g)
	doRoundN3(_XFER+0*32, f, g, h, a, b, c, d, e, f)

	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
	doRoundN0(_XFER+1*32, e, f, g, h, a, b, c, d, e)
	doRoundN1(_XFER+1*32, d, e, f, g, h, a, b, c, d)
	doRoundN2(_XFER+1*32, c, d, e, f, g, h, a, b, c)
	doRoundN3(_XFER+1*32, b, c, d, e, f, g, h, a, b)

	ADDQ(Imm(2*32), SRND)

	// Move the remaining scheduled words into place for the next iteration.
	VMOVDQU(XDWORD2, XDWORD0)
	VMOVDQU(XDWORD3, XDWORD1)

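	// Two iterations of eight rounds cover rounds 48-63; 4*4*32 is the
	// byte size of the full K256 table.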
	CMPQ(SRND, U32(4*4*32))
	JB(LabelRef("avx2_loop2"))

	Load(Param("dig"), CTX) // d.h[8]
	MOVQ(Mem{Base: SP}.Offset(_INP), INP)

	registers := []GPPhysical{a, b, c, d, e, f, g, h}
	for i, reg := range registers {
		addm(Mem{Base: CTX}.Offset(i*4), reg)
	}

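	// If INP has moved past the last-block pointer, the block just
	// processed was the final one; otherwise fall through to avx2_loop3
	// to run the second block from the saved schedule.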
	CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
	JB(LabelRef("done_hash"))

	XORQ(SRND, SRND)
}

// Do second block using previously scheduled results
func avx2_loop3() {
	Label("avx2_loop3")
	// The +16 displacement selects the high-lane (second block) half of
	// each 32-byte _XFER slot.
	doRoundN0(_XFER+0*32+16, a, b, c, d, e, f, g, h, a)
	doRoundN1(_XFER+0*32+16, h, a, b, c, d, e, f, g, h)
	doRoundN2(_XFER+0*32+16, g, h, a, b, c, d, e, f, g)
	doRoundN3(_XFER+0*32+16, f, g, h, a, b, c, d, e, f)

	doRoundN0(_XFER+1*32+16, e, f, g, h, a, b, c, d, e)
	doRoundN1(_XFER+1*32+16, d, e, f, g, h, a, b, c, d)
	doRoundN2(_XFER+1*32+16, c, d, e, f, g, h, a, b, c)
	doRoundN3(_XFER+1*32+16, b, c, d, e, f, g, h, a, b)

	ADDQ(Imm(2*32), SRND)
	CMPQ(SRND, U32(4*4*32))
	JB(LabelRef("avx2_loop3"))

	Load(Param("dig"), CTX) // d.h[8]
	MOVQ(Mem{Base: SP}.Offset(_INP), INP)
	ADDQ(Imm(64), INP)

	registers := []GPPhysical{a, b, c, d, e, f, g, h}
	for i, reg := range registers {
		addm(Mem{Base: CTX}.Offset(i*4), reg)
	}

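	// More than one block remaining: restart the two-block loop. Exactly
	// one block remaining (equal): fall through to avx2_do_last_block.
	// Nothing remaining: done.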
	CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
	JA(LabelRef("avx2_loop0"))
	JB(LabelRef("done_hash"))
}

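// avx2_do_last_block loads a single 64-byte block into the X registers
// (which zeroes the upper Y lanes) and re-enters the two-block round
// code; the meaningless high-lane results are never consumed, because
// avx2_loop2 exits through done_hash once INP passes the last block.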
func avx2_do_last_block() {
	Label("avx2_do_last_block")
	VMOVDQU(Mem{Base: INP}.Offset(0), XWORD0)
	VMOVDQU(Mem{Base: INP}.Offset(16), XWORD1)
	VMOVDQU(Mem{Base: INP}.Offset(32), XWORD2)
	VMOVDQU(Mem{Base: INP}.Offset(48), XWORD3)

	flip_mask := flip_mask_DATA()
	VMOVDQU(flip_mask, BYTE_FLIP_MASK)

	VPSHUFB(X_BYTE_FLIP_MASK, XWORD0, XWORD0)
	VPSHUFB(X_BYTE_FLIP_MASK, XWORD1, XWORD1)
	VPSHUFB(X_BYTE_FLIP_MASK, XWORD2, XWORD2)
	VPSHUFB(X_BYTE_FLIP_MASK, XWORD3, XWORD3)

	K256 := K256_DATA()
	LEAQ(K256, TBL)

	JMP(LabelRef("avx2_last_block_enter"))
}

// Load the initial digest, then process the single remaining block.
func avx2_only_one_block() {
	Label("avx2_only_one_block")
	registers := []GPPhysical{a, b, c, d, e, f, g, h}
	for i, reg := range registers {
		MOVL(Mem{Base: CTX}.Offset(i*4), reg)
	}
	JMP(LabelRef("avx2_do_last_block"))
}

func done_hash() {
	Label("done_hash")
	VZEROUPPER()
	RET()
}

// addm (mem), reg
//   - Add reg to mem in place, then copy the sum back into reg
func addm(P1 Mem, P2 GPPhysical) {
	ADDL(P2, P1)
	MOVL(P1, P2)
}

var (
	XDWORD0 VecPhysical = Y4
	XDWORD1             = Y5
	XDWORD2             = Y6
	XDWORD3             = Y7

	XWORD0 = X4
	XWORD1 = X5
	XWORD2 = X6
	XWORD3 = X7

	XTMP0 = Y0
	XTMP1 = Y1
	XTMP2 = Y2
	XTMP3 = Y3
	XTMP4 = Y8
	XTMP5 = Y11

	XFER = Y9

	BYTE_FLIP_MASK   = Y13 // mask to convert LE -> BE
	X_BYTE_FLIP_MASK = X13

	NUM_BYTES GPPhysical = RDX
	INP                  = RDI

	CTX = RSI // Beginning of digest in memory (a, b, c, ... , h)

	a = EAX
	b = EBX
	c = ECX
	d = R8L
	e = EDX
	f = R9L
	g = R10L
	h = R11L

	old_h = R11L // same register as h

	TBL = RBP

	SRND = RSI // SRND is same register as CTX

	T1 = R12L

	y0 = R13L
	y1 = R14L
	y2 = R15L
	y3 = EDI

	// Stack frame offsets. AllocLocal(536) in blockAVX2 reserves
	// STACK_SIZE (528) bytes plus what appears to be alignment padding.
	XFER_SIZE    = 2 * 64 * 4 // 64 rounds x 2 lanes x 4 bytes of k+w values
	INP_END_SIZE = 8
	INP_SIZE     = 8

	_XFER      = 0
	_INP_END   = _XFER + XFER_SIZE
	_INP       = _INP_END + INP_END_SIZE
	STACK_SIZE = _INP + INP_SIZE
)

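// roundAndSchedN0 through roundAndSchedN3 each execute one SHA-256 round
// while contributing one quarter of the work for the next four
// message-schedule words. Per FIPS 180-4, the schedule is
//
//	W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
//	sigma0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3)
//	sigma1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
//
// and each round computes
//
//	T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
//	T2 = Sigma0(a) + Maj(a,b,c)
//	d += T1; h = T1 + T2
//
// with Sigma1(e) assembled from RORX by 6, 11, and 25, Sigma0(a) from
// RORX by 2, 13, and 22, Ch(e,f,g) = ((f^g)&e)^g, and
// Maj(a,b,c) = ((a|c)&b)|(a&c).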
func roundAndSchedN0(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
	//                                                                 ################################### RND N + 0 ############################
	MOVL(a, y3)           //                                           y3 = a
	RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
	RORXL(Imm(11), e, y1) //                                           y1 = e >> 11

	ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c
	VPALIGNR(Imm(4), XDWORD2, XDWORD3, XTMP0)                       // XTMP0 = W[-7]
	MOVL(f, y2)                                                     // y2 = f
	RORXL(Imm(13), a, T1)                                           // T1 = a >> 13

	XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11)
	XORL(g, y2)                   //                                   y2 = f^g
	VPADDD(XDWORD0, XTMP0, XTMP0) //                                   XTMP0 = W[-7] + W[-16]
	RORXL(Imm(6), e, y1)          //                                   y1 = (e >> 6)

	ANDL(e, y2)           //                                           y2 = (f^g)&e
	XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
	ADDL(h, d)            //                                           d = k + w + h + d

	ANDL(b, y3)                               //                       y3 = (a|c)&b
	VPALIGNR(Imm(4), XDWORD0, XDWORD1, XTMP1) //                       XTMP1 = W[-15]
	XORL(T1, y1)                              //                       y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                      //                       T1 = (a >> 2)

	XORL(g, y2)                  //                                    y2 = CH = ((f^g)&e)^g
	VPSRLD(Imm(7), XTMP1, XTMP2) //                                    XTMP2 = W[-15] >> 7
	XORL(T1, y1)                 //                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)                  //                                    T1 = a
	ANDL(c, T1)                  //                                    T1 = a&c

	ADDL(y0, y2)                    //                                 y2 = S1 + CH
	VPSLLD(Imm(32-7), XTMP1, XTMP3) //                                 XTMP3 = W[-15] << (32-7)
	ORL(T1, y3)                     //                                 y3 = MAJ = ((a|c)&b)|(a&c)
	ADDL(y1, h)                     //                                 h = k + w + h + S0

	ADDL(y2, d)               //                                       d = k + w + h + d + S1 + CH = d + t1
	VPOR(XTMP2, XTMP3, XTMP3) //                                       XTMP3 = W[-15] ror 7

	VPSRLD(Imm(18), XTMP1, XTMP2) //                                   XTMP2 = W[-15] >> 18
	ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0
	ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
}

func roundAndSchedN1(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
	//                                                                 ################################### RND N + 1 ############################
	MOVL(a, y3)                                                     // y3 = a
	RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
	RORXL(Imm(11), e, y1)                                           // y1 = e >> 11
	ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	VPSRLD(Imm(3), XTMP1, XTMP4) //                                    XTMP4 = W[-15] >> 3
	MOVL(f, y2)                  //                                    y2 = f
	RORXL(Imm(13), a, T1)        //                                    T1 = a >> 13
	XORL(y1, y0)                 //                                    y0 = (e>>25) ^ (e>>11)
	XORL(g, y2)                  //                                    y2 = f^g

	RORXL(Imm(6), e, y1)  //                                           y1 = (e >> 6)
	XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
	ANDL(e, y2)           //                                           y2 = (f^g)&e
	ADDL(h, d)            //                                           d = k + w + h + d

	VPSLLD(Imm(32-18), XTMP1, XTMP1) //                                XTMP1 = W[-15] << (32-18)
	ANDL(b, y3)  //                                                    y3 = (a|c)&b
	XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13)

	VPXOR(XTMP1, XTMP3, XTMP3) //                                      XTMP3 = W[-15] ror 7 ^ W[-15] << (32-18)
	RORXL(Imm(2), a, T1) //                                            T1 = (a >> 2)
	XORL(g, y2)          //                                            y2 = CH = ((f^g)&e)^g

	VPXOR(XTMP2, XTMP3, XTMP3) //                                      XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	XORL(T1, y1)               //                                      y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)                //                                      T1 = a
	ANDL(c, T1)                //                                      T1 = a&c
	ADDL(y0, y2)               //                                      y2 = S1 + CH

	VPXOR(XTMP4, XTMP3, XTMP1)         //                              XTMP1 = s0
	VPSHUFD(Imm(0xFA), XDWORD3, XTMP2) //                              XTMP2 = W[-2] {BBAA}
	ORL(T1, y3)                        //                              y3 = MAJ = ((a|c)&b)|(a&c)
	ADDL(y1, h)                        //                              h = k + w + h + S0

	VPADDD(XTMP1, XTMP0, XTMP0) //                                     XTMP0 = W[-16] + W[-7] + s0
	ADDL(y2, d)                 //                                     d = k + w + h + d + S1 + CH = d + t1
	ADDL(y2, h)                 //                                     h = k + w + h + S0 + S1 + CH = t1 + S0
	ADDL(y3, h)                 //                                     h = t1 + S0 + MAJ

	VPSRLD(Imm(10), XTMP2, XTMP4) //                                   XTMP4 = W[-2] >> 10 {BBAA}
}

func roundAndSchedN2(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
	//                                                                 ################################### RND N + 2 ############################
	var shuff_00BA Mem = shuff_00BA_DATA()

	MOVL(a, y3)                                                     // y3 = a
	RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
	ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h

	VPSRLQ(Imm(19), XTMP2, XTMP3) //                                   XTMP3 = W[-2] ror 19 {xBxA}
	RORXL(Imm(11), e, y1)         //                                   y1 = e >> 11
	ORL(c, y3)                    //                                   y3 = a|c
	MOVL(f, y2)                   //                                   y2 = f
	XORL(g, y2)                   //                                   y2 = f^g

	RORXL(Imm(13), a, T1)         //                                   T1 = a >> 13
	XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11)
	VPSRLQ(Imm(17), XTMP2, XTMP2) //                                   XTMP2 = W[-2] ror 17 {xBxA}
	ANDL(e, y2)                   //                                   y2 = (f^g)&e

	RORXL(Imm(6), e, y1)       //                                      y1 = (e >> 6)
	VPXOR(XTMP3, XTMP2, XTMP2) //                                      XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
	ADDL(h, d)                 //                                      d = k + w + h + d
	ANDL(b, y3)                //                                      y3 = (a|c)&b

	XORL(y1, y0)               //                                      y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(22), a, y1)      //                                      y1 = a >> 22
	VPXOR(XTMP2, XTMP4, XTMP4) //                                      XTMP4 = s1 {xBxA}
	XORL(g, y2)                //                                      y2 = CH = ((f^g)&e)^g

	VPSHUFB(shuff_00BA, XTMP4, XTMP4) //                               XTMP4 = s1 {00BA}

	XORL(T1, y1)                //                                     y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)        //                                     T1 = (a >> 2)
	VPADDD(XTMP4, XTMP0, XTMP0) //                                     XTMP0 = {..., ..., W[1], W[0]}

	XORL(T1, y1)                   //                                  y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)                    //                                  T1 = a
	ANDL(c, T1)                    //                                  T1 = a&c
	ADDL(y0, y2)                   //                                  y2 = S1 + CH
	VPSHUFD(Imm(80), XTMP0, XTMP2) //                                  XTMP2 = W[-2] {DDCC}

	ORL(T1, y3) //                                                     y3 = MAJ = ((a|c)&b)|(a&c)
	ADDL(y1, h) //                                                     h = k + w + h + S0
	ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
	ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0

	ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
}

func roundAndSchedN3(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
	//                                                                 ################################### RND N + 3 ############################
	var shuff_DC00 Mem = shuff_DC00_DATA()

	MOVL(a, y3)                                                     // y3 = a
	RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
	RORXL(Imm(11), e, y1)                                           // y1 = e >> 11
	ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	VPSRLD(Imm(10), XTMP2, XTMP5) //                                   XTMP5 = W[-2] >> 10 {DDCC}
	MOVL(f, y2)                   //                                   y2 = f
	RORXL(Imm(13), a, T1)         //                                   T1 = a >> 13
	XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11)
	XORL(g, y2)                   //                                   y2 = f^g

	VPSRLQ(Imm(19), XTMP2, XTMP3) //                                   XTMP3 = W[-2] ror 19 {xDxC}
	RORXL(Imm(6), e, y1)          //                                   y1 = (e >> 6)
	ANDL(e, y2)                   //                                   y2 = (f^g)&e
	ADDL(h, d)                    //                                   d = k + w + h + d
	ANDL(b, y3)                   //                                   y3 = (a|c)&b

	VPSRLQ(Imm(17), XTMP2, XTMP2) //                                   XTMP2 = W[-2] ror 17 {xDxC}
	XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	XORL(g, y2)                   //                                   y2 = CH = ((f^g)&e)^g

	VPXOR(XTMP3, XTMP2, XTMP2) //                                      XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
	RORXL(Imm(22), a, y1)      //                                      y1 = a >> 22
	ADDL(y0, y2)               //                                      y2 = S1 + CH

	VPXOR(XTMP2, XTMP5, XTMP5) //                                      XTMP5 = s1 {xDxC}
	XORL(T1, y1)               //                                      y1 = (a>>22) ^ (a>>13)
	ADDL(y2, d)                //                                      d = k + w + h + d + S1 + CH = d + t1

	RORXL(Imm(2), a, T1) //                                            T1 = (a >> 2)

	VPSHUFB(shuff_DC00, XTMP5, XTMP5) //                               XTMP5 = s1 {DC00}

	VPADDD(XTMP0, XTMP5, XDWORD0) //                                   XDWORD0 = {W[3], W[2], W[1], W[0]}
	XORL(T1, y1)                  //                                   y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)                   //                                   T1 = a
	ANDL(c, T1)                   //                                   T1 = a&c
	ORL(T1, y3)                   //                                   y3 = MAJ = ((a|c)&b)|(a&c)

	ADDL(y1, h) //                                                     h = k + w + h + S0
	ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0
	ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
}

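// doRoundN0 through doRoundN3 are plain rounds with no message
// scheduling. To shorten the critical path, the final h updates are
// deferred: doRoundN1 through doRoundN3 start by folding the previous
// round's S1+CH (y2) and MAJ (y3) into old_h, and doRoundN3 also
// completes its own h so each four-round group ends fully updated.
// doRoundN0's old_h argument is unused.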
func doRoundN0(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
	//                                                                 ################################### RND N + 0 ###########################
	MOVL(f, y2)           //                                           y2 = f
	RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
	RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
	XORL(g, y2)           //                                           y2 = f^g

	XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
	RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
	ANDL(e, y2)          //                                            y2 = (f^g)&e

	XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
	XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
	RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
	MOVL(a, y3)           //                                           y3 = a

	XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
	ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)  //                                                    T1 = a
	ANDL(b, y3)  //                                                    y3 = (a|c)&b
	ANDL(c, T1)  //                                                    T1 = a&c
	ADDL(y0, y2) //                                                    y2 = S1 + CH

	ADDL(h, d)  //                                                     d = k + w + h + d
	ORL(T1, y3) //                                                     y3 = MAJ = ((a|c)&b)|(a&c)
	ADDL(y1, h) //                                                     h = k + w + h + S0
	ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
}

func doRoundN1(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
	//                                                                 ################################### RND N + 1 ###########################
	ADDL(y2, old_h)       //                                           h = k + w + h + S0 + S1 + CH = t1 + S0
	MOVL(f, y2)           //                                           y2 = f
	RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
	RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
	XORL(g, y2)           //                                           y2 = f^g

	XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
	RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
	ANDL(e, y2)          //                                            y2 = (f^g)&e
	ADDL(y3, old_h)      //                                            h = t1 + S0 + MAJ

	XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
	XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
	RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
	MOVL(a, y3)           //                                           y3 = a

	XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
	ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)  //                                                    T1 = a
	ANDL(b, y3)  //                                                    y3 = (a|c)&b
	ANDL(c, T1)  //                                                    T1 = a&c
	ADDL(y0, y2) //                                                    y2 = S1 + CH

	ADDL(h, d)  //                                                     d = k + w + h + d
	ORL(T1, y3) //                                                     y3 = MAJ = ((a|c)&b)|(a&c)
	ADDL(y1, h) //                                                     h = k + w + h + S0

	ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
}

func doRoundN2(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
	//                                                                 ################################### RND N + 2 ##############################
	ADDL(y2, old_h)       //                                           h = k + w + h + S0 + S1 + CH = t1 + S0
	MOVL(f, y2)           //                                           y2 = f
	RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
	RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
	XORL(g, y2)           //                                           y2 = f^g

	XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
	RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
	ANDL(e, y2)          //                                            y2 = (f^g)&e
	ADDL(y3, old_h)      //                                            h = t1 + S0 + MAJ

	XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
	XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
	RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
	MOVL(a, y3)           //                                           y3 = a

	XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
	ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)  //                                                    T1 = a
	ANDL(b, y3)  //                                                    y3 = (a|c)&b
	ANDL(c, T1)  //                                                    T1 = a&c
	ADDL(y0, y2) //                                                    y2 = S1 + CH

	ADDL(h, d)  //                                                     d = k + w + h + d
	ORL(T1, y3) //                                                     y3 = MAJ = ((a|c)&b)|(a&c)
	ADDL(y1, h) //                                                     h = k + w + h + S0

	ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
}

func doRoundN3(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
	//                                                                 ################################### RND N + 3 ###########################
	ADDL(y2, old_h)       //                                           h = k + w + h + S0 + S1 + CH = t1 + S0
	MOVL(f, y2)           //                                           y2 = f
	RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
	RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
	XORL(g, y2)           //                                           y2 = f^g

	XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
	RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
	ANDL(e, y2)          //                                            y2 = (f^g)&e
	ADDL(y3, old_h)      //                                            h = t1 + S0 + MAJ

	XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
	XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
	RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
	MOVL(a, y3)           //                                           y3 = a

	XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
	ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)  //                                                    T1 = a
	ANDL(b, y3)  //                                                    y3 = (a|c)&b
	ANDL(c, T1)  //                                                    T1 = a&c
	ADDL(y0, y2) //                                                    y2 = S1 + CH

	ADDL(h, d)  //                                                     d = k + w + h + d
	ORL(T1, y3) //                                                     y3 = MAJ = ((a|c)&b)|(a&c)
	ADDL(y1, h) //                                                     h = k + w + h + S0

	ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1

	ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0

	ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
}

// Pointers for memoizing Data section symbols
var flip_maskPtr, shuff_00BAPtr, shuff_DC00Ptr, K256Ptr *Mem

// flip_mask is a PSHUFB mask that byte-swaps each 32-bit word, shuffling
// the little-endian input into the big-endian word order SHA-256 expects.
func flip_mask_DATA() Mem {
	if flip_maskPtr != nil {
		return *flip_maskPtr
	}

	flip_mask := GLOBL("flip_mask", RODATA)
	flip_maskPtr = &flip_mask

	DATA(0x00, U64(0x0405060700010203))
	DATA(0x08, U64(0x0c0d0e0f08090a0b))
	DATA(0x10, U64(0x0405060700010203))
	DATA(0x18, U64(0x0c0d0e0f08090a0b))
	return flip_mask
}

// shuffle xBxA -> 00BA
func shuff_00BA_DATA() Mem {
	if shuff_00BAPtr != nil {
		return *shuff_00BAPtr
	}

	shuff_00BA := GLOBL("shuff_00BA", RODATA)
	shuff_00BAPtr = &shuff_00BA

	DATA(0x00, U64(0x0b0a090803020100))
	DATA(0x08, U64(0xFFFFFFFFFFFFFFFF))
	DATA(0x10, U64(0x0b0a090803020100))
	DATA(0x18, U64(0xFFFFFFFFFFFFFFFF))
	return shuff_00BA
}

// shuffle xDxC -> DC00
func shuff_DC00_DATA() Mem {
	if shuff_DC00Ptr != nil {
		return *shuff_DC00Ptr
	}

	shuff_DC00 := GLOBL("shuff_DC00", RODATA)
	shuff_DC00Ptr = &shuff_DC00

	DATA(0x00, U64(0xFFFFFFFFFFFFFFFF))
	DATA(0x08, U64(0x0b0a090803020100))
	DATA(0x10, U64(0xFFFFFFFFFFFFFFFF))
	DATA(0x18, U64(0x0b0a090803020100))
	return shuff_DC00
}

// Round-specific constants
func K256_DATA() Mem {
	if K256Ptr != nil {
		return *K256Ptr
	}

	K256 := GLOBL("K256", NOPTR+RODATA)
	K256Ptr = &K256

	offset_idx := 0

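	// Write each group of four constants twice, once per 128-bit lane,
	// so a single VPADDD adds the same round constants to both blocks'
	// schedule words.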
	for i := 0; i < len(_K); i += 4 {
		DATA((offset_idx+0)*4, U32(_K[i+0])) // k1
		DATA((offset_idx+1)*4, U32(_K[i+1])) // k2
		DATA((offset_idx+2)*4, U32(_K[i+2])) // k3
		DATA((offset_idx+3)*4, U32(_K[i+3])) // k4

		DATA((offset_idx+4)*4, U32(_K[i+0])) // k1
		DATA((offset_idx+5)*4, U32(_K[i+1])) // k2
		DATA((offset_idx+6)*4, U32(_K[i+2])) // k3
		DATA((offset_idx+7)*4, U32(_K[i+3])) // k4
		offset_idx += 8
	}
	return K256
}