Source file src/crypto/internal/fips/sha256/_asm/sha256block_amd64_asm.go

     1  // Copyright 2024 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package main
     6  
     7  import (
     8  	"os"
     9  
    10  	. "github.com/mmcloughlin/avo/build"
    11  	. "github.com/mmcloughlin/avo/operand"
    12  	. "github.com/mmcloughlin/avo/reg"
    13  )
    14  
    15  //go:generate go run . -out ../sha256block_amd64.s
    16  
    17  // SHA256 block routine. See sha256block.go for Go equivalent.
    18  //
    19  // The algorithm is detailed in FIPS 180-4:
    20  //
    21  //  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    22  
    23  // Wt = Mt; for 0 <= t <= 15
    24  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    25  //
    26  // a = H0
    27  // b = H1
    28  // c = H2
    29  // d = H3
    30  // e = H4
    31  // f = H5
    32  // g = H6
    33  // h = H7
    34  //
    35  // for t = 0 to 63 {
    36  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    37  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    38  //    h = g
    39  //    g = f
    40  //    f = e
    41  //    e = d + T1
    42  //    d = c
    43  //    c = b
    44  //    b = a
    45  //    a = T1 + T2
    46  // }
    47  //
    48  // H0 = a + H0
    49  // H1 = b + H1
    50  // H2 = c + H2
    51  // H3 = d + H3
    52  // H4 = e + H4
    53  // H5 = f + H5
    54  // H6 = g + H6
    55  // H7 = h + H7
    56  
    57  func main() {
    58  	// https://github.com/mmcloughlin/avo/issues/450
    59  	os.Setenv("GOOS", "linux")
    60  	os.Setenv("GOARCH", "amd64")
    61  
    62  	Package("crypto/internal/fips/sha256")
    63  	ConstraintExpr("!purego")
    64  	blockAMD64()
    65  	blockAVX2()
    66  	blockSHANI()
    67  	Generate()
    68  }
    69  
    70  // Wt = Mt; for 0 <= t <= 15
    71  func msgSchedule0(index int) {
    72  	MOVL(Mem{Base: SI}.Offset(index*4), EAX)
    73  	BSWAPL(EAX)
    74  	MOVL(EAX, Mem{Base: BP}.Offset(index*4))
    75  }
    76  
    77  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    78  //
    79  //	SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
    80  //	SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
    81  func msgSchedule1(index int) {
    82  	MOVL(Mem{Base: BP}.Offset((index-2)*4), EAX)
    83  	MOVL(EAX, ECX)
    84  	RORL(Imm(17), EAX)
    85  	MOVL(ECX, EDX)
    86  	RORL(Imm(19), ECX)
    87  	SHRL(Imm(10), EDX)
    88  	MOVL(Mem{Base: BP}.Offset((index-15)*4), EBX)
    89  	XORL(ECX, EAX)
    90  	MOVL(EBX, ECX)
    91  	XORL(EDX, EAX)
    92  	RORL(Imm(7), EBX)
    93  	MOVL(ECX, EDX)
    94  	SHRL(Imm(3), EDX)
    95  	RORL(Imm(18), ECX)
    96  	ADDL(Mem{Base: BP}.Offset((index-7)*4), EAX)
    97  	XORL(ECX, EBX)
    98  	XORL(EDX, EBX)
    99  	ADDL(Mem{Base: BP}.Offset((index-16)*4), EBX)
   100  	ADDL(EBX, EAX)
   101  	MOVL(EAX, Mem{Base: BP}.Offset((index)*4))
   102  }
   103  
   104  // Calculate T1 in AX - uses AX, CX and DX registers.
   105  // h is also used as an accumulator. Wt is passed in AX.
   106  //
   107  //	T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
   108  //	  BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
   109  //	  Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
   110  func sha256T1(konst uint32, e, f, g, h GPPhysical) {
   111  	ADDL(EAX, h)
   112  	MOVL(e, EAX)
   113  	ADDL(U32(konst), h)
   114  	MOVL(e, ECX)
   115  	RORL(U8(6), EAX)
   116  	MOVL(e, EDX)
   117  	RORL(U8(11), ECX)
   118  	XORL(ECX, EAX)
   119  	MOVL(e, ECX)
   120  	RORL(U8(25), EDX)
   121  	ANDL(f, ECX)
   122  	XORL(EAX, EDX)
   123  	MOVL(e, EAX)
   124  	NOTL(EAX)
   125  	ADDL(EDX, h)
   126  	ANDL(g, EAX)
   127  	XORL(ECX, EAX)
   128  	ADDL(h, EAX)
   129  }
   130  
   131  // Calculate T2 in BX - uses BX, CX, DX and DI registers.
   132  //
   133  //	T2 = BIGSIGMA0(a) + Maj(a, b, c)
   134  //	  BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
   135  //	  Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
   136  func sha256T2(a, b, c GPPhysical) {
   137  	MOVL(a, EDI)
   138  	MOVL(c, EBX)
   139  	RORL(U8(2), EDI)
   140  	MOVL(a, EDX)
   141  	ANDL(b, EBX)
   142  	RORL(U8(13), EDX)
   143  	MOVL(a, ECX)
   144  	ANDL(c, ECX)
   145  	XORL(EDX, EDI)
   146  	XORL(ECX, EBX)
   147  	MOVL(a, EDX)
   148  	MOVL(b, ECX)
   149  	RORL(U8(22), EDX)
   150  	ANDL(a, ECX)
   151  	XORL(ECX, EBX)
   152  	XORL(EDX, EDI)
   153  	ADDL(EDI, EBX)
   154  }
   155  
   156  // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
   157  // The values for e and a are stored in d and h, ready for rotation.
   158  func sha256Round(index int, konst uint32, a, b, c, d, e, f, g, h GPPhysical) {
   159  	sha256T1(konst, e, f, g, h)
   160  	sha256T2(a, b, c)
   161  	MOVL(EBX, h)
   162  	ADDL(EAX, d)
   163  	ADDL(EAX, h)
   164  }
   165  
   166  func sha256Round0(index int, konst uint32, a, b, c, d, e, f, g, h GPPhysical) {
   167  	msgSchedule0(index)
   168  	sha256Round(index, konst, a, b, c, d, e, f, g, h)
   169  }
   170  
   171  func sha256Round1(index int, konst uint32, a, b, c, d, e, f, g, h GPPhysical) {
   172  	msgSchedule1(index)
   173  	sha256Round(index, konst, a, b, c, d, e, f, g, h)
   174  }
   175  
   176  func blockAMD64() {
   177  	Implement("blockAMD64")
   178  	AllocLocal(256 + 8)
   179  
   180  	Load(Param("p").Base(), RSI)
   181  	Load(Param("p").Len(), RDX)
   182  	SHRQ(Imm(6), RDX)
   183  	SHLQ(Imm(6), RDX)
   184  
   185  	// Return if p is empty
   186  	LEAQ(Mem{Base: RSI, Index: RDX, Scale: 1}, RDI)
   187  	MOVQ(RDI, Mem{Base: SP}.Offset(256))
   188  	CMPQ(RSI, RDI)
   189  	JEQ(LabelRef("end"))
   190  
   191  	BP := Mem{Base: BP}
   192  	Load(Param("dig"), RBP)
   193  	MOVL(BP.Offset(0*4), R8L)  // a = H0
   194  	MOVL(BP.Offset(1*4), R9L)  // b = H1
   195  	MOVL(BP.Offset(2*4), R10L) // c = H2
   196  	MOVL(BP.Offset(3*4), R11L) // d = H3
   197  	MOVL(BP.Offset(4*4), R12L) // e = H4
   198  	MOVL(BP.Offset(5*4), R13L) // f = H5
   199  	MOVL(BP.Offset(6*4), R14L) // g = H6
   200  	MOVL(BP.Offset(7*4), R15L) // h = H7
   201  
   202  	loop()
   203  	end()
   204  }
   205  
   206  func rotateRight(slice *[]GPPhysical) []GPPhysical {
   207  	n := len(*slice)
   208  	new := make([]GPPhysical, n)
   209  	for i, reg := range *slice {
   210  		new[(i+1)%n] = reg
   211  	}
   212  	return new
   213  }
   214  
   215  func loop() {
   216  	Label("loop")
   217  	MOVQ(RSP, RBP)
   218  
   219  	regs := []GPPhysical{R8L, R9L, R10L, R11L, R12L, R13L, R14L, R15L}
   220  	n := len(_K)
   221  
   222  	for i := 0; i < 16; i++ {
   223  		sha256Round0(i, _K[i], regs[0], regs[1], regs[2], regs[3], regs[4], regs[5], regs[6], regs[7])
   224  		regs = rotateRight(&regs)
   225  	}
   226  
   227  	for i := 16; i < n; i++ {
   228  		sha256Round1(i, _K[i], regs[0], regs[1], regs[2], regs[3], regs[4], regs[5], regs[6], regs[7])
   229  		regs = rotateRight(&regs)
   230  	}
   231  
   232  	Load(Param("dig"), RBP)
   233  	BP := Mem{Base: BP}
   234  	ADDL(BP.Offset(0*4), R8L) //  H0 = a + H0
   235  	MOVL(R8L, BP.Offset(0*4))
   236  	ADDL(BP.Offset(1*4), R9L) //  H1 = b + H1
   237  	MOVL(R9L, BP.Offset(1*4))
   238  	ADDL(BP.Offset(2*4), R10L) // H2 = c + H2
   239  	MOVL(R10L, BP.Offset(2*4))
   240  	ADDL(BP.Offset(3*4), R11L) // H3 = d + H3
   241  	MOVL(R11L, BP.Offset(3*4))
   242  	ADDL(BP.Offset(4*4), R12L) // H4 = e + H4
   243  	MOVL(R12L, BP.Offset(4*4))
   244  	ADDL(BP.Offset(5*4), R13L) // H5 = f + H5
   245  	MOVL(R13L, BP.Offset(5*4))
   246  	ADDL(BP.Offset(6*4), R14L) // H6 = g + H6
   247  	MOVL(R14L, BP.Offset(6*4))
   248  	ADDL(BP.Offset(7*4), R15L) // H7 = h + H7
   249  	MOVL(R15L, BP.Offset(7*4))
   250  
   251  	ADDQ(Imm(64), RSI)
   252  	CMPQ(RSI, Mem{Base: SP}.Offset(256))
   253  	JB(LabelRef("loop"))
   254  }
   255  
   256  func end() {
   257  	Label("end")
   258  	RET()
   259  }
   260  
   261  var _K = []uint32{
   262  	0x428a2f98,
   263  	0x71374491,
   264  	0xb5c0fbcf,
   265  	0xe9b5dba5,
   266  	0x3956c25b,
   267  	0x59f111f1,
   268  	0x923f82a4,
   269  	0xab1c5ed5,
   270  	0xd807aa98,
   271  	0x12835b01,
   272  	0x243185be,
   273  	0x550c7dc3,
   274  	0x72be5d74,
   275  	0x80deb1fe,
   276  	0x9bdc06a7,
   277  	0xc19bf174,
   278  	0xe49b69c1,
   279  	0xefbe4786,
   280  	0x0fc19dc6,
   281  	0x240ca1cc,
   282  	0x2de92c6f,
   283  	0x4a7484aa,
   284  	0x5cb0a9dc,
   285  	0x76f988da,
   286  	0x983e5152,
   287  	0xa831c66d,
   288  	0xb00327c8,
   289  	0xbf597fc7,
   290  	0xc6e00bf3,
   291  	0xd5a79147,
   292  	0x06ca6351,
   293  	0x14292967,
   294  	0x27b70a85,
   295  	0x2e1b2138,
   296  	0x4d2c6dfc,
   297  	0x53380d13,
   298  	0x650a7354,
   299  	0x766a0abb,
   300  	0x81c2c92e,
   301  	0x92722c85,
   302  	0xa2bfe8a1,
   303  	0xa81a664b,
   304  	0xc24b8b70,
   305  	0xc76c51a3,
   306  	0xd192e819,
   307  	0xd6990624,
   308  	0xf40e3585,
   309  	0x106aa070,
   310  	0x19a4c116,
   311  	0x1e376c08,
   312  	0x2748774c,
   313  	0x34b0bcb5,
   314  	0x391c0cb3,
   315  	0x4ed8aa4a,
   316  	0x5b9cca4f,
   317  	0x682e6ff3,
   318  	0x748f82ee,
   319  	0x78a5636f,
   320  	0x84c87814,
   321  	0x8cc70208,
   322  	0x90befffa,
   323  	0xa4506ceb,
   324  	0xbef9a3f7,
   325  	0xc67178f2,
   326  }
   327  

View as plain text