// Copyright 2024 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package main import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" ) // The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2 // It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version // // Reference // S. Gulley, et al, "New Instructions Supporting the Secure Hash // Algorithm on IntelĀ® Architecture Processors", July 2013 // https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html func blockSHANI() { Implement("blockSHANI") Load(Param("dig"), digestPtr) // init digest hash vector H0, H1,..., H7 pointer Load(Param("p").Base(), dataPtr) // init input data base pointer Load(Param("p").Len(), numBytes) // get number of input bytes to hash SHRQ(Imm(6), numBytes) // force modulo 64 input buffer length SHLQ(Imm(6), numBytes) CMPQ(numBytes, Imm(0)) // exit early for zero-length input buffer JEQ(LabelRef("done")) ADDQ(dataPtr, numBytes) // point numBytes to end of input buffer VMOVDQU(Mem{Base: digestPtr}.Offset(0*16), state0) // load initial hash values and reorder VMOVDQU(Mem{Base: digestPtr}.Offset(1*16), state1) // DCBA, HGFE -> ABEF, CDGH PSHUFD(Imm(0xb1), state0, state0) // CDAB PSHUFD(Imm(0x1b), state1, state1) // EFGH VMOVDQA(state0, m4) PALIGNR(Imm(8), state1, state0) // ABEF PBLENDW(Imm(0xf0), m4, state1) // CDGH flip_mask := flip_mask_DATA() VMOVDQA(flip_mask, shufMask) LEAQ(K256_DATA(), sha256Constants) roundLoop() done() } func roundLoop() { Label("roundLoop") Comment("save hash values for addition after rounds") VMOVDQA(state0, abefSave) VMOVDQA(state1, cdghSave) Comment("do rounds 0-59") rounds0to11(m0, nil, 0, nop) // 0-3 rounds0to11(m1, m0, 1, sha256msg1) // 4-7 rounds0to11(m2, m1, 2, sha256msg1) // 8-11 VMOVDQU(Mem{Base: dataPtr}.Offset(3*16), msg) PSHUFB(shufMask, msg) rounds12to59(m3, 3, m2, m0, sha256msg1, vmovrev) // 12-15 rounds12to59(m0, 4, m3, m1, sha256msg1, vmov) // 16-19 rounds12to59(m1, 5, m0, m2, sha256msg1, vmov) // 20-23 rounds12to59(m2, 6, m1, m3, sha256msg1, vmov) // 24-27 rounds12to59(m3, 7, m2, m0, sha256msg1, vmov) // 28-31 rounds12to59(m0, 8, m3, m1, sha256msg1, vmov) // 32-35 rounds12to59(m1, 9, m0, m2, sha256msg1, vmov) // 36-39 rounds12to59(m2, 10, m1, m3, sha256msg1, vmov) // 40-43 rounds12to59(m3, 11, m2, m0, sha256msg1, vmov) // 44-47 rounds12to59(m0, 12, m3, m1, sha256msg1, vmov) // 48-51 rounds12to59(m1, 13, m0, m2, nop, vmov) // 52-55 rounds12to59(m2, 14, m1, m3, nop, vmov) // 56-59 Comment("do rounds 60-63") VMOVDQA(m3, msg) PADDD(Mem{Base: sha256Constants}.Offset(15*32), msg) SHA256RNDS2(msg, state0, state1) PSHUFD(Imm(0x0e), msg, msg) SHA256RNDS2(msg, state1, state0) Comment("add current hash values with previously saved") PADDD(abefSave, state0) PADDD(cdghSave, state1) Comment("advance data pointer; loop until buffer empty") ADDQ(Imm(64), dataPtr) CMPQ(numBytes, dataPtr) JNE(LabelRef("roundLoop")) Comment("write hash values back in the correct order") PSHUFD(Imm(0x1b), state0, state0) PSHUFD(Imm(0xb1), state1, state1) VMOVDQA(state0, m4) PBLENDW(Imm(0xf0), state1, state0) PALIGNR(Imm(8), m4, state1) VMOVDQU(state0, Mem{Base: digestPtr}.Offset(0*16)) VMOVDQU(state1, Mem{Base: digestPtr}.Offset(1*16)) } func done() { Label("done") RET() } var ( digestPtr GPPhysical = RDI // input/output, base pointer to digest hash vector H0, H1, ..., H7 dataPtr = RSI // input, base pointer to first input data block numBytes = RDX // input, number of input bytes to be processed sha256Constants = RAX // round contents from K256 table, indexed by round number x 32 msg VecPhysical = X0 // input data state0 = X1 // round intermediates and outputs state1 = X2 m0 = X3 // m0, m1,... m4 -- round message temps m1 = X4 m2 = X5 m3 = X6 m4 = X7 shufMask = X8 // input data endian conversion control mask abefSave = X9 // digest hash vector inter-block buffer abef cdghSave = X10 // digest hash vector inter-block buffer cdgh ) // nop instead of final SHA256MSG1 for first and last few rounds func nop(m, a VecPhysical) { } // final SHA256MSG1 for middle rounds that require it func sha256msg1(m, a VecPhysical) { SHA256MSG1(m, a) } // msg copy for all but rounds 12-15 func vmov(a, b VecPhysical) { VMOVDQA(a, b) } // reverse copy for rounds 12-15 func vmovrev(a, b VecPhysical) { VMOVDQA(b, a) } type VecFunc func(a, b VecPhysical) // sha rounds 0 to 11 // // identical with the exception of the final msg op // which is replaced with a nop for rounds where it is not needed // refer to Gulley, et al for more information func rounds0to11(m, a VecPhysical, c int, sha256msg1 VecFunc) { VMOVDQU(Mem{Base: dataPtr}.Offset(c*16), msg) PSHUFB(shufMask, msg) VMOVDQA(msg, m) PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg) SHA256RNDS2(msg, state0, state1) PSHUFD(U8(0x0e), msg, msg) SHA256RNDS2(msg, state1, state0) sha256msg1(m, a) } // sha rounds 12 to 59 // // identical with the exception of the final msg op // and the reverse copy(m,msg) in round 12 which is required // after the last data load // refer to Gulley, et al for more information func rounds12to59(m VecPhysical, c int, a, t VecPhysical, sha256msg1, movop VecFunc) { movop(m, msg) PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg) SHA256RNDS2(msg, state0, state1) VMOVDQA(m, m4) PALIGNR(Imm(4), a, m4) PADDD(m4, t) SHA256MSG2(m, t) PSHUFD(Imm(0x0e), msg, msg) SHA256RNDS2(msg, state1, state0) sha256msg1(m, a) }