// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// RISCV64 version of md5block.go
// derived from crypto/md5/md5block_arm64.s and crypto/md5/md5block.go

//go:build !purego

#include "textflag.h"

#define LOAD32U(base, offset, tmp, dest) \
	MOVBU	(offset+0*1)(base), dest; \
	MOVBU	(offset+1*1)(base), tmp; \
	SLL	$8, tmp; \
	OR	tmp, dest; \
	MOVBU	(offset+2*1)(base), tmp; \
	SLL	$16, tmp; \
	OR	tmp, dest; \
	MOVBU	(offset+3*1)(base), tmp; \
	SLL	$24, tmp; \
	OR	tmp, dest

#define LOAD64U(base, offset, tmp1, tmp2, dst) \
	LOAD32U(base, offset, tmp1, dst); \
	LOAD32U(base, offset+4, tmp1, tmp2); \
	SLL	$32, tmp2; \
	OR	tmp2, dst
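
// LOAD32U assembles a little-endian 32 bit word from four byte loads so
// that it works for any alignment of base+offset; LOAD64U packs two such
// words into a single 64 bit register, first word in the low half.
//
// Each ROUNDn macro below performs one MD5 step. In Go terms (compare
// crypto/md5/md5block.go) a step computes
//
//	a += Fn(b, c, d) + x + const
//	a = bits.RotateLeft32(a, shift) + b
//
// where Fn is the boolean function for round n:
//
//	round 1: ((c ^ d) & b) ^ d
//	round 2: ((b ^ c) & d) ^ c
//	round 3: b ^ c ^ d
//	round 4: c ^ (b | ^d)
//
// The EVN variants take the message word x from the low 32 bits of its
// register; the ODD variants first shift the upper 32 bits down (SRL $32),
// since two 32 bit message words are packed into each 64 bit register.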

#define ROUND1EVN(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	x, a; \
	ADDW	X23, a; \
	XOR	c, d, X23; \
	AND	b, X23; \
	XOR	d, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND1ODD(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	X23, a; \
	SRL	$32, x, X23; \
	ADDW	X23, a; \
	XOR	c, d, X23; \
	AND	b, X23; \
	XOR	d, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND2EVN(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	x, a; \
	ADDW	X23, a; \
	XOR	b, c, X23; \
	AND	d, X23; \
	XOR	c, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND2ODD(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	X23, a; \
	SRL	$32, x, X23; \
	ADDW	X23, a; \
	XOR	b, c, X23; \
	AND	d, X23; \
	XOR	c, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND3EVN(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	x, a; \
	ADDW	X23, a; \
	XOR	c, d, X23; \
	XOR	b, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND3ODD(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	X23, a; \
	SRL	$32, x, X23; \
	ADDW	X23, a; \
	XOR	c, d, X23; \
	XOR	b, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND4EVN(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	x, a; \
	ADDW	X23, a; \
	ORN	d, b, X23; \
	XOR	c, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND4ODD(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	X23, a; \
	SRL	$32, x, X23; \
	ADDW	X23, a; \
	ORN	d, b, X23; \
	XOR	c, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

// Register use for the block function
//
// X5 - X12  : contain the 16 32 bit data items in the block we're
//             processing. Odd numbered values, e.g., x1, x3 are stored in
//             the upper 32 bits of the register.
// X13 - X16 : a, b, c, d
// X17 - X20 : used to store the old values of a, b, c, d, i.e., aa, bb, cc,
//             dd. X17 and X18 are also used as temporary registers when
//             loading unaligned data.
// X22 : pointer to dig.s
// X23 : temporary register
// X28 : pointer to the first byte beyond the end of p
// X29 : pointer to current 64 byte block of data, initially set to &p[0]
// X30 : temporary register

TEXT	·block(SB),NOSPLIT,$0-32
	MOV	p+8(FP), X29
	MOV	p_len+16(FP), X30
	SRL	$6, X30
	SLL	$6, X30

	BEQZ	X30, zero

	ADD	X29, X30, X28

	MOV	dig+0(FP), X22
	MOVWU	(0*4)(X22), X13	// a = s[0]
	MOVWU	(1*4)(X22), X14	// b = s[1]
	MOVWU	(2*4)(X22), X15	// c = s[2]
	MOVWU	(3*4)(X22), X16	// d = s[3]

loop:
	// Load the 64 bytes of data in x0-15 into 8 64 bit registers, X5-X12.
	// Different paths are taken to load the values depending on whether the
	// buffer is 8 byte aligned or not. We load all the values up front
	// here at the start of the loop to avoid multiple alignment checks and
	// to reduce code size. It takes 10 instructions to load an unaligned
	// 32 bit value and this value will be used 4 times in the main body
	// of the loop below.

	AND	$7, X29, X30
	BEQZ	X30, aligned

	LOAD64U(X29,0, X17, X18, X5)
	LOAD64U(X29,8, X17, X18, X6)
	LOAD64U(X29,16, X17, X18, X7)
	LOAD64U(X29,24, X17, X18, X8)
	LOAD64U(X29,32, X17, X18, X9)
	LOAD64U(X29,40, X17, X18, X10)
	LOAD64U(X29,48, X17, X18, X11)
	LOAD64U(X29,56, X17, X18, X12)
	JMP	block_loaded

aligned:
	MOV	(0*8)(X29), X5
	MOV	(1*8)(X29), X6
	MOV	(2*8)(X29), X7
	MOV	(3*8)(X29), X8
	MOV	(4*8)(X29), X9
	MOV	(5*8)(X29), X10
	MOV	(6*8)(X29), X11
	MOV	(7*8)(X29), X12

block_loaded:
	MOV	X13, X17
	MOV	X14, X18
	MOV	X15, X19
	MOV	X16, X20

	// Some of the hex constants below are too large to fit into a
	// signed 32 bit value. The assembler will handle these
	// constants in a special way to ensure that they are
	// zero extended. Our algorithm is only interested in the
	// bottom 32 bits and doesn't care whether constants are
	// sign or zero extended when moved into 64 bit registers.
	// So we use signed constants instead of hex when bit 31 is
	// set so all constants can be loaded by lui+addi.
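
	// The 64 steps below use the message word order from RFC 1321.
	// Each register packs two message words: X5 holds x0 (low) and
	// x1 (high), X6 holds x2/x3, and so on up to X12, which holds
	// x14/x15. The register operand plus the EVN/ODD macro variant
	// selects the word: round 1 walks x0..x15 in order, round 2
	// starts at x1 and steps by 5 mod 16, round 3 starts at x5 and
	// steps by 3, and round 4 starts at x0 and steps by 7.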

	ROUND1EVN(X13,X14,X15,X16,X5, -680876936, 7); // 0xd76aa478
	ROUND1ODD(X16,X13,X14,X15,X5, -389564586,12); // 0xe8c7b756
	ROUND1EVN(X15,X16,X13,X14,X6, 0x242070db,17); // 0x242070db
	ROUND1ODD(X14,X15,X16,X13,X6, -1044525330,22); // 0xc1bdceee
	ROUND1EVN(X13,X14,X15,X16,X7, -176418897, 7); // 0xf57c0faf
	ROUND1ODD(X16,X13,X14,X15,X7, 0x4787c62a,12); // 0x4787c62a
	ROUND1EVN(X15,X16,X13,X14,X8, -1473231341,17); // 0xa8304613
	ROUND1ODD(X14,X15,X16,X13,X8, -45705983,22); // 0xfd469501
	ROUND1EVN(X13,X14,X15,X16,X9, 0x698098d8, 7); // 0x698098d8
	ROUND1ODD(X16,X13,X14,X15,X9, -1958414417,12); // 0x8b44f7af
	ROUND1EVN(X15,X16,X13,X14,X10, -42063,17); // 0xffff5bb1
	ROUND1ODD(X14,X15,X16,X13,X10,-1990404162,22); // 0x895cd7be
	ROUND1EVN(X13,X14,X15,X16,X11, 0x6b901122, 7); // 0x6b901122
	ROUND1ODD(X16,X13,X14,X15,X11, -40341101,12); // 0xfd987193
	ROUND1EVN(X15,X16,X13,X14,X12,-1502002290,17); // 0xa679438e
	ROUND1ODD(X14,X15,X16,X13,X12, 0x49b40821,22); // 0x49b40821

	ROUND2ODD(X13,X14,X15,X16,X5, -165796510, 5); // 0xf61e2562
	ROUND2EVN(X16,X13,X14,X15,X8, -1069501632, 9); // 0xc040b340
	ROUND2ODD(X15,X16,X13,X14,X10, 0x265e5a51,14); // 0x265e5a51
	ROUND2EVN(X14,X15,X16,X13,X5, -373897302,20); // 0xe9b6c7aa
	ROUND2ODD(X13,X14,X15,X16,X7, -701558691, 5); // 0xd62f105d
	ROUND2EVN(X16,X13,X14,X15,X10, 0x2441453, 9); // 0x2441453
	ROUND2ODD(X15,X16,X13,X14,X12, -660478335,14); // 0xd8a1e681
	ROUND2EVN(X14,X15,X16,X13,X7, -405537848,20); // 0xe7d3fbc8
	ROUND2ODD(X13,X14,X15,X16,X9, 0x21e1cde6, 5); // 0x21e1cde6
	ROUND2EVN(X16,X13,X14,X15,X12,-1019803690, 9); // 0xc33707d6
	ROUND2ODD(X15,X16,X13,X14,X6, -187363961,14); // 0xf4d50d87
	ROUND2EVN(X14,X15,X16,X13,X9, 0x455a14ed,20); // 0x455a14ed
	ROUND2ODD(X13,X14,X15,X16,X11,-1444681467, 5); // 0xa9e3e905
	ROUND2EVN(X16,X13,X14,X15,X6, -51403784, 9); // 0xfcefa3f8
	ROUND2ODD(X15,X16,X13,X14,X8, 0x676f02d9,14); // 0x676f02d9
	ROUND2EVN(X14,X15,X16,X13,X11,-1926607734,20); // 0x8d2a4c8a

	ROUND3ODD(X13,X14,X15,X16,X7, -378558, 4); // 0xfffa3942
	ROUND3EVN(X16,X13,X14,X15,X9, -2022574463,11); // 0x8771f681
	ROUND3ODD(X15,X16,X13,X14,X10, 0x6d9d6122,16); // 0x6d9d6122
	ROUND3EVN(X14,X15,X16,X13,X12, -35309556,23); // 0xfde5380c
	ROUND3ODD(X13,X14,X15,X16,X5, -1530992060, 4); // 0xa4beea44
	ROUND3EVN(X16,X13,X14,X15,X7, 0x4bdecfa9,11); // 0x4bdecfa9
	ROUND3ODD(X15,X16,X13,X14,X8, -155497632,16); // 0xf6bb4b60
	ROUND3EVN(X14,X15,X16,X13,X10,-1094730640,23); // 0xbebfbc70
	ROUND3ODD(X13,X14,X15,X16,X11, 0x289b7ec6, 4); // 0x289b7ec6
	ROUND3EVN(X16,X13,X14,X15,X5, -358537222,11); // 0xeaa127fa
	ROUND3ODD(X15,X16,X13,X14,X6, -722521979,16); // 0xd4ef3085
	ROUND3EVN(X14,X15,X16,X13,X8, 0x4881d05,23); // 0x4881d05
	ROUND3ODD(X13,X14,X15,X16,X9, -640364487, 4); // 0xd9d4d039
	ROUND3EVN(X16,X13,X14,X15,X11, -421815835,11); // 0xe6db99e5
	ROUND3ODD(X15,X16,X13,X14,X12, 0x1fa27cf8,16); // 0x1fa27cf8
	ROUND3EVN(X14,X15,X16,X13,X6, -995338651,23); // 0xc4ac5665

	ROUND4EVN(X13,X14,X15,X16,X5, -198630844, 6); // 0xf4292244
	ROUND4ODD(X16,X13,X14,X15,X8, 0x432aff97,10); // 0x432aff97
	ROUND4EVN(X15,X16,X13,X14,X12,-1416354905,15); // 0xab9423a7
	ROUND4ODD(X14,X15,X16,X13,X7, -57434055,21); // 0xfc93a039
	ROUND4EVN(X13,X14,X15,X16,X11, 0x655b59c3, 6); // 0x655b59c3
	ROUND4ODD(X16,X13,X14,X15,X6, -1894986606,10); // 0x8f0ccc92
	ROUND4EVN(X15,X16,X13,X14,X10,-1051523,15); // 0xffeff47d
	ROUND4ODD(X14,X15,X16,X13,X5, -2054922799,21); // 0x85845dd1
	ROUND4EVN(X13,X14,X15,X16,X9, 0x6fa87e4f, 6); // 0x6fa87e4f
	ROUND4ODD(X16,X13,X14,X15,X12, -30611744,10); // 0xfe2ce6e0
	ROUND4EVN(X15,X16,X13,X14,X8, -1560198380,15); // 0xa3014314
	ROUND4ODD(X14,X15,X16,X13,X11, 0x4e0811a1,21); // 0x4e0811a1
	ROUND4EVN(X13,X14,X15,X16,X7, -145523070, 6); // 0xf7537e82
	ROUND4ODD(X16,X13,X14,X15,X10,-1120210379,10); // 0xbd3af235
	ROUND4EVN(X15,X16,X13,X14,X6, 0x2ad7d2bb,15); // 0x2ad7d2bb
	ROUND4ODD(X14,X15,X16,X13,X9, -343485551,21); // 0xeb86d391

	ADDW	X17, X13
	ADDW	X18, X14
	ADDW	X19, X15
	ADDW	X20, X16

	ADD	$64, X29
	BNE	X28, X29, loop

	MOVW	X13, (0*4)(X22)
	MOVW	X14, (1*4)(X22)
	MOVW	X15, (2*4)(X22)
	MOVW	X16, (3*4)(X22)

zero:
	RET