src/crypto/md5/md5block_riscv64.s
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// RISCV64 version of md5block.go
// derived from crypto/md5/md5block_arm64.s and crypto/md5/md5block.go

//go:build !purego

#include "textflag.h"

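// LOAD32U assembles a 32-bit little-endian value from four byte loads, which
// is safe regardless of the alignment of base+offset. In Go terms it is
// roughly (a sketch, with p standing for the bytes at base+offset):
//
//	dest = uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
//
// LOAD64U uses it twice to place two consecutive 32-bit words in the low and
// high halves of a single 64-bit register.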
#define LOAD32U(base, offset, tmp, dest) \
	MOVBU (offset+0*1)(base), dest; \
	MOVBU (offset+1*1)(base), tmp; \
	SLL $8, tmp; \
	OR tmp, dest; \
	MOVBU (offset+2*1)(base), tmp; \
	SLL $16, tmp; \
	OR tmp, dest; \
	MOVBU (offset+3*1)(base), tmp; \
	SLL $24, tmp; \
	OR tmp, dest

#define LOAD64U(base, offset, tmp1, tmp2, dst) \
	LOAD32U(base, offset, tmp1, dst); \
	LOAD32U(base, offset+4, tmp1, tmp2); \
	SLL $32, tmp2; \
	OR tmp2, dst

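// Each ROUNDn macro performs a single MD5 step:
//
//	a = b + rotl32(a + fn(b, c, d) + x + const, shift)
//
// where fn is the boolean function for round n. The EVN variants take the
// message word x from the low 32 bits of the register, the ODD variants from
// the high 32 bits. Round 1 uses
//
//	F(b, c, d) = (b AND c) OR (NOT b AND d)
//
// computed below in the equivalent form d XOR (b AND (c XOR d)).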
#define ROUND1EVN(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW x, a; \
	ADDW X23, a; \
	XOR c, d, X23; \
	AND b, X23; \
	XOR d, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

#define ROUND1ODD(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW X23, a; \
	SRL $32, x, X23; \
	ADDW X23, a; \
	XOR c, d, X23; \
	AND b, X23; \
	XOR d, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

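// Round 2 uses
//
//	G(b, c, d) = (b AND d) OR (c AND NOT d)
//
// computed below in the equivalent form c XOR (d AND (b XOR c)).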
#define ROUND2EVN(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW x, a; \
	ADDW X23, a; \
	XOR b, c, X23; \
	AND d, X23; \
	XOR c, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

#define ROUND2ODD(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW X23, a; \
	SRL $32, x, X23; \
	ADDW X23, a; \
	XOR b, c, X23; \
	AND d, X23; \
	XOR c, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

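// Round 3 uses
//
//	H(b, c, d) = b XOR c XOR d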
#define ROUND3EVN(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW x, a; \
	ADDW X23, a; \
	XOR c, d, X23; \
	XOR b, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

#define ROUND3ODD(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW X23, a; \
	SRL $32, x, X23; \
	ADDW X23, a; \
	XOR c, d, X23; \
	XOR b, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

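// Round 4 uses
//
//	I(b, c, d) = c XOR (b OR NOT d)
//
// which maps directly onto ORN followed by XOR.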
#define ROUND4EVN(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW x, a; \
	ADDW X23, a; \
	ORN d, b, X23; \
	XOR c, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

#define ROUND4ODD(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW X23, a; \
	SRL $32, x, X23; \
	ADDW X23, a; \
	ORN d, b, X23; \
	XOR c, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

// Register use for the block function
//
// X5 - X12  : contain the sixteen 32-bit data items in the block we're
//             processing. Odd numbered values, e.g., x1 and x3, are stored
//             in the upper 32 bits of the register.
// X13 - X16 : a, b, c, d
// X17 - X20 : used to store the old values of a, b, c, d, i.e., aa, bb, cc,
//             dd. X17 and X18 are also used as temporary registers when
//             loading unaligned data.
// X22       : pointer to dig.s
// X23       : temporary register
// X28       : pointer to the first byte beyond the end of p
// X29       : pointer to the current 64-byte block of data, initially set to
//             &p[0]
// X30       : temporary register

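// The Go prototype implemented here (implied by the FP offsets and the
// $0-32 frame size: an 8-byte pointer plus a 24-byte slice header) is:
//
//	func block(dig *digest, p []byte)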
TEXT ·block(SB),NOSPLIT,$0-32
	MOV p+8(FP), X29
	MOV p_len+16(FP), X30
	SRL $6, X30
	SLL $6, X30	// round p_len down to a multiple of the 64-byte block size
	BEQZ X30, zero	// no full blocks to process

	ADD X29, X30, X28	// X28 points just past the last full block

	MOV dig+0(FP), X22
	MOVWU (0*4)(X22), X13 // a = s[0]
	MOVWU (1*4)(X22), X14 // b = s[1]
	MOVWU (2*4)(X22), X15 // c = s[2]
	MOVWU (3*4)(X22), X16 // d = s[3]

loop:

	// Load the 64 bytes of data in x0-x15 into eight 64-bit registers,
	// X5-X12. Different paths are taken to load the values depending on
	// whether the buffer is 8-byte aligned or not. We load all the values
	// up front here at the start of the loop to avoid multiple alignment
	// checks and to reduce code size. It takes 10 instructions to load an
	// unaligned 32-bit value and this value will be used 4 times in the
	// main body of the loop below.

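	// After the loads below each register holds two message words with the
	// even-numbered word in the low 32 bits: X5 = x1:x0, X6 = x3:x2,
	// X7 = x5:x4, X8 = x7:x6, X9 = x9:x8, X10 = x11:x10, X11 = x13:x12,
	// X12 = x15:x14. The ROUND*ODD macros shift the odd word down from the
	// upper half before using it.
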
	AND $7, X29, X30
	BEQZ X30, aligned

	LOAD64U(X29, 0, X17, X18, X5)
	LOAD64U(X29, 8, X17, X18, X6)
	LOAD64U(X29, 16, X17, X18, X7)
	LOAD64U(X29, 24, X17, X18, X8)
	LOAD64U(X29, 32, X17, X18, X9)
	LOAD64U(X29, 40, X17, X18, X10)
	LOAD64U(X29, 48, X17, X18, X11)
	LOAD64U(X29, 56, X17, X18, X12)
	JMP block_loaded

aligned:
	MOV (0*8)(X29), X5
	MOV (1*8)(X29), X6
	MOV (2*8)(X29), X7
	MOV (3*8)(X29), X8
	MOV (4*8)(X29), X9
	MOV (5*8)(X29), X10
	MOV (6*8)(X29), X11
	MOV (7*8)(X29), X12

block_loaded:
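	// Save the incoming state so it can be added back in after the 64
	// rounds: aa, bb, cc, dd = a, b, c, d.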
	MOV X13, X17
	MOV X14, X18
	MOV X15, X19
	MOV X16, X20

	// Some of the hex constants below are too large to fit into a
	// signed 32-bit value. The assembler will handle these
	// constants in a special way to ensure that they are
	// zero extended. Our algorithm is only interested in the
	// bottom 32 bits and doesn't care whether constants are
	// sign or zero extended when moved into 64-bit registers.
	// So we use signed constants instead of hex when bit 31 is
	// set so that all constants can be loaded by lui+addi.
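	// For example, the first round 1 constant 0xd76aa478 has bit 31 set,
	// so it is written as the signed value -680876936; both have the same
	// low 32 bits, which is all the ADDW instructions care about.
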
	ROUND1EVN(X13,X14,X15,X16,X5, -680876936, 7); // 0xd76aa478
	ROUND1ODD(X16,X13,X14,X15,X5, -389564586,12); // 0xe8c7b756
	ROUND1EVN(X15,X16,X13,X14,X6, 0x242070db,17); // 0x242070db
	ROUND1ODD(X14,X15,X16,X13,X6, -1044525330,22); // 0xc1bdceee
	ROUND1EVN(X13,X14,X15,X16,X7, -176418897, 7); // 0xf57c0faf
	ROUND1ODD(X16,X13,X14,X15,X7, 0x4787c62a,12); // 0x4787c62a
	ROUND1EVN(X15,X16,X13,X14,X8, -1473231341,17); // 0xa8304613
	ROUND1ODD(X14,X15,X16,X13,X8, -45705983,22); // 0xfd469501
	ROUND1EVN(X13,X14,X15,X16,X9, 0x698098d8, 7); // 0x698098d8
	ROUND1ODD(X16,X13,X14,X15,X9, -1958414417,12); // 0x8b44f7af
	ROUND1EVN(X15,X16,X13,X14,X10, -42063,17); // 0xffff5bb1
	ROUND1ODD(X14,X15,X16,X13,X10,-1990404162,22); // 0x895cd7be
	ROUND1EVN(X13,X14,X15,X16,X11, 0x6b901122, 7); // 0x6b901122
	ROUND1ODD(X16,X13,X14,X15,X11, -40341101,12); // 0xfd987193
	ROUND1EVN(X15,X16,X13,X14,X12,-1502002290,17); // 0xa679438e
	ROUND1ODD(X14,X15,X16,X13,X12, 0x49b40821,22); // 0x49b40821

	ROUND2ODD(X13,X14,X15,X16,X5, -165796510, 5); // f61e2562
	ROUND2EVN(X16,X13,X14,X15,X8, -1069501632, 9); // c040b340
	ROUND2ODD(X15,X16,X13,X14,X10, 0x265e5a51,14); // 265e5a51
	ROUND2EVN(X14,X15,X16,X13,X5, -373897302,20); // e9b6c7aa
	ROUND2ODD(X13,X14,X15,X16,X7, -701558691, 5); // d62f105d
	ROUND2EVN(X16,X13,X14,X15,X10, 0x2441453, 9); // 2441453
	ROUND2ODD(X15,X16,X13,X14,X12, -660478335,14); // d8a1e681
	ROUND2EVN(X14,X15,X16,X13,X7, -405537848,20); // e7d3fbc8
	ROUND2ODD(X13,X14,X15,X16,X9, 0x21e1cde6, 5); // 21e1cde6
	ROUND2EVN(X16,X13,X14,X15,X12,-1019803690, 9); // c33707d6
	ROUND2ODD(X15,X16,X13,X14,X6, -187363961,14); // f4d50d87
	ROUND2EVN(X14,X15,X16,X13,X9, 0x455a14ed,20); // 455a14ed
	ROUND2ODD(X13,X14,X15,X16,X11,-1444681467, 5); // a9e3e905
	ROUND2EVN(X16,X13,X14,X15,X6, -51403784, 9); // fcefa3f8
	ROUND2ODD(X15,X16,X13,X14,X8, 0x676f02d9,14); // 676f02d9
	ROUND2EVN(X14,X15,X16,X13,X11,-1926607734,20); // 8d2a4c8a

	ROUND3ODD(X13,X14,X15,X16,X7, -378558, 4); // fffa3942
	ROUND3EVN(X16,X13,X14,X15,X9, -2022574463,11); // 8771f681
	ROUND3ODD(X15,X16,X13,X14,X10, 0x6d9d6122,16); // 6d9d6122
	ROUND3EVN(X14,X15,X16,X13,X12, -35309556,23); // fde5380c
	ROUND3ODD(X13,X14,X15,X16,X5, -1530992060, 4); // a4beea44
	ROUND3EVN(X16,X13,X14,X15,X7, 0x4bdecfa9,11); // 4bdecfa9
	ROUND3ODD(X15,X16,X13,X14,X8, -155497632,16); // f6bb4b60
	ROUND3EVN(X14,X15,X16,X13,X10,-1094730640,23); // bebfbc70
	ROUND3ODD(X13,X14,X15,X16,X11, 0x289b7ec6, 4); // 289b7ec6
	ROUND3EVN(X16,X13,X14,X15,X5, -358537222,11); // eaa127fa
	ROUND3ODD(X15,X16,X13,X14,X6, -722521979,16); // d4ef3085
	ROUND3EVN(X14,X15,X16,X13,X8, 0x4881d05,23); // 4881d05
	ROUND3ODD(X13,X14,X15,X16,X9, -640364487, 4); // d9d4d039
	ROUND3EVN(X16,X13,X14,X15,X11, -421815835,11); // e6db99e5
	ROUND3ODD(X15,X16,X13,X14,X12, 0x1fa27cf8,16); // 1fa27cf8
	ROUND3EVN(X14,X15,X16,X13,X6, -995338651,23); // c4ac5665

	ROUND4EVN(X13,X14,X15,X16,X5, -198630844, 6); // f4292244
	ROUND4ODD(X16,X13,X14,X15,X8, 0x432aff97,10); // 432aff97
	ROUND4EVN(X15,X16,X13,X14,X12,-1416354905,15); // ab9423a7
	ROUND4ODD(X14,X15,X16,X13,X7, -57434055,21); // fc93a039
	ROUND4EVN(X13,X14,X15,X16,X11, 0x655b59c3, 6); // 655b59c3
	ROUND4ODD(X16,X13,X14,X15,X6, -1894986606,10); // 8f0ccc92
	ROUND4EVN(X15,X16,X13,X14,X10, -1051523,15); // ffeff47d
	ROUND4ODD(X14,X15,X16,X13,X5, -2054922799,21); // 85845dd1
	ROUND4EVN(X13,X14,X15,X16,X9, 0x6fa87e4f, 6); // 6fa87e4f
	ROUND4ODD(X16,X13,X14,X15,X12, -30611744,10); // fe2ce6e0
	ROUND4EVN(X15,X16,X13,X14,X8, -1560198380,15); // a3014314
	ROUND4ODD(X14,X15,X16,X13,X11, 0x4e0811a1,21); // 4e0811a1
	ROUND4EVN(X13,X14,X15,X16,X7, -145523070, 6); // f7537e82
	ROUND4ODD(X16,X13,X14,X15,X10,-1120210379,10); // bd3af235
	ROUND4EVN(X15,X16,X13,X14,X6, 0x2ad7d2bb,15); // 2ad7d2bb
	ROUND4ODD(X14,X15,X16,X13,X9, -343485551,21); // eb86d391

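	// Add the saved state back in: a += aa, b += bb, c += cc, d += dd.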
	ADDW X17, X13
	ADDW X18, X14
	ADDW X19, X15
	ADDW X20, X16

	ADD $64, X29	// advance to the next 64-byte block
	BNE X28, X29, loop

	// Write the updated state back to dig.s.
	MOVW X13, (0*4)(X22)
	MOVW X14, (1*4)(X22)
	MOVW X15, (2*4)(X22)
	MOVW X16, (3*4)(X22)

zero:
	RET