// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

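// chachaConst is the ChaCha constant "expand 32-byte k" as four little-endian 32-bit words.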
DATA ·chachaConst+0x00(SB)/4, $0x61707865
DATA ·chachaConst+0x04(SB)/4, $0x3320646e
DATA ·chachaConst+0x08(SB)/4, $0x79622d32
DATA ·chachaConst+0x0c(SB)/4, $0x6b206574
GLOBL ·chachaConst(SB), NOPTR|RODATA, $32

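// chachaIncRot holds the per-block counter increments 0, 1, 2, and 3,
// one for each of the four blocks computed in parallel.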
DATA ·chachaIncRot+0x00(SB)/4, $0x00000000
DATA ·chachaIncRot+0x04(SB)/4, $0x00000001
DATA ·chachaIncRot+0x08(SB)/4, $0x00000002
DATA ·chachaIncRot+0x0c(SB)/4, $0x00000003
GLOBL ·chachaIncRot(SB), NOPTR|RODATA, $32

// QR is the ChaCha8 quarter-round on a, b, c, and d.
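// VROTRW rotates right, so rotating right by 16, 20, 24, and 25
// is the same as ChaCha's left rotations by 16, 12, 8, and 7.
// A rough scalar sketch of one quarter-round, for reference only
// (rotl is a 32-bit left rotation):
//	a += b; d ^= a; d = rotl(d, 16)
//	c += d; b ^= c; b = rotl(b, 12)
//	a += b; d ^= a; d = rotl(d, 8)
//	c += d; b ^= c; b = rotl(b, 7)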
#define QR(a, b, c, d) \
	VADDW	a, b, a; \
	VXORV	d, a, d; \
	VROTRW	$16, d; \
	VADDW	c, d, c; \
	VXORV	b, c, b; \
	VROTRW	$20, b; \
	VADDW	a, b, a; \
	VXORV	d, a, d; \
	VROTRW	$24, d; \
	VADDW	c, d, c; \
	VXORV	b, c, b; \
	VROTRW	$25, b

// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32)
TEXT ·block<ABIInternal>(SB), NOSPLIT, $0
	// seed in R4
	// blocks in R5
	// counter in R6

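	// Use the LSX vector path when it is available; otherwise fall
	// back to the generic implementation.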
	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
	BNE	R7, lsx_chacha8
	JMP	·block_generic<ABIInternal>(SB)
	RET

lsx_chacha8:
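	// The four blocks are computed in parallel, one per 32-bit lane:
	// vector register Vn holds state word n of all four blocks.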
	MOVV	$·chachaConst(SB), R10
	MOVV	$·chachaIncRot(SB), R11

	// load constants, broadcasting each 32-bit word into all lanes of V0-V3
	// VLDREPL.W $0, R10, V0
	WORD	$0x30200140
	// VLDREPL.W $1, R10, V1
	WORD	$0x30200541
	// VLDREPL.W $2, R10, V2
	WORD	$0x30200942
	// VLDREPL.W $3, R10, V3
	WORD	$0x30200d43

	// load the four 32-bit counter increments from ·chachaIncRot;
	// they are added to the counter below
	VMOVQ	(R11), V30

	// load seed, broadcasting each 32-bit word into all lanes of V4-V11
	// VLDREPL.W $0, R4, V4
	WORD	$0x30200084
	// VLDREPL.W $1, R4, V5
	WORD	$0x30200485
	// VLDREPL.W $2, R4, V6
	WORD	$0x30200886
	// VLDREPL.W $3, R4, V7
	WORD	$0x30200c87
	// VLDREPL.W $4, R4, V8
	WORD	$0x30201088
	// VLDREPL.W $5, R4, V9
	WORD	$0x30201489
	// VLDREPL.W $6, R4, V10
	WORD	$0x3020188a
	// VLDREPL.W $7, R4, V11
	WORD	$0x30201c8b

	// broadcast the counter into all lanes of V12 and add the
	// per-block increments, so lane i uses counter+i
	VMOVQ	R6, V12.W4
	VADDW	V12, V30, V12

	// zeros for the remaining three state words (no nonce is used)
	VXORV	V13, V13, V13
	VXORV	V14, V14, V14
	VXORV	V15, V15, V15

	// save seed state for adding back later
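	// (VORV with the zeroed V13 is just a vector register move)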
	VORV	V4, V13, V20
	VORV	V5, V13, V21
	VORV	V6, V13, V22
	VORV	V7, V13, V23
	VORV	V8, V13, V24
	VORV	V9, V13, V25
	VORV	V10, V13, V26
	VORV	V11, V13, V27

	// 4 iterations. Each iteration is 8 quarter-rounds: a column round
	// followed by a diagonal round, for 8 ChaCha rounds in total.
	MOVV	$4, R7
loop:
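	// column round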
	QR(V0, V4, V8, V12)
	QR(V1, V5, V9, V13)
	QR(V2, V6, V10, V14)
	QR(V3, V7, V11, V15)

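	// diagonal round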
	QR(V0, V5, V10, V15)
	QR(V1, V6, V11, V12)
	QR(V2, V7, V8, V13)
	QR(V3, V4, V9, V14)

	SUBV	$1, R7
	BNE	R7, R0, loop

	// add seed back
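	// (only the seed words get the final state addition; the constant,
	// counter, and zero words are stored as permuted)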
	VADDW	V4, V20, V4
	VADDW	V5, V21, V5
	VADDW	V6, V22, V6
	VADDW	V7, V23, V7
	VADDW	V8, V24, V8
	VADDW	V9, V25, V9
	VADDW	V10, V26, V10
	VADDW	V11, V27, V11

	// store blocks back to output buffer
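	// (each vector holds one state word for all four blocks, so word n
	// of all four blocks is stored together at offset n*16)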
	VMOVQ	V0, (R5)
	VMOVQ	V1, 16(R5)
	VMOVQ	V2, 32(R5)
	VMOVQ	V3, 48(R5)
	VMOVQ	V4, 64(R5)
	VMOVQ	V5, 80(R5)
	VMOVQ	V6, 96(R5)
	VMOVQ	V7, 112(R5)
	VMOVQ	V8, 128(R5)
	VMOVQ	V9, 144(R5)
	VMOVQ	V10, 160(R5)
	VMOVQ	V11, 176(R5)
	VMOVQ	V12, 192(R5)
	VMOVQ	V13, 208(R5)
	VMOVQ	V14, 224(R5)
	VMOVQ	V15, 240(R5)

	RET