src/runtime/memmove_arm64.s
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// dstin  R0
// src    R1
// count  R2
// dst    R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data   R6-R17
// tmp1   R14

// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.

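// For orientation, the size dispatch below corresponds roughly to this Go
// sketch (illustration only; copySmall, copyMedium, and copyLarge are
// hypothetical names, not runtime functions):
//
//	func memmove(to, from unsafe.Pointer, n uintptr) {
//		switch {
//		case n == 0:
//			return
//		case n <= 32:
//			copySmall(to, from, n) // head+tail moves, no loop
//		case n <= 128:
//			copyMedium(to, from, n) // overlapping 16-byte load/store pairs
//		default:
//			copyLarge(to, from, n) // software pipelined 64-byte loop
//		}
//	}
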
// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
	CBZ	R2, copy0

	// Small copies: 1..16 bytes
	CMP	$16, R2
	BLE	copy16

	// Large copies
	CMP	$128, R2
	BHI	copy_long
	CMP	$32, R2
	BHI	copy32_128

	// Small copies: 17..32 bytes.
	LDP	(R1), (R6, R7)
	ADD	R1, R2, R4          // R4 points just past the last source byte
	LDP	-16(R4), (R12, R13)
	STP	(R6, R7), (R0)
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	STP	(R12, R13), -16(R5)
	RET
50
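	// The two STPs above may overlap in the middle of the buffer when
	// n < 32. That is harmless: both loads complete before either store,
	// and the overlapping destination bytes receive the same source bytes
	// either way. For example, with n = 20 the two pairs cover bytes
	// 0..15 and 4..19.
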
	// Small copies: 1..16 bytes.
copy16:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	CMP	$8, R2
	BLT	copy7
	MOVD	(R1), R6
	MOVD	-8(R4), R7
	MOVD	R6, (R0)
	MOVD	R7, -8(R5)
	RET

copy7:
	TBZ	$2, R2, copy3 // if bit 2 of the count is clear, n < 4
	MOVWU	(R1), R6
	MOVWU	-4(R4), R7
	MOVW	R6, (R0)
	MOVW	R7, -4(R5)
	RET

copy3:
	TBZ	$1, R2, copy1 // if bit 1 of the count is clear, n < 2
	MOVHU	(R1), R6
	MOVHU	-2(R4), R7
	MOVH	R6, (R0)
	MOVH	R7, -2(R5)
	RET

copy1:
	MOVBU	(R1), R6
	MOVB	R6, (R0)

copy0:
	RET

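	// Every path above uses the same head+tail pattern: for a move width
	// w with w <= n <= 2*w, copy the first w bytes and the last w bytes;
	// the two ranges overlap in the middle and together cover all n bytes.
	// A Go sketch of the 8..16 byte case (illustration only, assuming
	// package encoding/binary; not how the runtime implements it):
	//
	//	// copy8to16 copies n bytes, 8 <= n <= 16, with two 8-byte moves.
	//	func copy8to16(dst, src []byte, n int) {
	//		head := binary.LittleEndian.Uint64(src)       // first 8 bytes
	//		tail := binary.LittleEndian.Uint64(src[n-8:]) // last 8 bytes
	//		binary.LittleEndian.PutUint64(dst, head)
	//		binary.LittleEndian.PutUint64(dst[n-8:], tail)
	//	}
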
	// Medium copies: 33..128 bytes.
copy32_128:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	LDP	(R1), (R6, R7)
	LDP	16(R1), (R8, R9)
	LDP	-32(R4), (R10, R11)
	LDP	-16(R4), (R12, R13)
	CMP	$64, R2
	BHI	copy128
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

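	// For 33..64 bytes, the first and last 32 bytes are all loaded before
	// any store is issued, so the stores may overlap each other (and the
	// source) without corrupting the result.
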
	// Copy 65..128 bytes.
copy128:
	LDP	32(R1), (R14, R15)
	LDP	48(R1), (R16, R17)
	CMP	$96, R2
	BLS	copy96
	// n > 96: also copy bytes [n-64, n-32). The count (R2), dst (R3),
	// src (R1), and srcend (R4) registers are dead at this point and are
	// reused as data registers.
	LDP	-64(R4), (R2, R3)
	LDP	-48(R4), (R1, R4)
	STP	(R2, R3), -64(R5)
	STP	(R1, R4), -48(R5)

copy96:
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R14, R15), 32(R0)
	STP	(R16, R17), 48(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

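	// In both branches above, some store regions overlap when n is below
	// the branch maximum (e.g. 65..96 in copy96); the overlapping
	// destination bytes are written with the same source data each time,
	// so the duplication is harmless.
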
	// Copy more than 128 bytes.
copy_long:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	// Realignment offset registers; left as zero for copies under 1024
	// bytes, which therefore skip realignment.
	MOVD	ZR, R7
	MOVD	ZR, R8

	CMP	$1024, R2
	BLT	backward_check
	// Feature detect to decide how to align: arm64UseAlignedLoads is set
	// during startup CPU detection for cores that run faster with aligned
	// loads; on other cores, aligned stores are preferred.
	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
	CBNZ	R6, use_aligned_loads
	MOVD	R0, R7
	MOVD	R5, R8
	B	backward_check
use_aligned_loads:
	MOVD	R1, R7
	MOVD	R4, R8
	// R7 and R8 are used here for the realignment calculation. In the
	// use_aligned_loads case, R7 is the src pointer and R8 is the srcend
	// pointer, which is used in the backward copy case. When doing
	// aligned stores, R7 is the dst pointer and R8 is the dstend pointer.

backward_check:
	// Use backward copy if there is an overlap.
	SUB	R1, R0, R14
	CBZ	R14, copy0
	CMP	R2, R14
	BCC	copy_long_backward

	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
	LDP	(R1), (R12, R13)    // Load A
	AND	$15, R7, R14        // Calculate the realignment offset
	SUB	R14, R1, R1
	SUB	R14, R0, R3         // move dst back same amount as src
	ADD	R14, R2, R2
	LDP	16(R1), (R6, R7)    // Load B
	STP	(R12, R13), (R0)    // Store A
	LDP	32(R1), (R8, R9)    // Load C
	LDP	48(R1), (R10, R11)  // Load D
	LDP.W	64(R1), (R12, R13)  // Load E
	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end.
	SUBS	$144, R2, R2
	BLS	copy64_from_end

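	// In the loop below, loads stay a full 64-byte iteration ahead of the
	// stores that consume their data (the Load B..E of the next block
	// issue between the Store B..E of the current one). This hides load
	// latency, and it also keeps the forward copy safe when dst is below
	// src and the buffers overlap: no store ever reaches a source byte
	// before it has been loaded.
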
loop64:
	STP	(R6, R7), 16(R3)    // Store B
	LDP	16(R1), (R6, R7)    // Load B (next iteration)
	STP	(R8, R9), 32(R3)    // Store C
	LDP	32(R1), (R8, R9)    // Load C
	STP	(R10, R11), 48(R3)  // Store D
	LDP	48(R1), (R10, R11)  // Load D
	STP.W	(R12, R13), 64(R3)  // Store E
	LDP.W	64(R1), (R12, R13)  // Load E
	SUBS	$64, R2, R2
	BHI	loop64

	// Write the last iteration and copy 64 bytes from the end.
copy64_from_end:
	LDP	-64(R4), (R14, R15) // Load F
	STP	(R6, R7), 16(R3)    // Store B
	LDP	-48(R4), (R6, R7)   // Load G
	STP	(R8, R9), 32(R3)    // Store C
	LDP	-32(R4), (R8, R9)   // Load H
	STP	(R10, R11), 48(R3)  // Store D
	LDP	-16(R4), (R10, R11) // Load I
	STP	(R12, R13), 64(R3)  // Store E
	STP	(R14, R15), -64(R5) // Store F
	STP	(R6, R7), -48(R5)   // Store G
	STP	(R8, R9), -32(R5)   // Store H
	STP	(R10, R11), -16(R5) // Store I
	RET

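	// The 64-byte tail above may re-store bytes that the final loop
	// iteration already wrote; the overlapping destination bytes receive
	// the same source data both times, so the redundant stores are
	// harmless.
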
	// Large backward copy for overlapping copies.
	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
copy_long_backward:
	LDP	-16(R4), (R12, R13) // Load A (last 16 bytes)
	AND	$15, R8, R14        // Calculate the realignment offset
	SUB	R14, R4, R4
	SUB	R14, R2, R2
	LDP	-16(R4), (R6, R7)   // Load B
	STP	(R12, R13), -16(R5) // Store A
	LDP	-32(R4), (R8, R9)   // Load C
	LDP	-48(R4), (R10, R11) // Load D
	LDP.W	-64(R4), (R12, R13) // Load E
	SUB	R14, R5, R5         // move dstend back same amount as srcend
	SUBS	$128, R2, R2
	BLS	copy64_from_start

loop64_backward:
	STP	(R6, R7), -16(R5)   // Store B
	LDP	-16(R4), (R6, R7)   // Load B (next iteration)
	STP	(R8, R9), -32(R5)   // Store C
	LDP	-32(R4), (R8, R9)   // Load C
	STP	(R10, R11), -48(R5) // Store D
	LDP	-48(R4), (R10, R11) // Load D
	STP.W	(R12, R13), -64(R5) // Store E
	LDP.W	-64(R4), (R12, R13) // Load E
	SUBS	$64, R2, R2
	BHI	loop64_backward

	// Write the last iteration and copy 64 bytes from the start.
copy64_from_start:
	LDP	48(R1), (R2, R3)    // Load F (R2 and R3 are dead, reused as data)
	STP	(R6, R7), -16(R5)   // Store B
	LDP	32(R1), (R6, R7)    // Load G
	STP	(R8, R9), -32(R5)   // Store C
	LDP	16(R1), (R8, R9)    // Load H
	STP	(R10, R11), -48(R5) // Store D
	LDP	(R1), (R10, R11)    // Load I
	STP	(R12, R13), -64(R5) // Store E
	STP	(R2, R3), 48(R0)    // Store F
	STP	(R6, R7), 32(R0)    // Store G
	STP	(R8, R9), 16(R0)    // Store H
	STP	(R10, R11), (R0)    // Store I
	RET
239
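// Overlap handling can be observed from ordinary Go: copy on overlapping
// slices goes through this memmove. For example:
//
//	b := make([]byte, 1<<10)
//	copy(b[1:], b) // dst just above src: large backward copy path
//	copy(b, b[1:]) // dst just below src: large forward copy path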