// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// to            R4
// from          R5
// n (aka count) R6
// to-end        R7
// from-end      R8
// data          R11-R18
// tmp           R9

// Algorithm:
//
// The memory alignment check is only performed for copy sizes greater
// than 64 bytes, to minimize overhead.
//
// When the copy size is <= 64 bytes, jump to label tail, which selects
// the appropriate case according to the copy size and copies directly.
// Based on the common memory access instructions of loong64, the
// currently implemented cases are:
// move_0, move_1, move_2, move_3, move_4, move_5through7, move_8,
// move_9through16, move_17through32, move_33through64
//
// When the copy size is > 64 bytes, use destination-aligned copying,
// with the following strategy to copy in 3 parts:
// 1. Head: align the destination
// 2. Body: a 64-byte copy loop
// 3. Tail: process the remaining part (<= 64 bytes)
//
// forward:
//
//  Dst            NewDst                           Dstend
//   |               |<----count after correction---->|
//   |<-------------count before correction---------->|
//   |<--8-(Dst&7)-->|               |<---64 bytes--->|
//   +------------------------------------------------+
//   |      Head     |      Body     |      Tail      |
//   +---------------+---------------+----------------+
//   NewDst = Dst - (Dst & 7) + 8
//   count = count - 8 + (Dst & 7)
//   Src = Src - (Dst & 7) + 8
//
// backward:
//
//  Dst                              NewDstend         Dstend
//   |<-----count after correction------>|                |
//   |<------------count before correction--------------->|
//   |<---64 bytes--->|                  |<---Dstend&7--->|
//   +----------------------------------------------------+
//   |      Tail      |       Body       |      Head      |
//   +----------------+------------------+----------------+
//   NewDstend = Dstend - (Dstend & 7)
//   count = count - (Dstend & 7)
//   Srcend = Srcend - (Dstend & 7)

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
	BEQ	R4, R5, move_0	// to == from, nothing to do
	BEQ	R6, move_0	// n == 0, nothing to do

	ADDV	R4, R6, R7	// to-end pointer
	ADDV	R5, R6, R8	// from-end pointer

tail:
	// copy size <= 64 bytes: copy directly, no alignment check

	// < 2 bytes
	SGTU	$2, R6, R9
	BNE	R9, move_1

	// < 3 bytes
	SGTU	$3, R6, R9
	BNE	R9, move_2

	// < 4 bytes
	SGTU	$4, R6, R9
	BNE	R9, move_3

	// < 5 bytes
	SGTU	$5, R6, R9
	BNE	R9, move_4

	// >= 5 bytes and < 8 bytes
	SGTU	$8, R6, R9
	BNE	R9, move_5through7

	// < 9 bytes
	SGTU	$9, R6, R9
	BNE	R9, move_8

	// >= 9 bytes and < 17 bytes
	SGTU	$17, R6, R9
	BNE	R9, move_9through16

	// >= 17 bytes and < 33 bytes
	SGTU	$33, R6, R9
	BNE	R9, move_17through32

	// >= 33 bytes and < 65 bytes
	SGTU	$65, R6, R9
	BNE	R9, move_33through64

	// if (dst > src) && (dst < src + count), the regions overlap and
	// a forward copy would clobber unread source bytes: jump to backward;
	// else, jump to forward
	BGEU	R5, R4, forward
	ADDV	R5, R6, R10
	BLTU	R4, R10, backward

forward:
	AND	$7, R4, R9	// dst & 7
	BEQ	R9, body

head:
	MOVV	$8, R10
	SUBV	R9, R10	// head = 8 - (dst & 7)
	MOVB	(R5), R11
	SUBV	$1, R10
	ADDV	$1, R5
	MOVB	R11, (R4)
	ADDV	$1, R4
	BNE	R10, -5(PC)	// copy the next byte until dst is 8-byte aligned
	ADDV	R9, R6
	ADDV	$-8, R6	// newcount = count + (dst & 7) - 8
	// if newcount < 65 bytes, copying with move_33through64 is enough
	SGTU	$65, R6, R9
	BNE	R9, move_33through64

body:
	MOVV	(R5), R11
	MOVV	8(R5), R12
	MOVV	16(R5), R13
	MOVV	24(R5), R14
	MOVV	32(R5), R15
	MOVV	40(R5), R16
	MOVV	48(R5), R17
	MOVV	56(R5), R18
	MOVV	R11, (R4)
	MOVV	R12, 8(R4)
	MOVV	R13, 16(R4)
	MOVV	R14, 24(R4)
	MOVV	R15, 32(R4)
	MOVV	R16, 40(R4)
	MOVV	R17, 48(R4)
	MOVV	R18, 56(R4)
	ADDV	$-64, R6
	ADDV	$64, R4
	ADDV	$64, R5
	SGTU	$64, R6, R9
	// if the remaining part is >= 64 bytes, jump back to body
	BEQ	R9, body
	// if the remaining part is 0 bytes, use move_0 to return
	BEQ	R6, move_0
	// if the remaining part is in (0, 63] bytes, jump to tail
	JMP	tail
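
// To make the forward correction concrete, here is a worked example
// (illustrative values, not part of the original comments): suppose
// dst = 0x1003 and count = 100. Then dst&7 = 3, so head copies
// 8-3 = 5 bytes one at a time, after which dst = 0x1008 is 8-byte
// aligned and newcount = 100 + 3 - 8 = 95. body then copies one
// 64-byte block, leaving 31 bytes, and tail dispatches that
// remainder to move_17through32.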

// The backward copy algorithm is the same as the forward copy,
// except for the direction.
backward:
	AND	$7, R7, R9	// dstend & 7
	BEQ	R9, b_body

b_head:
	MOVV	-8(R8), R11	// R11 = the last 8 bytes of src (unaligned load)
	SUBV	R9, R6	// newcount = count - (dstend & 7)
	SUBV	R9, R8	// newsrcend = srcend - (dstend & 7)
	MOVV	-8(R8), R12	// R12 = the 8 bytes ending at the aligned newsrcend
	MOVV	R11, -8(R7)
	SUBV	R9, R7	// newdstend = dstend - (dstend & 7)
	MOVV	R12, -8(R7)
	SUBV	$8, R6
	SUBV	$8, R7
	SUBV	$8, R8
	SGTU	$65, R6, R9
	BNE	R9, move_33through64

b_body:
	MOVV	-8(R8), R11
	MOVV	-16(R8), R12
	MOVV	-24(R8), R13
	MOVV	-32(R8), R14
	MOVV	-40(R8), R15
	MOVV	-48(R8), R16
	MOVV	-56(R8), R17
	MOVV	-64(R8), R18
	MOVV	R11, -8(R7)
	MOVV	R12, -16(R7)
	MOVV	R13, -24(R7)
	MOVV	R14, -32(R7)
	MOVV	R15, -40(R7)
	MOVV	R16, -48(R7)
	MOVV	R17, -56(R7)
	MOVV	R18, -64(R7)
	ADDV	$-64, R6
	ADDV	$-64, R7
	ADDV	$-64, R8
	SGTU	$64, R6, R9
	BEQ	R9, b_body
	BEQ	R6, move_0
	JMP	tail

move_0:
	RET

move_1:
	MOVB	(R5), R11
	MOVB	R11, (R4)
	RET

move_2:
	MOVH	(R5), R11
	MOVH	R11, (R4)
	RET

move_3:
	MOVH	(R5), R11
	MOVB	-1(R8), R12
	MOVH	R11, (R4)
	MOVB	R12, -1(R7)
	RET

move_4:
	MOVW	(R5), R11
	MOVW	R11, (R4)
	RET

move_5through7:
	MOVW	(R5), R11
	MOVW	-4(R8), R12
	MOVW	R11, (R4)
	MOVW	R12, -4(R7)
	RET

move_8:
	MOVV	(R5), R11
	MOVV	R11, (R4)
	RET

move_9through16:
	MOVV	(R5), R11
	MOVV	-8(R8), R12
	MOVV	R11, (R4)
	MOVV	R12, -8(R7)
	RET

move_17through32:
	MOVV	(R5), R11
	MOVV	8(R5), R12
	MOVV	-16(R8), R13
	MOVV	-8(R8), R14
	MOVV	R11, (R4)
	MOVV	R12, 8(R4)
	MOVV	R13, -16(R7)
	MOVV	R14, -8(R7)
	RET

move_33through64:
	MOVV	(R5), R11
	MOVV	8(R5), R12
	MOVV	16(R5), R13
	MOVV	24(R5), R14
	MOVV	-32(R8), R15
	MOVV	-24(R8), R16
	MOVV	-16(R8), R17
	MOVV	-8(R8), R18
	MOVV	R11, (R4)
	MOVV	R12, 8(R4)
	MOVV	R13, 16(R4)
	MOVV	R14, 24(R4)
	MOVV	R15, -32(R7)
	MOVV	R16, -24(R7)
	MOVV	R17, -16(R7)
	MOVV	R18, -8(R7)
	RET
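
// Note on the tail cases (added commentary, not in the original
// source): move_3, move_5through7, move_9through16, move_17through32
// and move_33through64 all use the same overlapping-window trick: one
// load/store pair anchored at the start of the buffer (R5/R4) plus one
// anchored at the end (R8/R7), so every size in the range is covered
// with a fixed instruction count. The two windows may overlap; this is
// safe because all loads are issued before any store. For example,
// with n = 6, move_5through7 writes bytes 0-3 via the first MOVW pair
// and bytes 2-5 via the second, and the 2-byte overlap is harmless.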