Source file src/cmd/internal/obj/loong64/doc.go

     1  // Copyright 2024 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  /*
     6  Package loong64 implements a LoongArch64 assembler. Go assembly syntax is different from
     7  GNU LoongArch64 syntax, but we can still follow the general rules to map between them.
     8  
     9  # Instruction mnemonics mapping rules
    10  
    11  1. Bit widths represented by various instruction suffixes and prefixes
    12  V (vlong)     = 64 bit
    13  WU (word)     = 32 bit unsigned
    14  W (word)      = 32 bit
    15  H (half word) = 16 bit
    16  HU            = 16 bit unsigned
    17  B (byte)      = 8 bit
    18  BU            = 8 bit unsigned
    19  F (float)     = 32 bit float
    20  D (double)    = 64 bit float
    21  
    22  V  (LSX)      = 128 bit
    23  XV (LASX)     = 256 bit
    24  
    25  Examples:
    26  
    27  	MOVB  (R2), R3  // Load 8 bit memory data into R3 register
    28  	MOVH  (R2), R3  // Load 16 bit memory data into R3 register
    29  	MOVW  (R2), R3  // Load 32 bit memory data into R3 register
    30  	MOVV  (R2), R3  // Load 64 bit memory data into R3 register
    31  	VMOVQ  (R2), V1 // Load 128 bit memory data into V1 register
    32  	XVMOVQ (R2), X1 // Load 256 bit memory data into X1 register
    33  
    34  2. Align directive
    35  Go asm supports the PCALIGN directive, which indicates that the next instruction should
    36  be aligned to a specified boundary by padding with NOOP instructions. The alignment value
    37  supported on loong64 must be a power of 2 and in the range of [8, 2048].
    38  
    39  Examples:
    40  
    41  	PCALIGN	$16
    42  	MOVV	$2, R4	// This instruction is aligned with 16 bytes.
    43  	PCALIGN	$1024
    44  	MOVV	$3, R5	// This instruction is aligned with 1024 bytes.
    45  
    46  # On loong64, auto-align loop heads to 16-byte boundaries
    47  
    48  Examples:
    49  
    50  	TEXT ·Add(SB),NOSPLIT|NOFRAME,$0
    51  
    52  start:
    53  
    54  	MOVV	$1, R4	// This instruction is aligned with 16 bytes.
    55  	MOVV	$-1, R5
    56  	BNE	R5, start
    57  	RET
    58  
    59  # Register mapping rules
    60  
    61  1. All general-purpose register names are written as Rn.
    62  
    63  2. All floating-point register names are written as Fn.
    64  
    65  3. All LSX register names are written as Vn.
    66  
    67  4. All LASX register names are written as Xn.
    68  
    69  # Argument mapping rules
    70  
    71  1. The operands appear in left-to-right assignment order.
    72  
    73  Go reverses the arguments of most instructions.
    74  
    75  Examples:
    76  
    77  	ADDV	R11, R12, R13 <=> add.d R13, R12, R11
    78  	LLV	(R4), R7      <=> ll.d R7, R4
    79  	OR	R5, R6        <=> or R6, R6, R5
    80  
    81  Special Cases.
    82  (1) Argument order is the same as in the GNU Loong64 syntax: jump instructions.
    83  
    84  Examples:
    85  
    86  	BEQ	R0, R4, label1  <=>  beq R0, R4, label1
    87  	JMP	label1          <=>  b label1
    88  
    89  (2) BSTRINSW, BSTRINSV, BSTRPICKW, BSTRPICKV $<msb>, <Rj>, $<lsb>, <Rd>
    90  
    91  Examples:
    92  
    93  	BSTRPICKW $15, R4, $6, R5  <=>  bstrpick.w r5, r4, 15, 6
    94  
    95  2. Expressions for special arguments.
    96  
    97  Memory references: a base register and an offset register are written as (Rbase)(Roff).
    98  
    99  Examples:
   100  
   101  	MOVB (R4)(R5), R6  <=>  ldx.b R6, R4, R5
   102  	MOVV (R4)(R5), R6  <=>  ldx.d R6, R4, R5
   103  	MOVD (R4)(R5), F6  <=>  fldx.d F6, R4, R5
   104  	MOVB R6, (R4)(R5)  <=>  stx.b R6, R4, R5
   105  	MOVV R6, (R4)(R5)  <=>  stx.d R6, R4, R5
   106  	MOVD F6, (R4)(R5)  <=>  fstx.d F6, R4, R5
   107  
   108  3. Alphabetical list of SIMD instructions
   109  
   110  Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate),
   111  "ui3", "ui2", and "ui1" represent the related "index".
   112  
   113  3.1 Move general-purpose register to a vector element:
   114  
   115  	Instruction format:
   116  	        VMOVQ  Rj, <Vd>.<T>[index]
   117  
   118  	Mapping between Go and platform assembly:
   119  	       Go assembly       |      platform assembly     |          semantics
   120  	-------------------------------------------------------------------------------------
   121  	 VMOVQ  Rj, Vd.B[index]  |  vinsgr2vr.b  Vd, Rj, ui4  |  VR[vd].b[ui4] = GR[rj][7:0]
   122  	 VMOVQ  Rj, Vd.H[index]  |  vinsgr2vr.h  Vd, Rj, ui3  |  VR[vd].h[ui3] = GR[rj][15:0]
   123  	 VMOVQ  Rj, Vd.W[index]  |  vinsgr2vr.w  Vd, Rj, ui2  |  VR[vd].w[ui2] = GR[rj][31:0]
   124  	 VMOVQ  Rj, Vd.V[index]  |  vinsgr2vr.d  Vd, Rj, ui1  |  VR[vd].d[ui1] = GR[rj][63:0]
   125  	XVMOVQ  Rj, Xd.W[index]  | xvinsgr2vr.w  Xd, Rj, ui3  |  XR[xd].w[ui3] = GR[rj][31:0]
   126  	XVMOVQ  Rj, Xd.V[index]  | xvinsgr2vr.d  Xd, Rj, ui2  |  XR[xd].d[ui2] = GR[rj][63:0]
   127  
   128  3.2 Move vector element to general-purpose register
   129  
   130  	Instruction format:
   131  	        VMOVQ     <Vj>.<T>[index], Rd
   132  
   133  	Mapping between Go and platform assembly:
   134  	        Go assembly       |       platform assembly      |            semantics
   135  	---------------------------------------------------------------------------------------------
   136  	 VMOVQ  Vj.B[index],  Rd  |   vpickve2gr.b   rd, vj, ui4 | GR[rd] = SignExtend(VR[vj].b[ui4])
   137  	 VMOVQ  Vj.H[index],  Rd  |   vpickve2gr.h   rd, vj, ui3 | GR[rd] = SignExtend(VR[vj].h[ui3])
   138  	 VMOVQ  Vj.W[index],  Rd  |   vpickve2gr.w   rd, vj, ui2 | GR[rd] = SignExtend(VR[vj].w[ui2])
   139  	 VMOVQ  Vj.V[index],  Rd  |   vpickve2gr.d   rd, vj, ui1 | GR[rd] = SignExtend(VR[vj].d[ui1])
   140  	 VMOVQ  Vj.BU[index], Rd  |   vpickve2gr.bu  rd, vj, ui4 | GR[rd] = ZeroExtend(VR[vj].bu[ui4])
   141  	 VMOVQ  Vj.HU[index], Rd  |   vpickve2gr.hu  rd, vj, ui3 | GR[rd] = ZeroExtend(VR[vj].hu[ui3])
   142  	 VMOVQ  Vj.WU[index], Rd  |   vpickve2gr.wu  rd, vj, ui2 | GR[rd] = ZeroExtend(VR[vj].wu[ui2])
   143  	 VMOVQ  Vj.VU[index], Rd  |   vpickve2gr.du  rd, vj, ui1 | GR[rd] = ZeroExtend(VR[vj].du[ui1])
   144  	XVMOVQ  Xj.W[index],  Rd  |  xvpickve2gr.w   rd, xj, ui3 | GR[rd] = SignExtend(XR[xj].w[ui3])
   145  	XVMOVQ  Xj.V[index],  Rd  |  xvpickve2gr.d   rd, xj, ui2 | GR[rd] = SignExtend(XR[xj].d[ui2])
   146  	XVMOVQ  Xj.WU[index], Rd  |  xvpickve2gr.wu  rd, xj, ui3 | GR[rd] = ZeroExtend(XR[xj].wu[ui3])
   147  	XVMOVQ  Xj.VU[index], Rd  |  xvpickve2gr.du  rd, xj, ui2 | GR[rd] = ZeroExtend(XR[xj].du[ui2])
   148  
   149  3.3 Duplicate general-purpose register to vector.
   150  
   151  	Instruction format:
   152  	        VMOVQ    Rj, <Vd>.<T>
   153  
   154  	Mapping between Go and platform assembly:
   155  	   Go assembly      |    platform assembly    |                    semantics
   156  	------------------------------------------------------------------------------------------------
   157  	 VMOVQ  Rj, Vd.B16  |   vreplgr2vr.b  Vd, Rj  |  for i in range(16): VR[vd].b[i] = GR[rj][7:0]
   158  	 VMOVQ  Rj, Vd.H8   |   vreplgr2vr.h  Vd, Rj  |  for i in range(8) : VR[vd].h[i] = GR[rj][15:0]
   159  	 VMOVQ  Rj, Vd.W4   |   vreplgr2vr.w  Vd, Rj  |  for i in range(4) : VR[vd].w[i] = GR[rj][31:0]
   160  	 VMOVQ  Rj, Vd.V2   |   vreplgr2vr.d  Vd, Rj  |  for i in range(2) : VR[vd].d[i] = GR[rj][63:0]
   161  	XVMOVQ  Rj, Xd.B32  |  xvreplgr2vr.b  Xd, Rj  |  for i in range(32): XR[xd].b[i] = GR[rj][7:0]
   162  	XVMOVQ  Rj, Xd.H16  |  xvreplgr2vr.h  Xd, Rj  |  for i in range(16): XR[xd].h[i] = GR[rj][15:0]
   163  	XVMOVQ  Rj, Xd.W8   |  xvreplgr2vr.w  Xd, Rj  |  for i in range(8) : XR[xd].w[i] = GR[rj][31:0]
   164  	XVMOVQ  Rj, Xd.V4   |  xvreplgr2vr.d  Xd, Rj  |  for i in range(4) : XR[xd].d[i] = GR[rj][63:0]
   165  
   166  3.4 Replace vector elements
   167  
   168  	Instruction format:
   169  	        XVMOVQ    Xj, <Xd>.<T>
   170  
   171  	Mapping between Go and platform assembly:
   172  	   Go assembly      |   platform assembly   |                semantics
   173  	------------------------------------------------------------------------------------------------
   174  	XVMOVQ  Xj, Xd.B32  |  xvreplve0.b  Xd, Xj  | for i in range(32): XR[xd].b[i] = XR[xj].b[0]
   175  	XVMOVQ  Xj, Xd.H16  |  xvreplve0.h  Xd, Xj  | for i in range(16): XR[xd].h[i] = XR[xj].h[0]
   176  	XVMOVQ  Xj, Xd.W8   |  xvreplve0.w  Xd, Xj  | for i in range(8) : XR[xd].w[i] = XR[xj].w[0]
   177  	XVMOVQ  Xj, Xd.V4   |  xvreplve0.d  Xd, Xj  | for i in range(4) : XR[xd].d[i] = XR[xj].d[0]
   178  	XVMOVQ  Xj, Xd.Q2   |  xvreplve0.q  Xd, Xj  | for i in range(2) : XR[xd].q[i] = XR[xj].q[0]
   179  
   180  3.5 Move vector element to scalar
   181  
   182  	Instruction format:
   183  	        XVMOVQ  Xj, <Xd>.<T>[index]
   184  	        XVMOVQ  Xj.<T>[index], Xd
   185  
   186  	Mapping between Go and platform assembly:
   187  	       Go assembly        |     platform assembly     |               semantics
   188  	------------------------------------------------------------------------------------------------
   189  	 XVMOVQ  Xj, Xd.W[index]  |  xvinsve0.w   xd, xj, ui3 | XR[xd].w[ui3] = XR[xj].w[0]
   190  	 XVMOVQ  Xj, Xd.V[index]  |  xvinsve0.d   xd, xj, ui2 | XR[xd].d[ui2] = XR[xj].d[0]
   191  	 XVMOVQ  Xj.W[index], Xd  |  xvpickve.w   xd, xj, ui3 | XR[xd].w[0] = XR[xj].w[ui3], XR[xd][255:32] = 0
   192  	 XVMOVQ  Xj.V[index], Xd  |  xvpickve.d   xd, xj, ui2 | XR[xd].d[0] = XR[xj].d[ui2], XR[xd][255:64] = 0
   193  
   194  3.6 Move vector element to vector register.
   195  
   196  	Instruction format:
   197  	        VMOVQ     <Vj>.<T>[index], <Vd>.<T>
   198  
   199  	Mapping between Go and platform assembly:
   200  	         Go assembly      |    platform assembly   |               semantics
   201  	VMOVQ Vj.B[index], Vd.B16 | vreplvei.b vd, vj, ui4 | for i in range(16): VR[vd].b[i] = VR[vj].b[ui4]
   202  	VMOVQ Vj.H[index], Vd.H8  | vreplvei.h vd, vj, ui3 | for i in range(8) : VR[vd].h[i] = VR[vj].h[ui3]
   203  	VMOVQ Vj.W[index], Vd.W4  | vreplvei.w vd, vj, ui2 | for i in range(4) : VR[vd].w[i] = VR[vj].w[ui2]
   204  	VMOVQ Vj.V[index], Vd.V2  | vreplvei.d vd, vj, ui1 | for i in range(2) : VR[vd].d[i] = VR[vj].d[ui1]
   205  
   206  3.7 Load data from memory and broadcast to each element of a vector register.
   207  
   208  	Instruction format:
   209  	        VMOVQ    offset(Rj), <Vd>.<T>
   210  
   211  	Mapping between Go and platform assembly:
   212  	   Go assembly              |     platform assembly      |                                semantics
   213  	-------------------------------------------------------------------------------------------------------------------------------------------------------
   214  	 VMOVQ  offset(Rj), Vd.B16  |   vldrepl.b  Vd, Rj, si12  |  for i in range(16): VR[vd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
   215  	 VMOVQ  offset(Rj), Vd.H8   |   vldrepl.h  Vd, Rj, si11  |  for i in range(8) : VR[vd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
   216  	 VMOVQ  offset(Rj), Vd.W4   |   vldrepl.w  Vd, Rj, si10  |  for i in range(4) : VR[vd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
   217  	 VMOVQ  offset(Rj), Vd.V2   |   vldrepl.d  Vd, Rj, si9   |  for i in range(2) : VR[vd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
   218  	XVMOVQ  offset(Rj), Xd.B32  |  xvldrepl.b  Xd, Rj, si12  |  for i in range(32): XR[xd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
   219  	XVMOVQ  offset(Rj), Xd.H16  |  xvldrepl.h  Xd, Rj, si11  |  for i in range(16): XR[xd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
   220  	XVMOVQ  offset(Rj), Xd.W8   |  xvldrepl.w  Xd, Rj, si10  |  for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
   221  	XVMOVQ  offset(Rj), Xd.V4   |  xvldrepl.d  Xd, Rj, si9   |  for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
   222  
   223  	note: In Go assembly, for ease of understanding, offset represents the actual address offset.
   224  	      However, during platform encoding, the offset is shifted to increase the encodable offset range, as follows:
   225  
   226  	   Go assembly           |      platform assembly
   227           VMOVQ  1(R4), V5.B16    |      vldrepl.b  v5, r4, $1
   228           VMOVQ  2(R4), V5.H8     |      vldrepl.h  v5, r4, $1
   229           VMOVQ  8(R4), V5.W4     |      vldrepl.w  v5, r4, $2
   230           VMOVQ  8(R4), V5.V2     |      vldrepl.d  v5, r4, $1
   231  
   232  # Special instruction encoding definition and description on LoongArch
   233  
   234   1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased
   235      from the Linux kernel implementation: https://git.kernel.org/torvalds/c/e031a5f3f1ed
   236  
   237      - Bit4: ordering or completion (0: completion, 1: ordering)
   238      - Bit3: barrier for previous read (0: true, 1: false)
   239      - Bit2: barrier for previous write (0: true, 1: false)
   240      - Bit1: barrier for succeeding read (0: true, 1: false)
   241      - Bit0: barrier for succeeding write (0: true, 1: false)
   242      - Hint 0x700: barrier for "read after read" from the same address
   243  
   244      Traditionally, on microarchitectures that do not support dbar grading such as LA464
   245      (Loongson 3A5000, 3C5000) all variants are treated as “dbar 0” (full barrier).
   246  
   247  2. Notes on using atomic operation instructions
   248  
   249    - AM*_DB.W[U]/V[U] instructions such as AMSWAPDBW not only complete the corresponding
   250      atomic operation sequence, but also implement the complete full data barrier function.
   251  
   252    - When using the AM*_.W[U]/D[U] instruction, registers rd and rj cannot be the same,
   253      otherwise an exception is triggered, and rd and rk cannot be the same, otherwise
   254      the execution result is uncertain.
   255  
   256  3. Prefetch instructions
   257      Instruction format:
   258        PRELD	offset(Rbase), $hint
   259        PRELDX	offset(Rbase), $n, $hint
   260  
   261      Mapping between Go and platform assembly:
   262                 Go assembly            |    platform assembly
   263        PRELD  offset(Rbase), $hint     | preld hint, Rbase, offset
   264        PRELDX offset(Rbase), $n, $hint | move rk, $x; preldx hint, Rbase, rk
   265  
   266        note: $x is the value after $n and offset are reassembled
   267  
   268      Definition of hint value:
   269        0: load to L1
   270        2: load to L3
   271        8: store to L1
   272  
   273        The meaning of the rest of values is not defined yet, and the processor executes it as NOP
   274  
   275      Definition of $n in the PRELDX instruction:
   276        bit[0]: address sequence, 0 indicating ascending and 1 indicating descending
   277        bits[11:1]:  block size, the value range is [16, 1024], and it must be an integer multiple of 16
   278        bits[20:12]: block num, the value range is [1, 256]
   279        bits[36:21]: stride, the value range is [0, 0xffff]
   280  
   281  4. ShiftAdd instructions
   282      Mapping between Go and platform assembly:
   283                  Go assembly            |    platform assembly
   284       ALSL.W/WU/V $Imm, Rj, Rk, Rd      |    alsl.w/wu/d rd, rj, rk, $imm
   285  
   286      Instruction encoding format is as follows:
   287  
   288  	| 31 ~ 17 | 16 ~ 15 | 14 ~ 10 | 9 ~ 5 | 4 ~ 0 |
   289  	|  opcode |   sa2   |   rk    |   rj  |   rd  |
   290  
   291      The alsl.w/wu/d series of instructions shift the data in rj left by sa2+1, add the value
   292      in rk, and write the result to rd.
   293  
   294      To allow programmers to directly write the desired shift amount in assembly code, we actually write
   295      the value of sa2+1 in the assembly code and then include the value of sa2 in the instruction encoding.
   296  
   297      For example:
   298  
   299              Go assembly      | instruction Encoding
   300          ALSLV $4, R4, R5, R6 |      002d9486
   301  
   302  5. Note of special memory access instructions
   303      Instruction format:
   304        MOVWP	offset(Rj), Rd
   305        MOVVP	offset(Rj), Rd
   306        MOVWP	Rd, offset(Rj)
   307        MOVVP	Rd, offset(Rj)
   308  
   309      Mapping between Go and platform assembly:
   310                 Go assembly      |      platform assembly
   311        MOVWP  offset(Rj), Rd     |    ldptr.w  rd, rj, si14
   312        MOVVP  offset(Rj), Rd     |    ldptr.d  rd, rj, si14
   313        MOVWP  Rd, offset(Rj)     |    stptr.w  rd, rj, si14
   314        MOVVP  Rd, offset(Rj)     |    stptr.d  rd, rj, si14
   315  
   316        note: In Go assembly, for ease of understanding, offset is a 16-bit immediate number representing
   317              the actual address offset, but in platform assembly, it needs a 14-bit immediate number.
   318  	    si14 = offset>>2
   319  
   320      The addressing calculation for the above instruction involves logically left-shifting the 14-bit
   321      immediate number si14 by 2 bits, then sign-extending it, and finally adding it to the value in the
   322      general-purpose register rj to obtain the sum.
   323  
   324      For example:
   325  
   326              Go assembly      |      platform assembly
   327           MOVWP  8(R4), R5    |      ldptr.w r5, r4, $2
   328  
   329  6. Note of special add instruction
   330      Mapping between Go and platform assembly:
   331                Go assembly        |      platform assembly
   332        ADDV16  si16<<16, Rj, Rd   |    addu16i.d  rd, rj, si16
   333  
   334        note: si16 is a 16-bit immediate number, and si16<<16 is the actual operand.
   335  
   336      The addu16i.d instruction logically left-shifts the 16-bit immediate number si16 by 16 bits, then
   337      sign-extends it. The resulting data is added to the [63:0] bits of data in the general-purpose register
   338      rj, and the sum is written into the general-purpose register rd.
   339      The addu16i.d instruction is used in conjunction with the ldptr.w/d and stptr.w/d instructions to
   340      accelerate access based on the GOT table in position-independent code.
   341  */
   342  
   343  package loong64
   344  

View as plain text