Source file src/cmd/internal/obj/loong64/doc.go
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

/*
Package loong64 implements a LoongArch64 assembler. Go assembly syntax is different from
GNU LoongArch64 syntax, but we can still follow the general rules to map between them.

# Instruction mnemonics mapping rules

1. Bit widths represented by various instruction suffixes and prefixes

    V  (vlong)     = 64 bit
    WU (word)      = 32 bit unsigned
    W  (word)      = 32 bit
    H  (half word) = 16 bit
    HU             = 16 bit unsigned
    B  (byte)      = 8 bit
    BU             = 8 bit unsigned
    F  (float)     = 32 bit float
    D  (double)    = 64 bit float

    V  (LSX)  = 128 bit
    XV (LASX) = 256 bit

Examples:

    MOVB   (R2), R3 // Load 8 bit memory data into R3 register
    MOVH   (R2), R3 // Load 16 bit memory data into R3 register
    MOVW   (R2), R3 // Load 32 bit memory data into R3 register
    MOVV   (R2), R3 // Load 64 bit memory data into R3 register
    VMOVQ  (R2), V1 // Load 128 bit memory data into V1 register
    XVMOVQ (R2), X1 // Load 256 bit memory data into X1 register

2. Align directive

Go asm supports the PCALIGN directive, which indicates that the next instruction should
be aligned to a specified boundary by padding with NOOP instructions. The alignment value
supported on loong64 must be a power of 2 in the range [8, 2048].

Examples:

    PCALIGN $16
    MOVV    $2, R4  // This instruction is aligned with 16 bytes.
    PCALIGN $1024
    MOVV    $3, R5  // This instruction is aligned with 1024 bytes.

# On loong64, auto-align loop heads to 16-byte boundaries

Examples:

    TEXT ·Add(SB),NOSPLIT|NOFRAME,$0

start:

    MOVV    $1, R4  // This instruction is aligned with 16 bytes.
    MOVV    $-1, R5
    BNE     R5, start
    RET

# Register mapping rules

1. All general-purpose register names are written as Rn.

2. All floating-point register names are written as Fn.

3. All LSX register names are written as Vn.

4. All LASX register names are written as Xn.

# Argument mapping rules

1. The operands appear in left-to-right assignment order.

Go reverses the arguments of most instructions.

Examples:

    ADDV    R11, R12, R13 <=> add.d R13, R12, R11
    LLV     (R4), R7      <=> ll.d R7, R4
    OR      R5, R6        <=> or R6, R6, R5

Special cases:

(1) Argument order is the same as in the GNU LoongArch64 syntax: jump instructions.

Examples:

    BEQ R0, R4, label1 <=> beq R0, R4, label1
    JMP label1         <=> b label1

(2) BSTRINSW, BSTRINSV, BSTRPICKW, BSTRPICKV $<msb>, <Rj>, $<lsb>, <Rd>

Examples:

    BSTRPICKW $15, R4, $6, R5 <=> bstrpick.w r5, r4, 15, 6

2. Expressions for special arguments.

Memory references: a base register and an offset register are written as (Rbase)(Roff).

Examples:

    MOVB (R4)(R5), R6 <=> ldx.b R6, R4, R5
    MOVV (R4)(R5), R6 <=> ldx.d R6, R4, R5
    MOVD (R4)(R5), F6 <=> fldx.d F6, R4, R5
    MOVB R6, (R4)(R5) <=> stx.b R6, R4, R5
    MOVV R6, (R4)(R5) <=> stx.d R6, R4, R5
    MOVV F6, (R4)(R5) <=> fstx.d F6, R4, R5
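For instance, combining the argument-order rule with register-indexed addressing (an
illustrative sketch, not an excerpt from the assembler's tests; the register choices are
arbitrary), a byte can be loaded and stored through a base plus index pair:

    MOVV    $8, R5          // R5 = index
    MOVB    (R4)(R5), R6    // load byte at R4+R5  <=> ldx.b R6, R4, R5
    MOVB    R6, (R7)(R5)    // store byte at R7+R5 <=> stx.b R6, R7, R5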
3. Alphabetical list of SIMD instructions

Note: In the following sections 3.1 to 3.6, "ui4" (a 4-bit unsigned integer immediate),
"ui3", "ui2", and "ui1" represent the corresponding element index.

3.1 Move general-purpose register to a vector element:

    Instruction format:
        VMOVQ Rj, <Vd>.<T>[index]

    Mapping between Go and platform assembly:
         Go assembly        |     platform assembly      |            semantics
    -------------------------------------------------------------------------------------
    VMOVQ  Rj, Vd.B[index]  | vinsgr2vr.b  Vd, Rj, ui4   | VR[vd].b[ui4] = GR[rj][7:0]
    VMOVQ  Rj, Vd.H[index]  | vinsgr2vr.h  Vd, Rj, ui3   | VR[vd].h[ui3] = GR[rj][15:0]
    VMOVQ  Rj, Vd.W[index]  | vinsgr2vr.w  Vd, Rj, ui2   | VR[vd].w[ui2] = GR[rj][31:0]
    VMOVQ  Rj, Vd.V[index]  | vinsgr2vr.d  Vd, Rj, ui1   | VR[vd].d[ui1] = GR[rj][63:0]
    XVMOVQ Rj, Xd.W[index]  | xvinsgr2vr.w Xd, Rj, ui3   | XR[xd].w[ui3] = GR[rj][31:0]
    XVMOVQ Rj, Xd.V[index]  | xvinsgr2vr.d Xd, Rj, ui2   | XR[xd].d[ui2] = GR[rj][63:0]

3.2 Move vector element to general-purpose register

    Instruction format:
        VMOVQ <Vj>.<T>[index], Rd

    Mapping between Go and platform assembly:
         Go assembly         |      platform assembly      |            semantics
    ---------------------------------------------------------------------------------------------
    VMOVQ  Vj.B[index],  Rd  | vpickve2gr.b   rd, vj, ui4  | GR[rd] = SignExtend(VR[vj].b[ui4])
    VMOVQ  Vj.H[index],  Rd  | vpickve2gr.h   rd, vj, ui3  | GR[rd] = SignExtend(VR[vj].h[ui3])
    VMOVQ  Vj.W[index],  Rd  | vpickve2gr.w   rd, vj, ui2  | GR[rd] = SignExtend(VR[vj].w[ui2])
    VMOVQ  Vj.V[index],  Rd  | vpickve2gr.d   rd, vj, ui1  | GR[rd] = SignExtend(VR[vj].d[ui1])
    VMOVQ  Vj.BU[index], Rd  | vpickve2gr.bu  rd, vj, ui4  | GR[rd] = ZeroExtend(VR[vj].bu[ui4])
    VMOVQ  Vj.HU[index], Rd  | vpickve2gr.hu  rd, vj, ui3  | GR[rd] = ZeroExtend(VR[vj].hu[ui3])
    VMOVQ  Vj.WU[index], Rd  | vpickve2gr.wu  rd, vj, ui2  | GR[rd] = ZeroExtend(VR[vj].wu[ui2])
    VMOVQ  Vj.VU[index], Rd  | vpickve2gr.du  rd, vj, ui1  | GR[rd] = ZeroExtend(VR[vj].du[ui1])
    XVMOVQ Xj.W[index],  Rd  | xvpickve2gr.w  rd, xj, ui3  | GR[rd] = SignExtend(XR[xj].w[ui3])
    XVMOVQ Xj.V[index],  Rd  | xvpickve2gr.d  rd, xj, ui2  | GR[rd] = SignExtend(XR[xj].d[ui2])
    XVMOVQ Xj.WU[index], Rd  | xvpickve2gr.wu rd, xj, ui3  | GR[rd] = ZeroExtend(XR[xj].wu[ui3])
    XVMOVQ Xj.VU[index], Rd  | xvpickve2gr.du rd, xj, ui2  | GR[rd] = ZeroExtend(XR[xj].du[ui2])
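As an example combining 3.1 and 3.2 (an illustrative sketch; the register and lane numbers
are arbitrary), a 32-bit value can be inserted into an LSX lane and read back either sign-
or zero-extended:

    VMOVQ   R4, V0.W[2]   // vinsgr2vr.w   V0, R4, 2 : V0.w[2] = R4[31:0]
    VMOVQ   V0.W[2], R5   // vpickve2gr.w  R5, V0, 2 : R5 = SignExtend(V0.w[2])
    VMOVQ   V0.WU[2], R6  // vpickve2gr.wu R6, V0, 2 : R6 = ZeroExtend(V0.w[2])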
3.3 Duplicate general-purpose register to vector.

    Instruction format:
        VMOVQ Rj, <Vd>.<T>

    Mapping between Go and platform assembly:
       Go assembly     |   platform assembly   |                    semantics
    ------------------------------------------------------------------------------------------------
    VMOVQ  Rj, Vd.B16  | vreplgr2vr.b  Vd, Rj  | for i in range(16): VR[vd].b[i] = GR[rj][7:0]
    VMOVQ  Rj, Vd.H8   | vreplgr2vr.h  Vd, Rj  | for i in range(8) : VR[vd].h[i] = GR[rj][15:0]
    VMOVQ  Rj, Vd.W4   | vreplgr2vr.w  Vd, Rj  | for i in range(4) : VR[vd].w[i] = GR[rj][31:0]
    VMOVQ  Rj, Vd.V2   | vreplgr2vr.d  Vd, Rj  | for i in range(2) : VR[vd].d[i] = GR[rj][63:0]
    XVMOVQ Rj, Xd.B32  | xvreplgr2vr.b Xd, Rj  | for i in range(32): XR[xd].b[i] = GR[rj][7:0]
    XVMOVQ Rj, Xd.H16  | xvreplgr2vr.h Xd, Rj  | for i in range(16): XR[xd].h[i] = GR[rj][15:0]
    XVMOVQ Rj, Xd.W8   | xvreplgr2vr.w Xd, Rj  | for i in range(8) : XR[xd].w[i] = GR[rj][31:0]
    XVMOVQ Rj, Xd.V4   | xvreplgr2vr.d Xd, Rj  | for i in range(4) : XR[xd].d[i] = GR[rj][63:0]

3.4 Duplicate vector element 0 to vector

    Instruction format:
        XVMOVQ Xj, <Xd>.<T>

    Mapping between Go and platform assembly:
       Go assembly     |  platform assembly   |                    semantics
    ------------------------------------------------------------------------------------------------
    XVMOVQ Xj, Xd.B32  | xvreplve0.b Xd, Xj   | for i in range(32): XR[xd].b[i] = XR[xj].b[0]
    XVMOVQ Xj, Xd.H16  | xvreplve0.h Xd, Xj   | for i in range(16): XR[xd].h[i] = XR[xj].h[0]
    XVMOVQ Xj, Xd.W8   | xvreplve0.w Xd, Xj   | for i in range(8) : XR[xd].w[i] = XR[xj].w[0]
    XVMOVQ Xj, Xd.V4   | xvreplve0.d Xd, Xj   | for i in range(4) : XR[xd].d[i] = XR[xj].d[0]
    XVMOVQ Xj, Xd.Q2   | xvreplve0.q Xd, Xj   | for i in range(2) : XR[xd].q[i] = XR[xj].q[0]

3.5 Move vector element to scalar

    Instruction format:
        XVMOVQ Xj, <Xd>.<T>[index]
        XVMOVQ Xj.<T>[index], Xd

    Mapping between Go and platform assembly:
          Go assembly      |    platform assembly     |                    semantics
    ------------------------------------------------------------------------------------------------
    XVMOVQ Xj, Xd.W[index] | xvinsve0.w xd, xj, ui3   | XR[xd].w[ui3] = XR[xj].w[0]
    XVMOVQ Xj, Xd.V[index] | xvinsve0.d xd, xj, ui2   | XR[xd].d[ui2] = XR[xj].d[0]
    XVMOVQ Xj.W[index], Xd | xvpickve.w xd, xj, ui3   | XR[xd].w[0] = XR[xj].w[ui3], XR[xd][255:32] = 0
    XVMOVQ Xj.V[index], Xd | xvpickve.d xd, xj, ui2   | XR[xd].d[0] = XR[xj].d[ui2], XR[xd][255:64] = 0

3.6 Move vector element to vector register.

    Instruction format:
        VMOVQ <Vj>.<T>[index], <Vd>.<T>

    Mapping between Go and platform assembly:
           Go assembly        |   platform assembly     |                    semantics
    ------------------------------------------------------------------------------------------------
    VMOVQ Vj.B[index], Vd.B16 | vreplvei.b vd, vj, ui4  | for i in range(16): VR[vd].b[i] = VR[vj].b[ui4]
    VMOVQ Vj.H[index], Vd.H8  | vreplvei.h vd, vj, ui3  | for i in range(8) : VR[vd].h[i] = VR[vj].h[ui3]
    VMOVQ Vj.W[index], Vd.W4  | vreplvei.w vd, vj, ui2  | for i in range(4) : VR[vd].w[i] = VR[vj].w[ui2]
    VMOVQ Vj.V[index], Vd.V2  | vreplvei.d vd, vj, ui1  | for i in range(2) : VR[vd].d[i] = VR[vj].d[ui1]
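Putting the broadcast form in 3.3 together with the element-replicate form in 3.6 (an
illustrative sketch; register and lane choices are arbitrary):

    VMOVQ   R4, V0.W4       // vreplgr2vr.w  V0, R4    : all four 32-bit lanes of V0 = R4[31:0]
    VMOVQ   V0.W[3], V1.W4  // vreplvei.w    V1, V0, 3 : all four 32-bit lanes of V1 = V0.w[3]
    XVMOVQ  R4, X0.V4       // xvreplgr2vr.d X0, R4    : all four 64-bit lanes of X0 = R4[63:0]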
3.7 Move vector register to vector register.

    Instruction format:
        VMOVQ Vj, Vd

    Mapping between Go and platform assembly:
      Go assembly   |  platform assembly    |                    semantics
    ------------------------------------------------------------------------------------------------
    VMOVQ  Vj, Vd   | vslli.d  vd, vj, 0x0  | for i in range(2) : VR[vd].D[i] = SLL(VR[vj].D[i], 0)
    XVMOVQ Xj, Xd   | xvslli.d xd, xj, 0x0  | for i in range(4) : XR[xd].D[i] = SLL(XR[xj].D[i], 0)

3.8 Load data from memory and broadcast to each element of a vector register.

    Instruction format:
        VMOVQ offset(Rj), <Vd>.<T>

    Mapping between Go and platform assembly:
           Go assembly        |    platform assembly     |                    semantics
    -------------------------------------------------------------------------------------------------------------------------------------------------------
    VMOVQ  offset(Rj), Vd.B16 | vldrepl.b  Vd, Rj, si12  | for i in range(16): VR[vd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
    VMOVQ  offset(Rj), Vd.H8  | vldrepl.h  Vd, Rj, si11  | for i in range(8) : VR[vd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
    VMOVQ  offset(Rj), Vd.W4  | vldrepl.w  Vd, Rj, si10  | for i in range(4) : VR[vd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
    VMOVQ  offset(Rj), Vd.V2  | vldrepl.d  Vd, Rj, si9   | for i in range(2) : VR[vd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
    XVMOVQ offset(Rj), Xd.B32 | xvldrepl.b Xd, Rj, si12  | for i in range(32): XR[xd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
    XVMOVQ offset(Rj), Xd.H16 | xvldrepl.h Xd, Rj, si11  | for i in range(16): XR[xd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
    XVMOVQ offset(Rj), Xd.W8  | xvldrepl.w Xd, Rj, si10  | for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
    XVMOVQ offset(Rj), Xd.V4  | xvldrepl.d Xd, Rj, si9   | for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))

    Note: In Go assembly, for ease of understanding, the offset represents the actual address
    offset. During platform encoding, however, the offset is shifted right to increase the
    encodable offset range, as follows:

        Go assembly         |  platform assembly
    VMOVQ 1(R4), V5.B16     | vldrepl.b v5, r4, $1
    VMOVQ 2(R4), V5.H8      | vldrepl.h v5, r4, $1
    VMOVQ 8(R4), V5.W4      | vldrepl.w v5, r4, $2
    VMOVQ 8(R4), V5.V2      | vldrepl.d v5, r4, $1

3.9 Vector permutation instructions

    Instruction format:
        VPERMIW ui8, Vj, Vd

    Mapping between Go and platform assembly:
        Go assembly       |  platform assembly     |                    semantics
    ------------------------------------------------------------------------------------------------
    VPERMIW  ui8, Vj, Vd  | vpermi.w  vd, vj, ui8  | VR[vd].W[0] = VR[vj].W[ui8[1:0]], VR[vd].W[1] = VR[vj].W[ui8[3:2]],
                          |                        | VR[vd].W[2] = VR[vd].W[ui8[5:4]], VR[vd].W[3] = VR[vd].W[ui8[7:6]]
    XVPERMIW ui8, Xj, Xd  | xvpermi.w xd, xj, ui8  | XR[xd].W[0] = XR[xj].W[ui8[1:0]], XR[xd].W[1] = XR[xj].W[ui8[3:2]],
                          |                        | XR[xd].W[2] = XR[xd].W[ui8[5:4]], XR[xd].W[3] = XR[xd].W[ui8[7:6]],
                          |                        | XR[xd].W[4] = XR[xj].W[ui8[1:0]+4], XR[xd].W[5] = XR[xj].W[ui8[3:2]+4],
                          |                        | XR[xd].W[6] = XR[xd].W[ui8[5:4]+4], XR[xd].W[7] = XR[xd].W[ui8[7:6]+4]
    XVPERMIV ui8, Xj, Xd  | xvpermi.d xd, xj, ui8  | XR[xd].D[0] = XR[xj].D[ui8[1:0]], XR[xd].D[1] = XR[xj].D[ui8[3:2]],
                          |                        | XR[xd].D[2] = XR[xj].D[ui8[5:4]], XR[xd].D[3] = XR[xj].D[ui8[7:6]]
    XVPERMIQ ui8, Xj, Xd  | xvpermi.q xd, xj, ui8  | vec = {XR[xd], XR[xj]}, XR[xd].Q[0] = vec.Q[ui8[1:0]], XR[xd].Q[1] = vec.Q[ui8[5:4]]
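For example (an illustrative reading of the semantics above, not an excerpt from the
assembler's tests), ui8 = 0xE4 = 0b11100100 selects indices 0, 1, 2, 3 in order, so
VPERMIW copies the two low words from Vj while leaving the two high words of Vd in place:

    VPERMIW $0xE4, V1, V0   // V0.W[0] = V1.W[0], V0.W[1] = V1.W[1],
                            // V0.W[2] = V0.W[2], V0.W[3] = V0.W[3] (unchanged)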
3.10 Vector misc instructions

3.10.1 {,X}VEXTRINS.{B,H,W,V}

    Instruction format:
        VEXTRINSB ui8, Vj, Vd

    Mapping between Go and platform assembly:
         Go assembly       |    platform assembly     |                    semantics
    ------------------------------------------------------------------------------------------------
    VEXTRINSB  ui8, Vj, Vd | vextrins.b  vd, vj, ui8  | VR[vd].B[ui8[7:4]] = VR[vj].B[ui8[3:0]]
    VEXTRINSH  ui8, Vj, Vd | vextrins.h  vd, vj, ui8  | VR[vd].H[ui8[6:4]] = VR[vj].H[ui8[2:0]]
    VEXTRINSW  ui8, Vj, Vd | vextrins.w  vd, vj, ui8  | VR[vd].W[ui8[5:4]] = VR[vj].W[ui8[1:0]]
    VEXTRINSV  ui8, Vj, Vd | vextrins.d  vd, vj, ui8  | VR[vd].D[ui8[4]]   = VR[vj].D[ui8[0]]
    XVEXTRINSB ui8, Xj, Xd | xvextrins.b xd, xj, ui8  | XR[xd].B[ui8[7:4]] = XR[xj].B[ui8[3:0]], XR[xd].B[ui8[7:4]+16] = XR[xj].B[ui8[3:0]+16]
    XVEXTRINSH ui8, Xj, Xd | xvextrins.h xd, xj, ui8  | XR[xd].H[ui8[6:4]] = XR[xj].H[ui8[2:0]], XR[xd].H[ui8[6:4]+8]  = XR[xj].H[ui8[2:0]+8]
    XVEXTRINSW ui8, Xj, Xd | xvextrins.w xd, xj, ui8  | XR[xd].W[ui8[5:4]] = XR[xj].W[ui8[1:0]], XR[xd].W[ui8[5:4]+4]  = XR[xj].W[ui8[1:0]+4]
    XVEXTRINSV ui8, Xj, Xd | xvextrins.d xd, xj, ui8  | XR[xd].D[ui8[4]]   = XR[xj].D[ui8[0]],   XR[xd].D[ui8[4]+2]    = XR[xj].D[ui8[0]+2]

# Special instruction encoding definition and description on LoongArch

1. DBAR hint encoding for LA664 (Loongson 3A6000) and later micro-architectures, paraphrased
from the Linux kernel implementation: https://git.kernel.org/torvalds/c/e031a5f3f1ed

  - Bit4: ordering or completion (0: completion, 1: ordering)
  - Bit3: barrier for previous read (0: true, 1: false)
  - Bit2: barrier for previous write (0: true, 1: false)
  - Bit1: barrier for succeeding read (0: true, 1: false)
  - Bit0: barrier for succeeding write (0: true, 1: false)
  - Hint 0x700: barrier for "read after read" from the same address

Traditionally, on micro-architectures that do not support dbar hint grading, such as LA464
(Loongson 3A5000, 3C5000), all variants are treated as “dbar 0” (full barrier).

2. Notes on using atomic operation instructions

  - AM*_DB.W[U]/V[U] instructions such as AMSWAPDBW not only complete the corresponding
    atomic operation sequence, but also act as a complete full data barrier.

  - When using an AM*.W[U]/V[U] instruction, registers rd and rj cannot be the same,
    otherwise an exception is triggered; rd and rk cannot be the same either, otherwise
    the execution result is undefined.

3. Prefetch instructions

    Instruction format:
        PRELD  offset(Rbase), $hint
        PRELDX offset(Rbase), $n, $hint

    Mapping between Go and platform assembly:
            Go assembly              |  platform assembly
    PRELD  offset(Rbase), $hint      | preld hint, Rbase, offset
    PRELDX offset(Rbase), $n, $hint  | move rk, $x; preldx hint, Rbase, rk

    Note: $x is the value obtained by recombining $n and offset.

    Definition of the hint value:
    0: load to L1
    2: load to L3
    8: store to L1

    The meaning of the remaining values is not yet defined; the processor executes them as NOP.

    Definition of $n in the PRELDX instruction:
    bit[0]:      address sequence, 0 indicating ascending and 1 indicating descending
    bits[11:1]:  block size, the value range is [16, 1024], and it must be an integer multiple of 16
    bits[20:12]: block num, the value range is [1, 256]
    bits[36:21]: stride, the value range is [0, 0xffff]
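As a worked reading of the $n bit fields above (the concrete values are illustrative and
not taken from the original text), prefetching 16 ascending 256-byte blocks with a stride
of 1024 bytes packs as n = 1024<<21 | 16<<12 | 256<<1 | 0 = 0x80010200:

    PRELDX  0(R4), $0x80010200, $0  // hint 0: load to L1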
4. ShiftAdd instructions

    Mapping between Go and platform assembly:
              Go assembly               |  platform assembly
    ALSLW/ALSLWU/ALSLV $imm, Rj, Rk, Rd | alsl.w/wu/d rd, rj, rk, imm

    The instruction encoding format is as follows:

    | 31 ~ 17 | 16 ~ 15 | 14 ~ 10 | 9 ~ 5 | 4 ~ 0 |
    | opcode  |   sa2   |   rk    |  rj   |  rd   |

    The alsl.w/wu/d instructions shift the data in rj left by sa2+1 bits, add the value in rk,
    and write the result to rd.

    To let programmers write the desired shift amount directly, the immediate written in the
    assembly code is sa2+1, while the value encoded in the instruction is sa2.

    For example:

        Go assembly          |  instruction encoding
    ALSLV $4, R4, R5, R6     | 0x002d9486

5. Notes on special memory access instructions

    Instruction format:
        MOVWP offset(Rj), Rd
        MOVVP offset(Rj), Rd
        MOVWP Rd, offset(Rj)
        MOVVP Rd, offset(Rj)

    Mapping between Go and platform assembly:
         Go assembly         |  platform assembly
    MOVWP offset(Rj), Rd     | ldptr.w rd, rj, si14
    MOVVP offset(Rj), Rd     | ldptr.d rd, rj, si14
    MOVWP Rd, offset(Rj)     | stptr.w rd, rj, si14
    MOVVP Rd, offset(Rj)     | stptr.d rd, rj, si14

    Note: In Go assembly, for ease of understanding, offset is a 16-bit immediate representing
    the actual address offset; the platform instruction, however, takes a 14-bit immediate:
    si14 = offset>>2.

    The addressing calculation for the above instructions logically left-shifts the 14-bit
    immediate si14 by 2 bits, sign-extends it, and adds it to the value in the general-purpose
    register rj to obtain the address.

    For example:

        Go assembly          |  platform assembly
    MOVWP 8(R4), R5          | ldptr.w r5, r4, $2

6. Notes on the special add instruction

    Mapping between Go and platform assembly:
         Go assembly         |  platform assembly
    ADDV16 si16<<16, Rj, Rd  | addu16i.d rd, rj, si16

    Note: si16 is a 16-bit immediate, and si16<<16 is the actual operand written in Go assembly.

    The addu16i.d instruction logically left-shifts the 16-bit immediate si16 by 16 bits, then
    sign-extends it. The resulting value is added to the data in bits [63:0] of the general-purpose
    register rj, and the sum is written into the general-purpose register rd.
    The addu16i.d instruction is used in conjunction with the ldptr.w/d and stptr.w/d instructions
    to accelerate GOT-based accesses in position-independent code.
*/
package loong64