Source file src/cmd/internal/obj/loong64/doc.go

     1  // Copyright 2024 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  /*
     6  Package loong64 implements a LoongArch64 assembler. Go assembly syntax is different from
     7  GNU LoongArch64 syntax, but we can still follow the general rules to map between them.
     8  
     9  # Instruction mnemonics mapping rules
    10  
    11  1. Bit widths represented by various instruction suffixes and prefixes
    12  V (vlong)     = 64 bit
    13  WU (word)     = 32 bit unsigned
    14  W (word)      = 32 bit
    15  H (half word) = 16 bit
    16  HU            = 16 bit unsigned
    17  B (byte)      = 8 bit
    18  BU            = 8 bit unsigned
    19  F (float)     = 32 bit float
    20  D (double)    = 64 bit float
    21  
    22  V  (LSX)      = 128 bit
    23  XV (LASX)     = 256 bit
    24  
    25  Examples:
    26  
    27  	MOVB  (R2), R3  // Load 8 bit memory data into R3 register
    28  	MOVH  (R2), R3  // Load 16 bit memory data into R3 register
    29  	MOVW  (R2), R3  // Load 32 bit memory data into R3 register
    30  	MOVV  (R2), R3  // Load 64 bit memory data into R3 register
    31  	VMOVQ  (R2), V1 // Load 128 bit memory data into V1 register
    32  	XVMOVQ (R2), X1 // Load 256 bit memory data into X1 register
    33  
    34  2. Align directive
    35  Go asm supports the PCALIGN directive, which indicates that the next instruction should
    36  be aligned to a specified boundary by padding with NOOP instructions. The alignment value
    37  supported on loong64 must be a power of 2 and in the range of [8, 2048].
    38  
    39  Examples:
    40  
    41  	PCALIGN	$16
    42  	MOVV	$2, R4	// This instruction is aligned with 16 bytes.
    43  	PCALIGN	$1024
    44  	MOVV	$3, R5	// This instruction is aligned with 1024 bytes.
    45  
    46  # On loong64, auto-align loop heads to 16-byte boundaries
    47  
    48  Examples:
    49  
    50  	TEXT ·Add(SB),NOSPLIT|NOFRAME,$0
    51  
    52  start:
    53  
    54  	MOVV	$1, R4	// This instruction is aligned with 16 bytes.
    55  	MOVV	$-1, R5
    56  	BNE	R5, start
    57  	RET
    58  
    59  # Register mapping rules
    60  
    61  1. All general-purpose register names are written as Rn.
    62  
    63  2. All floating-point register names are written as Fn.
    64  
    65  3. All LSX register names are written as Vn.
    66  
    67  4. All LASX register names are written as Xn.
    68  
    69  # Argument mapping rules
    70  
    71  1. The operands appear in left-to-right assignment order.
    72  
    73  Go reverses the arguments of most instructions.
    74  
    75  Examples:
    76  
    77  	ADDV	R11, R12, R13 <=> add.d R13, R12, R11
    78  	LLV	(R4), R7      <=> ll.d R7, R4
    79  	OR	R5, R6        <=> or R6, R6, R5
    80  
    81  Special Cases.
    82  (1) Argument order is the same as in the GNU Loong64 syntax: jump instructions.
    83  
    84  Examples:
    85  
    86  	BEQ	R0, R4, label1  <=>  beq R0, R4, label1
    87  	JMP	label1          <=>  b label1
    88  
    89  (2) BSTRINSW, BSTRINSV, BSTRPICKW, BSTRPICKV $<msb>, <Rj>, $<lsb>, <Rd>
    90  
    91  Examples:
    92  
    93  	BSTRPICKW $15, R4, $6, R5  <=>  bstrpick.w r5, r4, 15, 6
    94  
    95  2. Expressions for special arguments.
    96  
    97  Memory references: a base register and an offset register are written as (Rbase)(Roff).
    98  
    99  Examples:
   100  
   101  	MOVB (R4)(R5), R6  <=>  ldx.b R6, R4, R5
   102  	MOVV (R4)(R5), R6  <=>  ldx.d R6, R4, R5
   103  	MOVD (R4)(R5), F6  <=>  fldx.d F6, R4, R5
   104  	MOVB R6, (R4)(R5)  <=>  stx.b R6, R4, R5
   105  	MOVV R6, (R4)(R5)  <=>  stx.d R6, R4, R5
   106  	MOVD F6, (R4)(R5)  <=>  fstx.d F6, R4, R5
   107  
   108  3. Alphabetical list of SIMD instructions
   109  
   110  Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate),
   111  "ui3", "ui2", and "ui1" represent the related "index".
   112  
   113  3.1 Move general-purpose register to a vector element:
   114  
   115  	Instruction format:
   116  	        VMOVQ  Rj, <Vd>.<T>[index]
   117  
   118  	Mapping between Go and platform assembly:
   119  	       Go assembly       |      platform assembly     |          semantics
   120  	-------------------------------------------------------------------------------------
   121  	 VMOVQ  Rj, Vd.B[index]  |  vinsgr2vr.b  Vd, Rj, ui4  |  VR[vd].b[ui4] = GR[rj][7:0]
   122  	 VMOVQ  Rj, Vd.H[index]  |  vinsgr2vr.h  Vd, Rj, ui3  |  VR[vd].h[ui3] = GR[rj][15:0]
   123  	 VMOVQ  Rj, Vd.W[index]  |  vinsgr2vr.w  Vd, Rj, ui2  |  VR[vd].w[ui2] = GR[rj][31:0]
   124  	 VMOVQ  Rj, Vd.V[index]  |  vinsgr2vr.d  Vd, Rj, ui1  |  VR[vd].d[ui1] = GR[rj][63:0]
   125  	XVMOVQ  Rj, Xd.W[index]  | xvinsgr2vr.w  Xd, Rj, ui3  |  XR[xd].w[ui3] = GR[rj][31:0]
   126  	XVMOVQ  Rj, Xd.V[index]  | xvinsgr2vr.d  Xd, Rj, ui2  |  XR[xd].d[ui2] = GR[rj][63:0]
   127  
   128  3.2 Move vector element to general-purpose register
   129  
   130  	Instruction format:
   131  	        VMOVQ     <Vj>.<T>[index], Rd
   132  
   133  	Mapping between Go and platform assembly:
   134  	        Go assembly       |       platform assembly      |            semantics
   135  	---------------------------------------------------------------------------------------------
   136  	 VMOVQ  Vj.B[index],  Rd  |   vpickve2gr.b   rd, vj, ui4 | GR[rd] = SignExtend(VR[vj].b[ui4])
   137  	 VMOVQ  Vj.H[index],  Rd  |   vpickve2gr.h   rd, vj, ui3 | GR[rd] = SignExtend(VR[vj].h[ui3])
   138  	 VMOVQ  Vj.W[index],  Rd  |   vpickve2gr.w   rd, vj, ui2 | GR[rd] = SignExtend(VR[vj].w[ui2])
   139  	 VMOVQ  Vj.V[index],  Rd  |   vpickve2gr.d   rd, vj, ui1 | GR[rd] = SignExtend(VR[vj].d[ui1])
   140  	 VMOVQ  Vj.BU[index], Rd  |   vpickve2gr.bu  rd, vj, ui4 | GR[rd] = ZeroExtend(VR[vj].bu[ui4])
   141  	 VMOVQ  Vj.HU[index], Rd  |   vpickve2gr.hu  rd, vj, ui3 | GR[rd] = ZeroExtend(VR[vj].hu[ui3])
   142  	 VMOVQ  Vj.WU[index], Rd  |   vpickve2gr.wu  rd, vj, ui2 | GR[rd] = ZeroExtend(VR[vj].wu[ui2])
   143  	 VMOVQ  Vj.VU[index], Rd  |   vpickve2gr.du  rd, vj, ui1 | GR[rd] = ZeroExtend(VR[vj].du[ui1])
   144  	XVMOVQ  Xj.W[index],  Rd  |  xvpickve2gr.w   rd, xj, ui3 | GR[rd] = SignExtend(XR[xj].w[ui3])
   145  	XVMOVQ  Xj.V[index],  Rd  |  xvpickve2gr.d   rd, xj, ui2 | GR[rd] = SignExtend(XR[xj].d[ui2])
   146  	XVMOVQ  Xj.WU[index], Rd  |  xvpickve2gr.wu  rd, xj, ui3 | GR[rd] = ZeroExtend(XR[xj].wu[ui3])
   147  	XVMOVQ  Xj.VU[index], Rd  |  xvpickve2gr.du  rd, xj, ui2 | GR[rd] = ZeroExtend(XR[xj].du[ui2])
   148  
   149  3.3 Duplicate general-purpose register to vector.
   150  
   151  	Instruction format:
   152  	        VMOVQ    Rj, <Vd>.<T>
   153  
   154  	Mapping between Go and platform assembly:
   155  	   Go assembly      |    platform assembly    |                    semantics
   156  	------------------------------------------------------------------------------------------------
   157  	 VMOVQ  Rj, Vd.B16  |   vreplgr2vr.b  Vd, Rj  |  for i in range(16): VR[vd].b[i] = GR[rj][7:0]
   158  	 VMOVQ  Rj, Vd.H8   |   vreplgr2vr.h  Vd, Rj  |  for i in range(8) : VR[vd].h[i] = GR[rj][15:0]
   159  	 VMOVQ  Rj, Vd.W4   |   vreplgr2vr.w  Vd, Rj  |  for i in range(4) : VR[vd].w[i] = GR[rj][31:0]
   160  	 VMOVQ  Rj, Vd.V2   |   vreplgr2vr.d  Vd, Rj  |  for i in range(2) : VR[vd].d[i] = GR[rj][63:0]
   161  	XVMOVQ  Rj, Xd.B32  |  xvreplgr2vr.b  Xd, Rj  |  for i in range(32): XR[xd].b[i] = GR[rj][7:0]
   162  	XVMOVQ  Rj, Xd.H16  |  xvreplgr2vr.h  Xd, Rj  |  for i in range(16): XR[xd].h[i] = GR[rj][15:0]
   163  	XVMOVQ  Rj, Xd.W8   |  xvreplgr2vr.w  Xd, Rj  |  for i in range(8) : XR[xd].w[i] = GR[rj][31:0]
   164  	XVMOVQ  Rj, Xd.V4   |  xvreplgr2vr.d  Xd, Rj  |  for i in range(4) : XR[xd].d[i] = GR[rj][63:0]
   165  
   166  3.4 Replace vector elements
   167  
   168  	Instruction format:
   169  	        XVMOVQ    Xj, <Xd>.<T>
   170  
   171  	Mapping between Go and platform assembly:
   172  	   Go assembly      |   platform assembly   |                semantics
   173  	------------------------------------------------------------------------------------------------
   174  	XVMOVQ  Xj, Xd.B32  |  xvreplve0.b  Xd, Xj  | for i in range(32): XR[xd].b[i] = XR[xj].b[0]
   175  	XVMOVQ  Xj, Xd.H16  |  xvreplve0.h  Xd, Xj  | for i in range(16): XR[xd].h[i] = XR[xj].h[0]
   176  	XVMOVQ  Xj, Xd.W8   |  xvreplve0.w  Xd, Xj  | for i in range(8) : XR[xd].w[i] = XR[xj].w[0]
   177  	XVMOVQ  Xj, Xd.V4   |  xvreplve0.d  Xd, Xj  | for i in range(4) : XR[xd].d[i] = XR[xj].d[0]
   178  	XVMOVQ  Xj, Xd.Q2   |  xvreplve0.q  Xd, Xj  | for i in range(2) : XR[xd].q[i] = XR[xj].q[0]
   179  
   180  3.5 Move vector element to scalar
   181  
   182  	Instruction format:
   183  	        XVMOVQ  Xj, <Xd>.<T>[index]
   184  	        XVMOVQ  Xj.<T>[index], Xd
   185  
   186  	Mapping between Go and platform assembly:
   187  	       Go assembly        |     platform assembly     |               semantics
   188  	------------------------------------------------------------------------------------------------
   189  	 XVMOVQ  Xj, Xd.W[index]  |  xvinsve0.w   xd, xj, ui3 | XR[xd].w[ui3] = XR[xj].w[0]
   190  	 XVMOVQ  Xj, Xd.V[index]  |  xvinsve0.d   xd, xj, ui2 | XR[xd].d[ui2] = XR[xj].d[0]
   191  	 XVMOVQ  Xj.W[index], Xd  |  xvpickve.w   xd, xj, ui3 | XR[xd].w[0] = XR[xj].w[ui3], XR[xd][255:32] = 0
   192  	 XVMOVQ  Xj.V[index], Xd  |  xvpickve.d   xd, xj, ui2 | XR[xd].d[0] = XR[xj].d[ui2], XR[xd][255:64] = 0
   193  
   194  3.6 Move vector element to vector register.
   195  
   196  	Instruction format:
   197  	VMOVQ     <Vn>.<T>[index], Vn.<T>
   198  
   199  	Mapping between Go and platform assembly:
   200  	         Go assembly      |    platform assembly   |               semantics
   201  	VMOVQ Vj.B[index], Vd.B16 | vreplvei.b vd, vj, ui4 | for i in range(16): VR[vd].b[i] = VR[vj].b[ui4]
   202  	VMOVQ Vj.H[index], Vd.H8  | vreplvei.h vd, vj, ui3 | for i in range(8) : VR[vd].h[i] = VR[vj].h[ui3]
   203  	VMOVQ Vj.W[index], Vd.W4  | vreplvei.w vd, vj, ui2 | for i in range(4) : VR[vd].w[i] = VR[vj].w[ui2]
   204  	VMOVQ Vj.V[index], Vd.V2  | vreplvei.d vd, vj, ui1 | for i in range(2) : VR[vd].d[i] = VR[vj].d[ui1]
   205  
   206  3.7 Move vector register to vector register.
   207          Instruction format:
   208          VMOVQ     Vj, Vd
   209  
   210          Mapping between Go and platform assembly:
   211            Go assembly   |   platform assembly   |                         semantics
   212          VMOVQ   Vj, Vd  |  vslli.d vd, vj, 0x0  | for i in range(2) : VR[vd].D[i] = SLL(VR[vj].D[i], 0)
   213          XVMOVQ  Xj, Xd  | xvslli.d xd, xj, 0x0  | for i in range(4) : XR[xd].D[i] = SLL(XR[xj].D[i], 0)
   214  
   215  3.7 Load data from memory and broadcast to each element of a vector register.
   216  
   217  	Instruction format:
   218  	        VMOVQ    offset(Rj), <Vd>.<T>
   219  
   220  	Mapping between Go and platform assembly:
   221  	   Go assembly              |     platform assembly      |                                semantics
   222  	-------------------------------------------------------------------------------------------------------------------------------------------------------
   223  	 VMOVQ  offset(Rj), Vd.B16  |   vldrepl.b  Vd, Rj, si12  |  for i in range(16): VR[vd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
   224  	 VMOVQ  offset(Rj), Vd.H8   |   vldrepl.h  Vd, Rj, si11  |  for i in range(8) : VR[vd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
   225  	 VMOVQ  offset(Rj), Vd.W4   |   vldrepl.w  Vd, Rj, si10  |  for i in range(4) : VR[vd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
   226  	 VMOVQ  offset(Rj), Vd.V2   |   vldrepl.d  Vd, Rj, si9   |  for i in range(2) : VR[vd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
   227  	XVMOVQ  offset(Rj), Xd.B32  |  xvldrepl.b  Xd, Rj, si12  |  for i in range(32): XR[xd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
   228  	XVMOVQ  offset(Rj), Xd.H16  |  xvldrepl.h  Xd, Rj, si11  |  for i in range(16): XR[xd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
   229  	XVMOVQ  offset(Rj), Xd.W8   |  xvldrepl.w  Xd, Rj, si10  |  for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
   230  	XVMOVQ  offset(Rj), Xd.V4   |  xvldrepl.d  Xd, Rj, si9   |  for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
   231  
   232  	note: In Go assembly, for ease of understanding, offset represents the actual address offset.
   233  	      However, during platform encoding, the offset is shifted to increase the encodable offset range, as follows:
   234  
   235  	   Go assembly           |      platform assembly
   236           VMOVQ  1(R4), V5.B16    |      vldrepl.b  v5, r4, $1
   237           VMOVQ  2(R4), V5.H8     |      vldrepl.h  v5, r4, $1
   238           VMOVQ  8(R4), V5.W4     |      vldrepl.w  v5, r4, $2
   239           VMOVQ  8(R4), V5.V2     |      vldrepl.d  v5, r4, $1
   240  
   241  3.8 Vector permutation instruction
   242  	Instruction format:
   243  	VPERMIW    ui8, Vj, Vd
   244  
   245  	Mapping between Go and platform assembly:
   246  	     Go assembly     |   platform assembly   |                                 semantics
   247  	VPERMIW  ui8, Vj, Vd |  vpermi.w vd, vj, ui8 | VR[vd].W[0] = VR[vj].W[ui8[1:0]], VR[vd].W[1] = VR[vj].W[ui8[3:2]],
   248  	                     |                       | VR[vd].W[2] = VR[vd].W[ui8[5:4]], VR[vd].W[3] = VR[vd].W[ui8[7:6]]
   249  	XVPERMIW ui8, Xj, Xd | xvpermi.w xd, xj, ui8 | XR[xd].W[0] = XR[xj].W[ui8[1:0]],   XR[xd].W[1] = XR[xj].W[ui8[3:2]],
   250  	                     |                       | XR[xd].W[3] = XR[xd].W[ui8[7:6]],   XR[xd].W[2] = XR[xd].W[ui8[5:4]],
   251  	                     |                       | XR[xd].W[4] = XR[xj].W[ui8[1:0]+4], XR[xd].W[5] = XR[xj].W[ui8[3:2]+4],
   252  	                     |                       | XR[xd].W[6] = XR[xd].W[ui8[5:4]+4], XR[xd].W[7] = XR[xd].W[ui8[7:6]+4]
   253  	XVPERMIV ui8, Xj, Xd | xvpermi.d xd, xj, ui8 | XR[xd].D[0] = XR[xj].D[ui8[1:0]], XR[xd].D[1] = XR[xj].D[ui8[3:2]],
   254  	                     |                       | XR[xd].D[2] = XR[xj].D[ui8[5:4]], XR[xd].D[3] = XR[xj].D[ui8[7:6]]
   255  	XVPERMIQ ui8, Xj, Xd | xvpermi.q xd, xj, ui8 | vec = {XR[xd], XR[xj]}, XR[xd].Q[0] = vec.Q[ui8[1:0]], XR[xd].Q[1] = vec.Q[ui8[5:4]]
   256  
   257  3.9 Vector misc instruction
   258  
   259  3.9.1 {,X}VEXTRINS.{B,H,W,V}
   260  
   261  	Instruction format:
   262  	VEXTRINSB   ui8, Vj, Vd
   263  
   264  	Mapping between Go and platform assembly:
   265  	      Go assembly      |    platform assembly    |             semantics
   266  	 VEXTRINSB ui8, Vj, Vd |  vextrins.b vd, vj, ui8 | VR[vd].B[ui8[7:4]] = VR[vj].B[ui8[3:0]]
   267  	 VEXTRINSH ui8, Vj, Vd |  vextrins.h vd, vj, ui8 | VR[vd].H[ui8[6:4]] = VR[vj].H[ui8[2:0]]
   268  	 VEXTRINSW ui8, Vj, Vd |  vextrins.w vd, vj, ui8 | VR[vd].W[ui8[5:4]] = VR[vj].W[ui8[1:0]]
   269  	 VEXTRINSV ui8, Vj, Vd |  vextrins.d vd, vj, ui8 | VR[vd].D[ui8[4]] = VR[vj].D[ui8[0]]
   270  	XVEXTRINSB ui8, Xj, Xd | xvextrins.b xd, xj, ui8 | XR[xd].B[ui8[7:4]] = XR[xj].B[ui8[3:0]], XR[xd].B[ui8[7:4]+16] = XR[xj].B[ui8[3:0]+16]
   271  	XVEXTRINSH ui8, Xj, Xd | xvextrins.h xd, xj, ui8 | XR[xd].H[ui8[6:4]] = XR[xj].H[ui8[2:0]], XR[xd].H[ui8[6:4]+8] = XR[xj].H[ui8[2:0]+8]
   272  	XVEXTRINSW ui8, Xj, Xd | xvextrins.w xd, xj, ui8 | XR[xd].W[ui8[5:4]] = XR[xj].W[ui8[1:0]], XR[xd].W[ui8[5:4]+4] = XR[xj].W[ui8[1:0]+4]
   273  	XVEXTRINSV ui8, Xj, Xd | xvextrins.d xd, xj, ui8 | XR[xd].D[ui8[4]] = XR[xj].D[ui8[0]], XR[xd].D[ui8[4]+2] = XR[xj].D[ui8[0]+2]
   274  
   275  # Special instruction encoding definition and description on LoongArch
   276  
   277   1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased
   278      from the Linux kernel implementation: https://git.kernel.org/torvalds/c/e031a5f3f1ed
   279  
   280      - Bit4: ordering or completion (0: completion, 1: ordering)
   281      - Bit3: barrier for previous read (0: true, 1: false)
   282      - Bit2: barrier for previous write (0: true, 1: false)
   283      - Bit1: barrier for succeeding read (0: true, 1: false)
   284      - Bit0: barrier for succeeding write (0: true, 1: false)
   285      - Hint 0x700: barrier for "read after read" from the same address
   286  
   287      Traditionally, on micro-architectures that do not support dbar grading such as LA464
   288      (Loongson 3A5000, 3C5000) all variants are treated as "dbar 0" (full barrier).
   289  
   290  2. Notes on using atomic operation instructions
   291  
   292    - AM*_DB.W[U]/V[U] instructions such as AMSWAPDBW not only complete the corresponding
   293      atomic operation sequence, but also implement the complete full data barrier function.
   294  
   295    - When using the AM*.W[U]/V[U] instructions, registers rd and rj cannot be the same,
   296      otherwise an exception is triggered, and rd and rk cannot be the same, otherwise
   297      the execution result is uncertain.
   298  
   299  3. Prefetch instructions
   300      Instruction format:
   301        PRELD	offset(Rbase), $hint
   302        PRELDX	offset(Rbase), $n, $hint
   303  
   304      Mapping between Go and platform assembly:
   305                 Go assembly            |    platform assembly
   306        PRELD  offset(Rbase), $hint     | preld hint, Rbase, offset
   307        PRELDX offset(Rbase), $n, $hint | move rk, $x; preldx hint, Rbase, rk
   308  
   309        note: $x is the value after $n and offset are reassembled
   310  
   311      Definition of hint value:
   312        0: load to L1
   313        2: load to L3
   314        8: store to L1
   315  
   316        The meaning of the rest of values is not defined yet, and the processor executes it as NOP
   317  
   318      Definition of $n in the PRELDX instruction:
   319        bit[0]: address sequence, 0 indicating ascending and 1 indicating descending
   320        bits[11:1]:  block size, the value range is [16, 1024], and it must be an integer multiple of 16
   321        bits[20:12]: block num, the value range is [1, 256]
   322        bits[36:21]: stride, the value range is [0, 0xffff]
   323  
   324  4. ShiftAdd instructions
   325      Mapping between Go and platform assembly:
   326                  Go assembly            |    platform assembly
   327       ALSL.W/WU/V $Imm, Rj, Rk, Rd      |    alsl.w/wu/d rd, rj, rk, $imm
   328  
   329      Instruction encoding format is as follows:
   330  
   331  	| 31 ~ 17 | 16 ~ 15 | 14 ~ 10 | 9 ~ 5 | 4 ~ 0 |
   332  	|  opcode |   sa2   |   rk    |   rj  |   rd  |
   333  
   334      The alsl.w/wu/d series of instructions shift the data in rj left by sa2+1, add the value
   335      in rk, and write the result to rd.
   336  
   337      To allow programmers to directly write the desired shift amount in assembly code, we actually write
   338      the value of sa2+1 in the assembly code and then include the value of sa2 in the instruction encoding.
   339  
   340      For example:
   341  
   342              Go assembly      | instruction Encoding
   343          ALSLV $4, R4, R5, R6 |      002d9486
   344  
   345  5. Note of special memory access instructions
   346      Instruction format:
   347        MOVWP	offset(Rj), Rd
   348        MOVVP	offset(Rj), Rd
   349        MOVWP	Rd, offset(Rj)
   350        MOVVP	Rd, offset(Rj)
   351  
   352      Mapping between Go and platform assembly:
   353                 Go assembly      |      platform assembly
   354        MOVWP  offset(Rj), Rd     |    ldptr.w  rd, rj, si14
   355        MOVVP  offset(Rj), Rd     |    ldptr.d  rd, rj, si14
   356        MOVWP  Rd, offset(Rj)     |    stptr.w  rd, rj, si14
   357        MOVVP  Rd, offset(Rj)     |    stptr.d  rd, rj, si14
   358  
   359        note: In Go assembly, for ease of understanding, offset is a 16-bit immediate number representing
   360              the actual address offset, but in platform assembly, it needs a 14-bit immediate number.
   361  	    si14 = offset>>2
   362  
   363      The addressing calculation for the above instruction involves logically left-shifting the 14-bit
   364      immediate number si14 by 2 bits, then sign-extending it, and finally adding it to the value in the
   365      general-purpose register rj to obtain the sum.
   366  
   367      For example:
   368  
   369              Go assembly      |      platform assembly
   370           MOVWP  8(R4), R5    |      ldptr.w r5, r4, $2
   371  
   372  6. Note of special add instruction
   373      Mapping between Go and platform assembly:
   374                Go assembly        |      platform assembly
   375        ADDV16  si16<<16, Rj, Rd   |    addu16i.d  rd, rj, si16
   376  
   377        note: si16 is a 16-bit immediate number, and si16<<16 is the actual operand.
   378  
   379      The addu16i.d instruction logically left-shifts the 16-bit immediate number si16 by 16 bits, then
   380      sign-extends it. The resulting data is added to the [63:0] bits of data in the general-purpose register
   381      rj, and the sum is written into the general-purpose register rd.
   382      The addu16i.d instruction is used in conjunction with the ldptr.w/d and stptr.w/d instructions to
   383      accelerate access based on the GOT table in position-independent code.
   384  */
   385  
   386  package loong64
   387  

View as plain text