Source file src/cmd/compile/internal/amd64/ssa.go

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package amd64
     6  
     7  import (
     8  	"fmt"
     9  	"math"
    10  
    11  	"cmd/compile/internal/base"
    12  	"cmd/compile/internal/ir"
    13  	"cmd/compile/internal/logopt"
    14  	"cmd/compile/internal/objw"
    15  	"cmd/compile/internal/ssa"
    16  	"cmd/compile/internal/ssagen"
    17  	"cmd/compile/internal/types"
    18  	"cmd/internal/obj"
    19  	"cmd/internal/obj/x86"
    20  	"internal/abi"
    21  )
    22  
    23  // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
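        // The mark (a non-nil Aux) tells ssaGenValue that flags are live, so it must
        // not rewrite MOV $0 into XOR, which would clobber the flags register.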
    24  func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {
    25  	flive := b.FlagsLiveAtEnd
    26  	for _, c := range b.ControlValues() {
    27  		flive = c.Type.IsFlags() || flive
    28  	}
    29  	for i := len(b.Values) - 1; i >= 0; i-- {
    30  		v := b.Values[i]
    31  		if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) {
    32  			// The "mark" is any non-nil Aux value.
    33  			v.Aux = ssa.AuxMark
    34  		}
    35  		if v.Type.IsFlags() {
    36  			flive = false
    37  		}
    38  		for _, a := range v.Args {
    39  			if a.Type.IsFlags() {
    40  				flive = true
    41  			}
    42  		}
    43  	}
    44  }
    45  
    46  func isGPReg(r int16) bool {
    47  	return x86.REG_AL <= r && r <= x86.REG_R15
    48  }
    49  
    50  func isFPReg(r int16) bool {
    51  	return x86.REG_X0 <= r && r <= x86.REG_Z31
    52  }
    53  
    54  func isKReg(r int16) bool {
    55  	return x86.REG_K0 <= r && r <= x86.REG_K7
    56  }
    57  
    58  func isLowFPReg(r int16) bool {
    59  	return x86.REG_X0 <= r && r <= x86.REG_X15
    60  }
    61  
    62  // loadByRegWidth returns the load instruction for the given register and width.
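        // For example, loadByRegWidth(x86.REG_AX, 1) is MOVBLZX (avoiding a partial
        // register write), while loadByRegWidth(x86.REG_X0, 4) is MOVSS.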
    63  func loadByRegWidth(r int16, width int64) obj.As {
    64  	// Avoid partial register write for GPR
    65  	if !isFPReg(r) && !isKReg(r) {
    66  		switch width {
    67  		case 1:
    68  			return x86.AMOVBLZX
    69  		case 2:
    70  			return x86.AMOVWLZX
    71  		}
    72  	}
    73  	// Otherwise, there's no difference between load and store opcodes.
    74  	return storeByRegWidth(r, width)
    75  }
    76  
    77  // storeByRegWidth returns the store instruction for the given register and width.
    78  // It is also used when loading a constant into a register.
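        // For example, storeByRegWidth(x86.REG_AX, 8) is MOVQ and
        // storeByRegWidth(x86.REG_X0, 16) is MOVUPS.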
    79  func storeByRegWidth(r int16, width int64) obj.As {
    80  	if isFPReg(r) {
    81  		switch width {
    82  		case 4:
    83  			return x86.AMOVSS
    84  		case 8:
    85  			return x86.AMOVSD
    86  		case 16:
    87  			// int128s are in SSE registers
    88  			if isLowFPReg(r) {
    89  				return x86.AMOVUPS
    90  			} else {
    91  				return x86.AVMOVDQU
    92  			}
    93  		case 32:
    94  			return x86.AVMOVDQU
    95  		case 64:
    96  			return x86.AVMOVDQU64
    97  		}
    98  	}
    99  	if isKReg(r) {
   100  		return x86.AKMOVQ
   101  	}
   102  	// gp
   103  	switch width {
   104  	case 1:
   105  		return x86.AMOVB
   106  	case 2:
   107  		return x86.AMOVW
   108  	case 4:
   109  		return x86.AMOVL
   110  	case 8:
   111  		return x86.AMOVQ
   112  	}
   113  	panic(fmt.Sprintf("bad store reg=%v, width=%d", r, width))
   114  }
   115  
   116  // moveByRegsWidth returns the reg->reg move instruction for the given dest/src registers and width.
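        // For example, moveByRegsWidth(x86.REG_X1, x86.REG_AX, 8) is MOVQ (gp -> fp),
        // while moveByRegsWidth(x86.REG_X1, x86.REG_X2, 16) is MOVUPS (fp -> fp).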
   117  func moveByRegsWidth(dest, src int16, width int64) obj.As {
   118  	// fp -> fp
   119  	if isFPReg(dest) && isFPReg(src) {
   120  		// Moving the whole sse2 register is faster
   121  		// than moving just the correct low portion of it.
   122  		// There is no xmm->xmm move with 1 byte opcode,
   123  		// so use movups, which has 2 byte opcode.
   124  		if isLowFPReg(dest) && isLowFPReg(src) && width <= 16 {
   125  			return x86.AMOVUPS
   126  		}
   127  		if width <= 32 {
   128  			return x86.AVMOVDQU
   129  		}
   130  		return x86.AVMOVDQU64
   131  	}
   132  	// k -> gp, gp -> k, k -> k
   133  	if isKReg(dest) || isKReg(src) {
   134  		if isFPReg(dest) || isFPReg(src) {
   135  			panic(fmt.Sprintf("bad move, src=%v, dest=%v, width=%d", src, dest, width))
   136  		}
   137  		return x86.AKMOVQ
   138  	}
   139  	// gp -> fp, fp -> gp, gp -> gp
   140  	switch width {
   141  	case 1:
   142  		// Avoids partial register write
   143  		return x86.AMOVL
   144  	case 2:
   145  		return x86.AMOVL
   146  	case 4:
   147  		return x86.AMOVL
   148  	case 8:
   149  		return x86.AMOVQ
   150  	case 16:
   151  		if isLowFPReg(dest) && isLowFPReg(src) {
   152  			// int128s are in SSE registers
   153  			return x86.AMOVUPS
   154  		} else {
   155  			return x86.AVMOVDQU
   156  		}
   157  	case 32:
   158  		return x86.AVMOVDQU
   159  	case 64:
   160  		return x86.AVMOVDQU64
   161  	}
   162  	panic(fmt.Sprintf("bad move, src=%v, dest=%v, width=%d", src, dest, width))
   163  }
   164  
   165  // opregreg emits instructions for
   166  //
   167  //	dest := dest(To) op src(From)
   168  //
   169  // and also returns the created obj.Prog so it
   170  // may be further adjusted (offset, scale, etc).
   171  func opregreg(s *ssagen.State, op obj.As, dest, src int16) *obj.Prog {
   172  	p := s.Prog(op)
   173  	p.From.Type = obj.TYPE_REG
   174  	p.To.Type = obj.TYPE_REG
   175  	p.To.Reg = dest
   176  	p.From.Reg = src
   177  	return p
   178  }
   179  
   180  // memIdx fills out a as an indexed memory reference for v.
   181  // It assumes that the base register and the index register
   182  // are v.Args[0].Reg() and v.Args[1].Reg(), respectively.
   183  // The caller must still use gc.AddAux/gc.AddAux2 to handle v.Aux as necessary.
   184  func memIdx(a *obj.Addr, v *ssa.Value) {
   185  	r, i := v.Args[0].Reg(), v.Args[1].Reg()
   186  	a.Type = obj.TYPE_MEM
   187  	a.Scale = v.Op.Scale()
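        	// SP cannot be used as an index register in an x86 addressing mode,
        	// so if the index is SP and the scale is 1, swap it into the base slot.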
   188  	if a.Scale == 1 && i == x86.REG_SP {
   189  		r, i = i, r
   190  	}
   191  	a.Reg = r
   192  	a.Index = i
   193  }
   194  
   195  func getgFromTLS(s *ssagen.State, r int16) {
   196  	// See the comments in cmd/internal/obj/x86/obj6.go
   197  	// near CanUse1InsnTLS for a detailed explanation of these instructions.
   198  	if x86.CanUse1InsnTLS(base.Ctxt) {
   199  		// MOVQ (TLS), r
   200  		p := s.Prog(x86.AMOVQ)
   201  		p.From.Type = obj.TYPE_MEM
   202  		p.From.Reg = x86.REG_TLS
   203  		p.To.Type = obj.TYPE_REG
   204  		p.To.Reg = r
   205  	} else {
   206  		// MOVQ TLS, r
   207  		// MOVQ (r)(TLS*1), r
   208  		p := s.Prog(x86.AMOVQ)
   209  		p.From.Type = obj.TYPE_REG
   210  		p.From.Reg = x86.REG_TLS
   211  		p.To.Type = obj.TYPE_REG
   212  		p.To.Reg = r
   213  		q := s.Prog(x86.AMOVQ)
   214  		q.From.Type = obj.TYPE_MEM
   215  		q.From.Reg = r
   216  		q.From.Index = x86.REG_TLS
   217  		q.From.Scale = 1
   218  		q.To.Type = obj.TYPE_REG
   219  		q.To.Reg = r
   220  	}
   221  }
   222  
   223  func ssaGenValue(s *ssagen.State, v *ssa.Value) {
   224  	switch v.Op {
   225  	case ssa.OpAMD64VFMADD231SD, ssa.OpAMD64VFMADD231SS:
   226  		p := s.Prog(v.Op.Asm())
   227  		p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[2].Reg()}
   228  		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
   229  		p.AddRestSourceReg(v.Args[1].Reg())
   230  	case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
   231  		r := v.Reg()
   232  		r1 := v.Args[0].Reg()
   233  		r2 := v.Args[1].Reg()
   234  		switch {
   235  		case r == r1:
   236  			p := s.Prog(v.Op.Asm())
   237  			p.From.Type = obj.TYPE_REG
   238  			p.From.Reg = r2
   239  			p.To.Type = obj.TYPE_REG
   240  			p.To.Reg = r
   241  		case r == r2:
   242  			p := s.Prog(v.Op.Asm())
   243  			p.From.Type = obj.TYPE_REG
   244  			p.From.Reg = r1
   245  			p.To.Type = obj.TYPE_REG
   246  			p.To.Reg = r
   247  		default:
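        			// The output is in neither input register, so use LEA to add the
        			// two registers into a third, e.g. LEAQ (r1)(r2*1), r.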
   248  			var asm obj.As
   249  			if v.Op == ssa.OpAMD64ADDQ {
   250  				asm = x86.ALEAQ
   251  			} else {
   252  				asm = x86.ALEAL
   253  			}
   254  			p := s.Prog(asm)
   255  			p.From.Type = obj.TYPE_MEM
   256  			p.From.Reg = r1
   257  			p.From.Scale = 1
   258  			p.From.Index = r2
   259  			p.To.Type = obj.TYPE_REG
   260  			p.To.Reg = r
   261  		}
   262  	// 2-address opcode arithmetic
   263  	case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
   264  		ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
   265  		ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
   266  		ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
   267  		ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
   268  		ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
   269  		ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
   270  		ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
   271  		ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB,
   272  		ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB,
   273  		ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
   274  		ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
   275  		ssa.OpAMD64MINSS, ssa.OpAMD64MINSD,
   276  		ssa.OpAMD64POR, ssa.OpAMD64PXOR,
   277  		ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ,
   278  		ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ,
   279  		ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ,
   280  		ssa.OpAMD64PCMPEQB, ssa.OpAMD64PSIGNB,
   281  		ssa.OpAMD64PUNPCKLBW:
   282  		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   283  
   284  	case ssa.OpAMD64PSHUFLW:
   285  		p := s.Prog(v.Op.Asm())
   286  		imm := v.AuxInt
   287  		if imm < 0 || imm > 255 {
   288  			v.Fatalf("Invalid source selection immediate")
   289  		}
   290  		p.From.Offset = imm
   291  		p.From.Type = obj.TYPE_CONST
   292  		p.AddRestSourceReg(v.Args[0].Reg())
   293  		p.To.Type = obj.TYPE_REG
   294  		p.To.Reg = v.Reg()
   295  
   296  	case ssa.OpAMD64PSHUFBbroadcast:
   297  		// PSHUFB with a control mask of zero copies byte 0 to all
   298  		// bytes in the register.
   299  		//
   300  		// X15 is always zero with ABIInternal.
   301  		if s.ABI != obj.ABIInternal {
   302  			// zero X15 manually
   303  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
   304  		}
   305  
   306  		p := s.Prog(v.Op.Asm())
   307  		p.From.Type = obj.TYPE_REG
   308  		p.To.Type = obj.TYPE_REG
   309  		p.To.Reg = v.Reg()
   310  		p.From.Reg = x86.REG_X15
   311  
   312  	case ssa.OpAMD64SHRDQ, ssa.OpAMD64SHLDQ:
   313  		p := s.Prog(v.Op.Asm())
   314  		lo, hi, bits := v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg()
   315  		p.From.Type = obj.TYPE_REG
   316  		p.From.Reg = bits
   317  		p.To.Type = obj.TYPE_REG
   318  		p.To.Reg = lo
   319  		p.AddRestSourceReg(hi)
   320  
   321  	case ssa.OpAMD64BLSIQ, ssa.OpAMD64BLSIL,
   322  		ssa.OpAMD64BLSMSKQ, ssa.OpAMD64BLSMSKL,
   323  		ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
   324  		p := s.Prog(v.Op.Asm())
   325  		p.From.Type = obj.TYPE_REG
   326  		p.From.Reg = v.Args[0].Reg()
   327  		p.To.Type = obj.TYPE_REG
   328  		switch v.Op {
   329  		case ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
   330  			p.To.Reg = v.Reg0()
   331  		default:
   332  			p.To.Reg = v.Reg()
   333  		}
   334  
   335  	case ssa.OpAMD64ANDNQ, ssa.OpAMD64ANDNL:
   336  		p := s.Prog(v.Op.Asm())
   337  		p.From.Type = obj.TYPE_REG
   338  		p.From.Reg = v.Args[0].Reg()
   339  		p.To.Type = obj.TYPE_REG
   340  		p.To.Reg = v.Reg()
   341  		p.AddRestSourceReg(v.Args[1].Reg())
   342  
   343  	case ssa.OpAMD64SARXL, ssa.OpAMD64SARXQ,
   344  		ssa.OpAMD64SHLXL, ssa.OpAMD64SHLXQ,
   345  		ssa.OpAMD64SHRXL, ssa.OpAMD64SHRXQ:
   346  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   347  		p.AddRestSourceReg(v.Args[0].Reg())
   348  
   349  	case ssa.OpAMD64SHLXLload, ssa.OpAMD64SHLXQload,
   350  		ssa.OpAMD64SHRXLload, ssa.OpAMD64SHRXQload,
   351  		ssa.OpAMD64SARXLload, ssa.OpAMD64SARXQload:
   352  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
   353  		m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
   354  		ssagen.AddAux(&m, v)
   355  		p.AddRestSource(m)
   356  
   357  	case ssa.OpAMD64SHLXLloadidx1, ssa.OpAMD64SHLXLloadidx4, ssa.OpAMD64SHLXLloadidx8,
   358  		ssa.OpAMD64SHRXLloadidx1, ssa.OpAMD64SHRXLloadidx4, ssa.OpAMD64SHRXLloadidx8,
   359  		ssa.OpAMD64SARXLloadidx1, ssa.OpAMD64SARXLloadidx4, ssa.OpAMD64SARXLloadidx8,
   360  		ssa.OpAMD64SHLXQloadidx1, ssa.OpAMD64SHLXQloadidx8,
   361  		ssa.OpAMD64SHRXQloadidx1, ssa.OpAMD64SHRXQloadidx8,
   362  		ssa.OpAMD64SARXQloadidx1, ssa.OpAMD64SARXQloadidx8:
   363  		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[2].Reg())
   364  		m := obj.Addr{Type: obj.TYPE_MEM}
   365  		memIdx(&m, v)
   366  		ssagen.AddAux(&m, v)
   367  		p.AddRestSource(m)
   368  
   369  	case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
   370  		// Arg[0] (the dividend) is in AX.
   371  		// Arg[1] (the divisor) can be in any other register.
   372  		// Result[0] (the quotient) is in AX.
   373  		// Result[1] (the remainder) is in DX.
   374  		r := v.Args[1].Reg()
   375  
   376  		// Zero extend dividend.
   377  		opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
   378  
   379  		// Issue divide.
   380  		p := s.Prog(v.Op.Asm())
   381  		p.From.Type = obj.TYPE_REG
   382  		p.From.Reg = r
   383  
   384  	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
   385  		// Arg[0] (the dividend) is in AX.
   386  		// Arg[1] (the divisor) can be in any other register.
   387  		// Result[0] (the quotient) is in AX.
   388  		// Result[1] (the remainder) is in DX.
   389  		r := v.Args[1].Reg()
   390  
   391  		var opCMP, opNEG, opSXD obj.As
   392  		switch v.Op {
   393  		case ssa.OpAMD64DIVQ:
   394  			opCMP, opNEG, opSXD = x86.ACMPQ, x86.ANEGQ, x86.ACQO
   395  		case ssa.OpAMD64DIVL:
   396  			opCMP, opNEG, opSXD = x86.ACMPL, x86.ANEGL, x86.ACDQ
   397  		case ssa.OpAMD64DIVW:
   398  			opCMP, opNEG, opSXD = x86.ACMPW, x86.ANEGW, x86.ACWD
   399  		}
   400  
   401  		// CPU faults upon signed overflow, which occurs when the most
   402  		// negative int is divided by -1. Handle divide by -1 as a special case.
   403  		var j1, j2 *obj.Prog
   404  		if ssa.DivisionNeedsFixUp(v) {
   405  			c := s.Prog(opCMP)
   406  			c.From.Type = obj.TYPE_REG
   407  			c.From.Reg = r
   408  			c.To.Type = obj.TYPE_CONST
   409  			c.To.Offset = -1
   410  
   411  			// Divisor is not -1, proceed with normal division.
   412  			j1 = s.Prog(x86.AJNE)
   413  			j1.To.Type = obj.TYPE_BRANCH
   414  
   415  			// Divisor is -1, manually compute quotient and remainder via fixup code.
   416  			// n / -1 = -n
   417  			n1 := s.Prog(opNEG)
   418  			n1.To.Type = obj.TYPE_REG
   419  			n1.To.Reg = x86.REG_AX
   420  
   421  			// n % -1 == 0
   422  			opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
   423  
   424  			// TODO(khr): issue only the -1 fixup code we need.
   425  			// For instance, if only the quotient is used, no point in zeroing the remainder.
   426  
   427  			// Skip over normal division.
   428  			j2 = s.Prog(obj.AJMP)
   429  			j2.To.Type = obj.TYPE_BRANCH
   430  		}
   431  
   432  		// Sign extend dividend and perform division.
   433  		p := s.Prog(opSXD)
   434  		if j1 != nil {
   435  			j1.To.SetTarget(p)
   436  		}
   437  		p = s.Prog(v.Op.Asm())
   438  		p.From.Type = obj.TYPE_REG
   439  		p.From.Reg = r
   440  
   441  		if j2 != nil {
   442  			j2.To.SetTarget(s.Pc())
   443  		}
   444  
   445  	case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU:
   446  		// The frontend rewrites constant division by 8/16/32-bit integers
   447  		// into HMUL by a constant.
   448  		// SSA rewrites generate the 64-bit versions.
   449  
   450  		// Arg[0] is already in AX, as it's the only register we allow,
   451  		// and DX is the only output we care about (the high bits).
   452  		p := s.Prog(v.Op.Asm())
   453  		p.From.Type = obj.TYPE_REG
   454  		p.From.Reg = v.Args[1].Reg()
   455  
   456  		// IMULB puts the high portion in AH instead of DL,
   457  		// so move it to DL for consistency
   458  		if v.Type.Size() == 1 {
   459  			m := s.Prog(x86.AMOVB)
   460  			m.From.Type = obj.TYPE_REG
   461  			m.From.Reg = x86.REG_AH
   462  			m.To.Type = obj.TYPE_REG
   463  			m.To.Reg = x86.REG_DX
   464  		}
   465  
   466  	case ssa.OpAMD64MULQU, ssa.OpAMD64MULLU:
   467  		// Arg[0] is already in AX as it's the only register we allow
   468  		// results lo in AX
   469  		p := s.Prog(v.Op.Asm())
   470  		p.From.Type = obj.TYPE_REG
   471  		p.From.Reg = v.Args[1].Reg()
   472  
   473  	case ssa.OpAMD64MULQU2:
   474  		// Arg[0] is already in AX as it's the only register we allow
   475  		// results hi in DX, lo in AX
   476  		p := s.Prog(v.Op.Asm())
   477  		p.From.Type = obj.TYPE_REG
   478  		p.From.Reg = v.Args[1].Reg()
   479  
   480  	case ssa.OpAMD64DIVQU2:
   481  		// Arg[0], Arg[1] are already in DX, AX, as they're the only registers we allow
   482  		// results q in AX, r in DX
   483  		p := s.Prog(v.Op.Asm())
   484  		p.From.Type = obj.TYPE_REG
   485  		p.From.Reg = v.Args[2].Reg()
   486  
   487  	case ssa.OpAMD64AVGQU:
   488  		// compute (x+y)/2 unsigned.
   489  		// Do a 64-bit add, the overflow goes into the carry.
   490  		// Shift right once and pull the carry back into the 63rd bit.
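        		// That is, an ADDQ followed by an RCRQ $1 on the result register.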
   491  		p := s.Prog(x86.AADDQ)
   492  		p.From.Type = obj.TYPE_REG
   493  		p.To.Type = obj.TYPE_REG
   494  		p.To.Reg = v.Reg()
   495  		p.From.Reg = v.Args[1].Reg()
   496  		p = s.Prog(x86.ARCRQ)
   497  		p.From.Type = obj.TYPE_CONST
   498  		p.From.Offset = 1
   499  		p.To.Type = obj.TYPE_REG
   500  		p.To.Reg = v.Reg()
   501  
   502  	case ssa.OpAMD64ADDQcarry, ssa.OpAMD64ADCQ:
   503  		r := v.Reg0()
   504  		r0 := v.Args[0].Reg()
   505  		r1 := v.Args[1].Reg()
   506  		switch r {
   507  		case r0:
   508  			p := s.Prog(v.Op.Asm())
   509  			p.From.Type = obj.TYPE_REG
   510  			p.From.Reg = r1
   511  			p.To.Type = obj.TYPE_REG
   512  			p.To.Reg = r
   513  		case r1:
   514  			p := s.Prog(v.Op.Asm())
   515  			p.From.Type = obj.TYPE_REG
   516  			p.From.Reg = r0
   517  			p.To.Type = obj.TYPE_REG
   518  			p.To.Reg = r
   519  		default:
   520  			v.Fatalf("output not in same register as an input %s", v.LongString())
   521  		}
   522  
   523  	case ssa.OpAMD64SUBQborrow, ssa.OpAMD64SBBQ:
   524  		p := s.Prog(v.Op.Asm())
   525  		p.From.Type = obj.TYPE_REG
   526  		p.From.Reg = v.Args[1].Reg()
   527  		p.To.Type = obj.TYPE_REG
   528  		p.To.Reg = v.Reg0()
   529  
   530  	case ssa.OpAMD64ADDQconstcarry, ssa.OpAMD64ADCQconst, ssa.OpAMD64SUBQconstborrow, ssa.OpAMD64SBBQconst:
   531  		p := s.Prog(v.Op.Asm())
   532  		p.From.Type = obj.TYPE_CONST
   533  		p.From.Offset = v.AuxInt
   534  		p.To.Type = obj.TYPE_REG
   535  		p.To.Reg = v.Reg0()
   536  
   537  	case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
   538  		r := v.Reg()
   539  		a := v.Args[0].Reg()
   540  		if r == a {
   541  			switch v.AuxInt {
   542  			case 1:
   543  				var asm obj.As
   544  				// The software optimization manual recommends add $1,reg.
   545  				// But inc/dec is 1 byte smaller. ICC always uses inc;
   546  				// Clang/GCC choose depending on flags, but prefer add.
   547  				// Experiments show that inc/dec is both a little faster
   548  				// and makes the binary a little smaller.
   549  				if v.Op == ssa.OpAMD64ADDQconst {
   550  					asm = x86.AINCQ
   551  				} else {
   552  					asm = x86.AINCL
   553  				}
   554  				p := s.Prog(asm)
   555  				p.To.Type = obj.TYPE_REG
   556  				p.To.Reg = r
   557  				return
   558  			case -1:
   559  				var asm obj.As
   560  				if v.Op == ssa.OpAMD64ADDQconst {
   561  					asm = x86.ADECQ
   562  				} else {
   563  					asm = x86.ADECL
   564  				}
   565  				p := s.Prog(asm)
   566  				p.To.Type = obj.TYPE_REG
   567  				p.To.Reg = r
   568  				return
   569  			case 0x80:
   570  				// 'SUBQ $-0x80, r' is shorter to encode than
   571  				// and functionally equivalent to 'ADDQ $0x80, r'.
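        				// (-0x80 fits in a sign-extended 8-bit immediate; +0x80 does not.)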
   572  				asm := x86.ASUBL
   573  				if v.Op == ssa.OpAMD64ADDQconst {
   574  					asm = x86.ASUBQ
   575  				}
   576  				p := s.Prog(asm)
   577  				p.From.Type = obj.TYPE_CONST
   578  				p.From.Offset = -0x80
   579  				p.To.Type = obj.TYPE_REG
   580  				p.To.Reg = r
   581  				return
   582  
   583  			}
   584  			p := s.Prog(v.Op.Asm())
   585  			p.From.Type = obj.TYPE_CONST
   586  			p.From.Offset = v.AuxInt
   587  			p.To.Type = obj.TYPE_REG
   588  			p.To.Reg = r
   589  			return
   590  		}
   591  		var asm obj.As
   592  		if v.Op == ssa.OpAMD64ADDQconst {
   593  			asm = x86.ALEAQ
   594  		} else {
   595  			asm = x86.ALEAL
   596  		}
   597  		p := s.Prog(asm)
   598  		p.From.Type = obj.TYPE_MEM
   599  		p.From.Reg = a
   600  		p.From.Offset = v.AuxInt
   601  		p.To.Type = obj.TYPE_REG
   602  		p.To.Reg = r
   603  
   604  	case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ,
   605  		ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT,
   606  		ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE,
   607  		ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT,
   608  		ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE,
   609  		ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE,
   610  		ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI,
   611  		ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS,
   612  		ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC,
   613  		ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS,
   614  		ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF,
   615  		ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF:
   616  		p := s.Prog(v.Op.Asm())
   617  		p.From.Type = obj.TYPE_REG
   618  		p.From.Reg = v.Args[1].Reg()
   619  		p.To.Type = obj.TYPE_REG
   620  		p.To.Reg = v.Reg()
   621  
   622  	case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF:
   623  		// Flag condition: ^ZERO || PARITY
   624  		// Generate:
   625  		//   CMOV*NE  SRC,DST
   626  		//   CMOV*PS  SRC,DST
   627  		p := s.Prog(v.Op.Asm())
   628  		p.From.Type = obj.TYPE_REG
   629  		p.From.Reg = v.Args[1].Reg()
   630  		p.To.Type = obj.TYPE_REG
   631  		p.To.Reg = v.Reg()
   632  		var q *obj.Prog
   633  		if v.Op == ssa.OpAMD64CMOVQNEF {
   634  			q = s.Prog(x86.ACMOVQPS)
   635  		} else if v.Op == ssa.OpAMD64CMOVLNEF {
   636  			q = s.Prog(x86.ACMOVLPS)
   637  		} else {
   638  			q = s.Prog(x86.ACMOVWPS)
   639  		}
   640  		q.From.Type = obj.TYPE_REG
   641  		q.From.Reg = v.Args[1].Reg()
   642  		q.To.Type = obj.TYPE_REG
   643  		q.To.Reg = v.Reg()
   644  
   645  	case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF:
   646  		// Flag condition: ZERO && !PARITY
   647  		// Generate:
   648  		//   MOV      SRC,TMP
   649  		//   CMOV*NE  DST,TMP
   650  		//   CMOV*PC  TMP,DST
   651  		//
   652  		// TODO(rasky): we could generate:
   653  		//   CMOV*NE  DST,SRC
   654  		//   CMOV*PC  SRC,DST
   655  		// But this requires a way for regalloc to know that SRC might be
   656  		// clobbered by this instruction.
   657  		t := v.RegTmp()
   658  		opregreg(s, moveByRegsWidth(t, v.Args[1].Reg(), v.Type.Size()), t, v.Args[1].Reg())
   659  
   660  		p := s.Prog(v.Op.Asm())
   661  		p.From.Type = obj.TYPE_REG
   662  		p.From.Reg = v.Reg()
   663  		p.To.Type = obj.TYPE_REG
   664  		p.To.Reg = t
   665  		var q *obj.Prog
   666  		if v.Op == ssa.OpAMD64CMOVQEQF {
   667  			q = s.Prog(x86.ACMOVQPC)
   668  		} else if v.Op == ssa.OpAMD64CMOVLEQF {
   669  			q = s.Prog(x86.ACMOVLPC)
   670  		} else {
   671  			q = s.Prog(x86.ACMOVWPC)
   672  		}
   673  		q.From.Type = obj.TYPE_REG
   674  		q.From.Reg = t
   675  		q.To.Type = obj.TYPE_REG
   676  		q.To.Reg = v.Reg()
   677  
   678  	case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
   679  		r := v.Reg()
   680  		p := s.Prog(v.Op.Asm())
   681  		p.From.Type = obj.TYPE_CONST
   682  		p.From.Offset = v.AuxInt
   683  		p.To.Type = obj.TYPE_REG
   684  		p.To.Reg = r
   685  		p.AddRestSourceReg(v.Args[0].Reg())
   686  
   687  	case ssa.OpAMD64ANDQconst:
   688  		asm := v.Op.Asm()
   689  		// If the constant is positive and fits into 32 bits, use ANDL.
   690  		// This saves a few bytes of encoding.
   691  		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
   692  			asm = x86.AANDL
   693  		}
   694  		p := s.Prog(asm)
   695  		p.From.Type = obj.TYPE_CONST
   696  		p.From.Offset = v.AuxInt
   697  		p.To.Type = obj.TYPE_REG
   698  		p.To.Reg = v.Reg()
   699  
   700  	case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
   701  		ssa.OpAMD64ANDLconst,
   702  		ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
   703  		ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
   704  		ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
   705  		ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
   706  		ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
   707  		ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
   708  		p := s.Prog(v.Op.Asm())
   709  		p.From.Type = obj.TYPE_CONST
   710  		p.From.Offset = v.AuxInt
   711  		p.To.Type = obj.TYPE_REG
   712  		p.To.Reg = v.Reg()
   713  	case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
   714  		r := v.Reg()
   715  		p := s.Prog(v.Op.Asm())
   716  		p.From.Type = obj.TYPE_REG
   717  		p.From.Reg = r
   718  		p.To.Type = obj.TYPE_REG
   719  		p.To.Reg = r
   720  	case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8,
   721  		ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8,
   722  		ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   723  		p := s.Prog(v.Op.Asm())
   724  		memIdx(&p.From, v)
   725  		o := v.Reg()
   726  		p.To.Type = obj.TYPE_REG
   727  		p.To.Reg = o
   728  		if v.AuxInt != 0 && v.Aux == nil {
   729  			// Emit an additional LEA to add the displacement instead of creating a slow 3 operand LEA.
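        			// For example, LEAQ 8(R1)(R2*4), R3 is emitted as
        			//   LEAQ    (R1)(R2*4), R3
        			//   LEAQ    8(R3), R3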
   730  			switch v.Op {
   731  			case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
   732  				p = s.Prog(x86.ALEAQ)
   733  			case ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8:
   734  				p = s.Prog(x86.ALEAL)
   735  			case ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
   736  				p = s.Prog(x86.ALEAW)
   737  			}
   738  			p.From.Type = obj.TYPE_MEM
   739  			p.From.Reg = o
   740  			p.To.Type = obj.TYPE_REG
   741  			p.To.Reg = o
   742  		}
   743  		ssagen.AddAux(&p.From, v)
   744  	case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL, ssa.OpAMD64LEAW:
   745  		p := s.Prog(v.Op.Asm())
   746  		p.From.Type = obj.TYPE_MEM
   747  		p.From.Reg = v.Args[0].Reg()
   748  		ssagen.AddAux(&p.From, v)
   749  		p.To.Type = obj.TYPE_REG
   750  		p.To.Reg = v.Reg()
   751  	case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
   752  		ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB,
   753  		ssa.OpAMD64BTL, ssa.OpAMD64BTQ:
   754  		opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg())
   755  	case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
   756  		// The Go assembler has swapped operands for UCOMISx relative to CMP;
   757  		// we must account for that right here.
   758  		opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg())
   759  	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
   760  		p := s.Prog(v.Op.Asm())
   761  		p.From.Type = obj.TYPE_REG
   762  		p.From.Reg = v.Args[0].Reg()
   763  		p.To.Type = obj.TYPE_CONST
   764  		p.To.Offset = v.AuxInt
   765  	case ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst,
   766  		ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst,
   767  		ssa.OpAMD64BTSQconst,
   768  		ssa.OpAMD64BTCQconst,
   769  		ssa.OpAMD64BTRQconst:
   770  		op := v.Op
   771  		if op == ssa.OpAMD64BTQconst && v.AuxInt < 32 {
   772  			// Emit 32-bit version because it's shorter
   773  			op = ssa.OpAMD64BTLconst
   774  		}
   775  		p := s.Prog(op.Asm())
   776  		p.From.Type = obj.TYPE_CONST
   777  		p.From.Offset = v.AuxInt
   778  		p.To.Type = obj.TYPE_REG
   779  		p.To.Reg = v.Args[0].Reg()
   780  	case ssa.OpAMD64CMPQload, ssa.OpAMD64CMPLload, ssa.OpAMD64CMPWload, ssa.OpAMD64CMPBload:
   781  		p := s.Prog(v.Op.Asm())
   782  		p.From.Type = obj.TYPE_MEM
   783  		p.From.Reg = v.Args[0].Reg()
   784  		ssagen.AddAux(&p.From, v)
   785  		p.To.Type = obj.TYPE_REG
   786  		p.To.Reg = v.Args[1].Reg()
   787  	case ssa.OpAMD64CMPQconstload, ssa.OpAMD64CMPLconstload, ssa.OpAMD64CMPWconstload, ssa.OpAMD64CMPBconstload:
   788  		sc := v.AuxValAndOff()
   789  		p := s.Prog(v.Op.Asm())
   790  		p.From.Type = obj.TYPE_MEM
   791  		p.From.Reg = v.Args[0].Reg()
   792  		ssagen.AddAux2(&p.From, v, sc.Off64())
   793  		p.To.Type = obj.TYPE_CONST
   794  		p.To.Offset = sc.Val64()
   795  	case ssa.OpAMD64CMPQloadidx8, ssa.OpAMD64CMPQloadidx1, ssa.OpAMD64CMPLloadidx4, ssa.OpAMD64CMPLloadidx1, ssa.OpAMD64CMPWloadidx2, ssa.OpAMD64CMPWloadidx1, ssa.OpAMD64CMPBloadidx1:
   796  		p := s.Prog(v.Op.Asm())
   797  		memIdx(&p.From, v)
   798  		ssagen.AddAux(&p.From, v)
   799  		p.To.Type = obj.TYPE_REG
   800  		p.To.Reg = v.Args[2].Reg()
   801  	case ssa.OpAMD64CMPQconstloadidx8, ssa.OpAMD64CMPQconstloadidx1, ssa.OpAMD64CMPLconstloadidx4, ssa.OpAMD64CMPLconstloadidx1, ssa.OpAMD64CMPWconstloadidx2, ssa.OpAMD64CMPWconstloadidx1, ssa.OpAMD64CMPBconstloadidx1:
   802  		sc := v.AuxValAndOff()
   803  		p := s.Prog(v.Op.Asm())
   804  		memIdx(&p.From, v)
   805  		ssagen.AddAux2(&p.From, v, sc.Off64())
   806  		p.To.Type = obj.TYPE_CONST
   807  		p.To.Offset = sc.Val64()
   808  	case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
   809  		x := v.Reg()
   810  
   811  		// If flags aren't live (indicated by v.Aux == nil),
   812  		// then we can rewrite MOV $0, AX into XOR AX, AX.
   813  		if v.AuxInt == 0 && v.Aux == nil {
   814  			opregreg(s, x86.AXORL, x, x)
   815  			break
   816  		}
   817  
   818  		asm := v.Op.Asm()
   819  		// Use MOVL to move a small constant into a register
   820  		// when the constant is positive and fits into 32 bits.
   821  		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
   822  			// The upper 32 bits are zeroed automatically when using MOVL.
   823  			asm = x86.AMOVL
   824  		}
   825  		p := s.Prog(asm)
   826  		p.From.Type = obj.TYPE_CONST
   827  		p.From.Offset = v.AuxInt
   828  		p.To.Type = obj.TYPE_REG
   829  		p.To.Reg = x
   830  
   831  	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
   832  		x := v.Reg()
   833  		if !isFPReg(x) && v.AuxInt == 0 && v.Aux == nil {
   834  			opregreg(s, x86.AXORL, x, x)
   835  			break
   836  		}
   837  		p := s.Prog(storeByRegWidth(x, v.Type.Size()))
   838  		p.From.Type = obj.TYPE_FCONST
   839  		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
   840  		p.To.Type = obj.TYPE_REG
   841  		p.To.Reg = x
   842  	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
   843  		ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
   844  		ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
   845  		p := s.Prog(v.Op.Asm())
   846  		p.From.Type = obj.TYPE_MEM
   847  		p.From.Reg = v.Args[0].Reg()
   848  		ssagen.AddAux(&p.From, v)
   849  		p.To.Type = obj.TYPE_REG
   850  		p.To.Reg = v.Reg()
   851  	case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1,
   852  		ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8, ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4, ssa.OpAMD64MOVWloadidx2,
   853  		ssa.OpAMD64MOVBELloadidx1, ssa.OpAMD64MOVBELloadidx4, ssa.OpAMD64MOVBELloadidx8, ssa.OpAMD64MOVBEQloadidx1, ssa.OpAMD64MOVBEQloadidx8:
   854  		p := s.Prog(v.Op.Asm())
   855  		memIdx(&p.From, v)
   856  		ssagen.AddAux(&p.From, v)
   857  		p.To.Type = obj.TYPE_REG
   858  		p.To.Reg = v.Reg()
   859  	case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
   860  		ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
   861  		ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify,
   862  		ssa.OpAMD64MOVBEQstore, ssa.OpAMD64MOVBELstore, ssa.OpAMD64MOVBEWstore:
   863  		p := s.Prog(v.Op.Asm())
   864  		p.From.Type = obj.TYPE_REG
   865  		p.From.Reg = v.Args[1].Reg()
   866  		p.To.Type = obj.TYPE_MEM
   867  		p.To.Reg = v.Args[0].Reg()
   868  		ssagen.AddAux(&p.To, v)
   869  	case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1,
   870  		ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2,
   871  		ssa.OpAMD64ADDLmodifyidx1, ssa.OpAMD64ADDLmodifyidx4, ssa.OpAMD64ADDLmodifyidx8, ssa.OpAMD64ADDQmodifyidx1, ssa.OpAMD64ADDQmodifyidx8,
   872  		ssa.OpAMD64SUBLmodifyidx1, ssa.OpAMD64SUBLmodifyidx4, ssa.OpAMD64SUBLmodifyidx8, ssa.OpAMD64SUBQmodifyidx1, ssa.OpAMD64SUBQmodifyidx8,
   873  		ssa.OpAMD64ANDLmodifyidx1, ssa.OpAMD64ANDLmodifyidx4, ssa.OpAMD64ANDLmodifyidx8, ssa.OpAMD64ANDQmodifyidx1, ssa.OpAMD64ANDQmodifyidx8,
   874  		ssa.OpAMD64ORLmodifyidx1, ssa.OpAMD64ORLmodifyidx4, ssa.OpAMD64ORLmodifyidx8, ssa.OpAMD64ORQmodifyidx1, ssa.OpAMD64ORQmodifyidx8,
   875  		ssa.OpAMD64XORLmodifyidx1, ssa.OpAMD64XORLmodifyidx4, ssa.OpAMD64XORLmodifyidx8, ssa.OpAMD64XORQmodifyidx1, ssa.OpAMD64XORQmodifyidx8,
   876  		ssa.OpAMD64MOVBEWstoreidx1, ssa.OpAMD64MOVBEWstoreidx2, ssa.OpAMD64MOVBELstoreidx1, ssa.OpAMD64MOVBELstoreidx4, ssa.OpAMD64MOVBELstoreidx8, ssa.OpAMD64MOVBEQstoreidx1, ssa.OpAMD64MOVBEQstoreidx8:
   877  		p := s.Prog(v.Op.Asm())
   878  		p.From.Type = obj.TYPE_REG
   879  		p.From.Reg = v.Args[2].Reg()
   880  		memIdx(&p.To, v)
   881  		ssagen.AddAux(&p.To, v)
   882  	case ssa.OpAMD64ADDQconstmodify, ssa.OpAMD64ADDLconstmodify:
   883  		sc := v.AuxValAndOff()
   884  		off := sc.Off64()
   885  		val := sc.Val()
   886  		if val == 1 || val == -1 {
   887  			var asm obj.As
   888  			if v.Op == ssa.OpAMD64ADDQconstmodify {
   889  				if val == 1 {
   890  					asm = x86.AINCQ
   891  				} else {
   892  					asm = x86.ADECQ
   893  				}
   894  			} else {
   895  				if val == 1 {
   896  					asm = x86.AINCL
   897  				} else {
   898  					asm = x86.ADECL
   899  				}
   900  			}
   901  			p := s.Prog(asm)
   902  			p.To.Type = obj.TYPE_MEM
   903  			p.To.Reg = v.Args[0].Reg()
   904  			ssagen.AddAux2(&p.To, v, off)
   905  			break
   906  		}
   907  		fallthrough
   908  	case ssa.OpAMD64ANDQconstmodify, ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify,
   909  		ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify,
   910  		ssa.OpAMD64BTSQconstmodify, ssa.OpAMD64BTRQconstmodify, ssa.OpAMD64BTCQconstmodify:
   911  		sc := v.AuxValAndOff()
   912  		off := sc.Off64()
   913  		val := sc.Val64()
   914  		p := s.Prog(v.Op.Asm())
   915  		p.From.Type = obj.TYPE_CONST
   916  		p.From.Offset = val
   917  		p.To.Type = obj.TYPE_MEM
   918  		p.To.Reg = v.Args[0].Reg()
   919  		ssagen.AddAux2(&p.To, v, off)
   920  
   921  	case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
   922  		p := s.Prog(v.Op.Asm())
   923  		p.From.Type = obj.TYPE_CONST
   924  		sc := v.AuxValAndOff()
   925  		p.From.Offset = sc.Val64()
   926  		p.To.Type = obj.TYPE_MEM
   927  		p.To.Reg = v.Args[0].Reg()
   928  		ssagen.AddAux2(&p.To, v, sc.Off64())
   929  	case ssa.OpAMD64MOVOstoreconst:
   930  		sc := v.AuxValAndOff()
   931  		if sc.Val() != 0 {
   932  			v.Fatalf("MOVO for non zero constants not implemented: %s", v.LongString())
   933  		}
   934  
   935  		if s.ABI != obj.ABIInternal {
   936  			// zero X15 manually
   937  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
   938  		}
   939  		p := s.Prog(v.Op.Asm())
   940  		p.From.Type = obj.TYPE_REG
   941  		p.From.Reg = x86.REG_X15
   942  		p.To.Type = obj.TYPE_MEM
   943  		p.To.Reg = v.Args[0].Reg()
   944  		ssagen.AddAux2(&p.To, v, sc.Off64())
   945  
   946  	case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1,
   947  		ssa.OpAMD64ADDLconstmodifyidx1, ssa.OpAMD64ADDLconstmodifyidx4, ssa.OpAMD64ADDLconstmodifyidx8, ssa.OpAMD64ADDQconstmodifyidx1, ssa.OpAMD64ADDQconstmodifyidx8,
   948  		ssa.OpAMD64ANDLconstmodifyidx1, ssa.OpAMD64ANDLconstmodifyidx4, ssa.OpAMD64ANDLconstmodifyidx8, ssa.OpAMD64ANDQconstmodifyidx1, ssa.OpAMD64ANDQconstmodifyidx8,
   949  		ssa.OpAMD64ORLconstmodifyidx1, ssa.OpAMD64ORLconstmodifyidx4, ssa.OpAMD64ORLconstmodifyidx8, ssa.OpAMD64ORQconstmodifyidx1, ssa.OpAMD64ORQconstmodifyidx8,
   950  		ssa.OpAMD64XORLconstmodifyidx1, ssa.OpAMD64XORLconstmodifyidx4, ssa.OpAMD64XORLconstmodifyidx8, ssa.OpAMD64XORQconstmodifyidx1, ssa.OpAMD64XORQconstmodifyidx8:
   951  		p := s.Prog(v.Op.Asm())
   952  		p.From.Type = obj.TYPE_CONST
   953  		sc := v.AuxValAndOff()
   954  		p.From.Offset = sc.Val64()
   955  		switch {
   956  		case p.As == x86.AADDQ && p.From.Offset == 1:
   957  			p.As = x86.AINCQ
   958  			p.From.Type = obj.TYPE_NONE
   959  		case p.As == x86.AADDQ && p.From.Offset == -1:
   960  			p.As = x86.ADECQ
   961  			p.From.Type = obj.TYPE_NONE
   962  		case p.As == x86.AADDL && p.From.Offset == 1:
   963  			p.As = x86.AINCL
   964  			p.From.Type = obj.TYPE_NONE
   965  		case p.As == x86.AADDL && p.From.Offset == -1:
   966  			p.As = x86.ADECL
   967  			p.From.Type = obj.TYPE_NONE
   968  		}
   969  		memIdx(&p.To, v)
   970  		ssagen.AddAux2(&p.To, v, sc.Off64())
   971  	case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
   972  		ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
   973  		ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS, ssa.OpAMD64VPBROADCASTB, ssa.OpAMD64PMOVMSKB:
   974  		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
   975  	case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
   976  		r := v.Reg()
   977  		// Break false dependency on destination register.
   978  		opregreg(s, x86.AXORPS, r, r)
   979  		opregreg(s, v.Op.Asm(), r, v.Args[0].Reg())
   980  	case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i, ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   981  		var p *obj.Prog
   982  		switch v.Op {
   983  		case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i:
   984  			p = s.Prog(x86.AMOVQ)
   985  		case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
   986  			p = s.Prog(x86.AMOVL)
   987  		}
   988  		p.From.Type = obj.TYPE_REG
   989  		p.From.Reg = v.Args[0].Reg()
   990  		p.To.Type = obj.TYPE_REG
   991  		p.To.Reg = v.Reg()
   992  	case ssa.OpAMD64ADDQload, ssa.OpAMD64ADDLload, ssa.OpAMD64SUBQload, ssa.OpAMD64SUBLload,
   993  		ssa.OpAMD64ANDQload, ssa.OpAMD64ANDLload, ssa.OpAMD64ORQload, ssa.OpAMD64ORLload,
   994  		ssa.OpAMD64XORQload, ssa.OpAMD64XORLload, ssa.OpAMD64ADDSDload, ssa.OpAMD64ADDSSload,
   995  		ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload,
   996  		ssa.OpAMD64DIVSDload, ssa.OpAMD64DIVSSload:
   997  		p := s.Prog(v.Op.Asm())
   998  		p.From.Type = obj.TYPE_MEM
   999  		p.From.Reg = v.Args[1].Reg()
  1000  		ssagen.AddAux(&p.From, v)
  1001  		p.To.Type = obj.TYPE_REG
  1002  		p.To.Reg = v.Reg()
  1003  	case ssa.OpAMD64ADDLloadidx1, ssa.OpAMD64ADDLloadidx4, ssa.OpAMD64ADDLloadidx8, ssa.OpAMD64ADDQloadidx1, ssa.OpAMD64ADDQloadidx8,
  1004  		ssa.OpAMD64SUBLloadidx1, ssa.OpAMD64SUBLloadidx4, ssa.OpAMD64SUBLloadidx8, ssa.OpAMD64SUBQloadidx1, ssa.OpAMD64SUBQloadidx8,
  1005  		ssa.OpAMD64ANDLloadidx1, ssa.OpAMD64ANDLloadidx4, ssa.OpAMD64ANDLloadidx8, ssa.OpAMD64ANDQloadidx1, ssa.OpAMD64ANDQloadidx8,
  1006  		ssa.OpAMD64ORLloadidx1, ssa.OpAMD64ORLloadidx4, ssa.OpAMD64ORLloadidx8, ssa.OpAMD64ORQloadidx1, ssa.OpAMD64ORQloadidx8,
  1007  		ssa.OpAMD64XORLloadidx1, ssa.OpAMD64XORLloadidx4, ssa.OpAMD64XORLloadidx8, ssa.OpAMD64XORQloadidx1, ssa.OpAMD64XORQloadidx8,
  1008  		ssa.OpAMD64ADDSSloadidx1, ssa.OpAMD64ADDSSloadidx4, ssa.OpAMD64ADDSDloadidx1, ssa.OpAMD64ADDSDloadidx8,
  1009  		ssa.OpAMD64SUBSSloadidx1, ssa.OpAMD64SUBSSloadidx4, ssa.OpAMD64SUBSDloadidx1, ssa.OpAMD64SUBSDloadidx8,
  1010  		ssa.OpAMD64MULSSloadidx1, ssa.OpAMD64MULSSloadidx4, ssa.OpAMD64MULSDloadidx1, ssa.OpAMD64MULSDloadidx8,
  1011  		ssa.OpAMD64DIVSSloadidx1, ssa.OpAMD64DIVSSloadidx4, ssa.OpAMD64DIVSDloadidx1, ssa.OpAMD64DIVSDloadidx8:
  1012  		p := s.Prog(v.Op.Asm())
  1013  
  1014  		r, i := v.Args[1].Reg(), v.Args[2].Reg()
  1015  		p.From.Type = obj.TYPE_MEM
  1016  		p.From.Scale = v.Op.Scale()
  1017  		if p.From.Scale == 1 && i == x86.REG_SP {
  1018  			r, i = i, r
  1019  		}
  1020  		p.From.Reg = r
  1021  		p.From.Index = i
  1022  
  1023  		ssagen.AddAux(&p.From, v)
  1024  		p.To.Type = obj.TYPE_REG
  1025  		p.To.Reg = v.Reg()
  1026  
  1027  	case ssa.OpAMD64LoweredZero:
  1028  		if s.ABI != obj.ABIInternal {
  1029  			// zero X15 manually
  1030  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1031  		}
  1032  		ptrReg := v.Args[0].Reg()
  1033  		n := v.AuxInt
  1034  		if n < 16 {
  1035  			v.Fatalf("Zero too small %d", n)
  1036  		}
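        		// zero16 zeroes the 16 bytes at ptrReg+off. Inside the closure body the
        		// name still resolves to the package-level zero16 helper, because the new
        		// variable is not yet in scope there. (move16 below uses the same pattern.)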
  1037  		zero16 := func(off int64) {
  1038  			zero16(s, ptrReg, off)
  1039  		}
  1040  
  1041  		// Generate zeroing instructions.
  1042  		var off int64
  1043  		for n >= 16 {
  1044  			zero16(off)
  1045  			off += 16
  1046  			n -= 16
  1047  		}
  1048  		if n != 0 {
  1049  			// use partially overlapped write.
  1050  			// TODO: n <= 8, use smaller write?
  1051  			zero16(off + n - 16)
  1052  		}
  1053  
  1054  	case ssa.OpAMD64LoweredZeroLoop:
  1055  		if s.ABI != obj.ABIInternal {
  1056  			// zero X15 manually
  1057  			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1058  		}
  1059  		ptrReg := v.Args[0].Reg()
  1060  		countReg := v.RegTmp()
  1061  		n := v.AuxInt
  1062  		loopSize := int64(64)
  1063  		if n < 3*loopSize {
  1064  			// - a loop count of 0 won't work.
  1065  			// - a loop count of 1 is useless.
  1066  			// - a loop count of 2 is a code size ~tie
  1067  			//     4 instructions to implement the loop
  1068  			//     4 instructions in the loop body
  1069  			//   vs
  1070  			//     8 instructions in the straightline code
  1071  			//   Might as well use straightline code.
  1072  			v.Fatalf("ZeroLoop size too small %d", n)
  1073  		}
  1074  		zero16 := func(off int64) {
  1075  			zero16(s, ptrReg, off)
  1076  		}
  1077  
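        		// The emitted code has the shape
        		//   MOVL    $(n/loopSize), countReg
        		// head:
        		//   (loopSize/16 zero16 stores)
        		//   ADDQ    $loopSize, ptrReg
        		//   DECL    countReg
        		//   JNE     head
        		// followed by straight-line stores for any remaining tail.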
  1078  		// Put iteration count in a register.
  1079  		//   MOVL    $n, countReg
  1080  		p := s.Prog(x86.AMOVL)
  1081  		p.From.Type = obj.TYPE_CONST
  1082  		p.From.Offset = n / loopSize
  1083  		p.To.Type = obj.TYPE_REG
  1084  		p.To.Reg = countReg
  1085  		cntInit := p
  1086  
  1087  		// Zero loopSize bytes starting at ptrReg.
  1088  		for i := range loopSize / 16 {
  1089  			zero16(i * 16)
  1090  		}
  1091  		//   ADDQ    $loopSize, ptrReg
  1092  		p = s.Prog(x86.AADDQ)
  1093  		p.From.Type = obj.TYPE_CONST
  1094  		p.From.Offset = loopSize
  1095  		p.To.Type = obj.TYPE_REG
  1096  		p.To.Reg = ptrReg
  1097  		//   DECL    countReg
  1098  		p = s.Prog(x86.ADECL)
  1099  		p.To.Type = obj.TYPE_REG
  1100  		p.To.Reg = countReg
  1101  		// Jump to first instruction in loop if we're not done yet.
  1102  		//   JNE     head
  1103  		p = s.Prog(x86.AJNE)
  1104  		p.To.Type = obj.TYPE_BRANCH
  1105  		p.To.SetTarget(cntInit.Link)
  1106  
  1107  		// Multiples of the loop size are now done.
  1108  		n %= loopSize
  1109  
  1110  		// Write any fractional portion.
  1111  		var off int64
  1112  		for n >= 16 {
  1113  			zero16(off)
  1114  			off += 16
  1115  			n -= 16
  1116  		}
  1117  		if n != 0 {
  1118  			// Use partially-overlapping write.
  1119  			// TODO: n <= 8, use smaller write?
  1120  			zero16(off + n - 16)
  1121  		}
  1122  
  1123  	case ssa.OpAMD64LoweredMove:
  1124  		dstReg := v.Args[0].Reg()
  1125  		srcReg := v.Args[1].Reg()
  1126  		if dstReg == srcReg {
  1127  			break
  1128  		}
  1129  		tmpReg := int16(x86.REG_X14)
  1130  		n := v.AuxInt
  1131  		if n < 16 {
  1132  			v.Fatalf("Move too small %d", n)
  1133  		}
  1134  		// move 16 bytes from srcReg+off to dstReg+off.
  1135  		move16 := func(off int64) {
  1136  			move16(s, srcReg, dstReg, tmpReg, off)
  1137  		}
  1138  
  1139  		// Generate copying instructions.
  1140  		var off int64
  1141  		for n >= 16 {
  1142  			move16(off)
  1143  			off += 16
  1144  			n -= 16
  1145  		}
  1146  		if n != 0 {
  1147  			// use partially overlapped read/write.
  1148  			// TODO: use smaller operations when we can?
  1149  			move16(off + n - 16)
  1150  		}
  1151  
  1152  	case ssa.OpAMD64LoweredMoveLoop:
  1153  		dstReg := v.Args[0].Reg()
  1154  		srcReg := v.Args[1].Reg()
  1155  		if dstReg == srcReg {
  1156  			break
  1157  		}
  1158  		countReg := v.RegTmp()
  1159  		tmpReg := int16(x86.REG_X14)
  1160  		n := v.AuxInt
  1161  		loopSize := int64(64)
  1162  		if n < 3*loopSize {
  1163  			// - a loop count of 0 won't work.
  1164  			// - a loop count of 1 is useless.
  1165  			// - a loop count of 2 is a code size ~tie
  1166  			//     4 instructions to implement the loop
  1167  			//     4 instructions in the loop body
  1168  			//   vs
  1169  			//     8 instructions in the straightline code
  1170  			//   Might as well use straightline code.
  1171  			v.Fatalf("ZeroLoop size too small %d", n)
  1172  		}
  1173  		// move 16 bytes from srcReg+off to dstReg+off.
  1174  		move16 := func(off int64) {
  1175  			move16(s, srcReg, dstReg, tmpReg, off)
  1176  		}
  1177  
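        		// The emitted loop has the same shape as in LoweredZeroLoop, except that
        		// each iteration copies loopSize bytes and advances both srcReg and dstReg.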
  1178  		// Put iteration count in a register.
  1179  		//   MOVL    $n, countReg
  1180  		p := s.Prog(x86.AMOVL)
  1181  		p.From.Type = obj.TYPE_CONST
  1182  		p.From.Offset = n / loopSize
  1183  		p.To.Type = obj.TYPE_REG
  1184  		p.To.Reg = countReg
  1185  		cntInit := p
  1186  
  1187  		// Copy loopSize bytes starting at srcReg to dstReg.
  1188  		for i := range loopSize / 16 {
  1189  			move16(i * 16)
  1190  		}
  1191  		//   ADDQ    $loopSize, srcReg
  1192  		p = s.Prog(x86.AADDQ)
  1193  		p.From.Type = obj.TYPE_CONST
  1194  		p.From.Offset = loopSize
  1195  		p.To.Type = obj.TYPE_REG
  1196  		p.To.Reg = srcReg
  1197  		//   ADDQ    $loopSize, dstReg
  1198  		p = s.Prog(x86.AADDQ)
  1199  		p.From.Type = obj.TYPE_CONST
  1200  		p.From.Offset = loopSize
  1201  		p.To.Type = obj.TYPE_REG
  1202  		p.To.Reg = dstReg
  1203  		//   DECL    countReg
  1204  		p = s.Prog(x86.ADECL)
  1205  		p.To.Type = obj.TYPE_REG
  1206  		p.To.Reg = countReg
  1207  		// Jump to loop header if we're not done yet.
  1208  		//   JNE     head
  1209  		p = s.Prog(x86.AJNE)
  1210  		p.To.Type = obj.TYPE_BRANCH
  1211  		p.To.SetTarget(cntInit.Link)
  1212  
  1213  		// Multiples of the loop size are now done.
  1214  		n %= loopSize
  1215  
  1216  		// Copy any fractional portion.
  1217  		var off int64
  1218  		for n >= 16 {
  1219  			move16(off)
  1220  			off += 16
  1221  			n -= 16
  1222  		}
  1223  		if n != 0 {
  1224  			// Use partially-overlapping copy.
  1225  			move16(off + n - 16)
  1226  		}
  1227  
  1228  	case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
  1229  		if v.Type.IsMemory() {
  1230  			return
  1231  		}
  1232  		arg := v.Args[0]
  1233  		x := arg.Reg()
  1234  		y := v.Reg()
  1235  		if v.Type.IsSIMD() {
  1236  			x = simdOrMaskReg(arg)
  1237  			y = simdOrMaskReg(v)
  1238  		}
  1239  		if x != y {
  1240  			width := v.Type.Size()
  1241  			if width == 8 && isGPReg(y) && ssa.ZeroUpper32Bits(arg, 3) {
  1242  				// The source was naturally zext-ed from 32 to 64 bits,
  1243  				// but we are asked to do a full 64-bit copy.
  1244  				// Save the REX prefix byte in I-CACHE by using a 32-bit move,
  1245  				// since it zeroes the upper 32 bits anyway.
  1246  				width = 4
  1247  			}
  1248  			opregreg(s, moveByRegsWidth(y, x, width), y, x)
  1249  		}
  1250  	case ssa.OpLoadReg:
  1251  		if v.Type.IsFlags() {
  1252  			v.Fatalf("load flags not implemented: %v", v.LongString())
  1253  			return
  1254  		}
  1255  		r := v.Reg()
  1256  		p := s.Prog(loadByRegWidth(r, v.Type.Size()))
  1257  		ssagen.AddrAuto(&p.From, v.Args[0])
  1258  		p.To.Type = obj.TYPE_REG
  1259  		if v.Type.IsSIMD() {
  1260  			r = simdOrMaskReg(v)
  1261  		}
  1262  		p.To.Reg = r
  1263  
  1264  	case ssa.OpStoreReg:
  1265  		if v.Type.IsFlags() {
  1266  			v.Fatalf("store flags not implemented: %v", v.LongString())
  1267  			return
  1268  		}
  1269  		r := v.Args[0].Reg()
  1270  		if v.Type.IsSIMD() {
  1271  			r = simdOrMaskReg(v.Args[0])
  1272  		}
  1273  		p := s.Prog(storeByRegWidth(r, v.Type.Size()))
  1274  		p.From.Type = obj.TYPE_REG
  1275  		p.From.Reg = r
  1276  		ssagen.AddrAuto(&p.To, v)
  1277  	case ssa.OpAMD64LoweredHasCPUFeature:
  1278  		p := s.Prog(x86.AMOVBLZX)
  1279  		p.From.Type = obj.TYPE_MEM
  1280  		ssagen.AddAux(&p.From, v)
  1281  		p.To.Type = obj.TYPE_REG
  1282  		p.To.Reg = v.Reg()
  1283  	case ssa.OpArgIntReg, ssa.OpArgFloatReg:
  1284  		// The assembler needs to wrap the entry safepoint/stack growth code with spill/unspill of register arguments.
  1285  		// The loop below runs only once per function, since RegArgs is cleared afterwards.
  1286  		for _, ap := range v.Block.Func.RegArgs {
  1287  			// Pass the spill/unspill information along to the assembler, offset by size of return PC pushed on stack.
  1288  			addr := ssagen.SpillSlotAddr(ap, x86.REG_SP, v.Block.Func.Config.PtrSize)
  1289  			reg := ap.Reg
  1290  			t := ap.Type
  1291  			sz := t.Size()
  1292  			if t.IsSIMD() {
  1293  				reg = simdRegBySize(reg, sz)
  1294  			}
  1295  			s.FuncInfo().AddSpill(
  1296  				obj.RegSpill{Reg: reg, Addr: addr, Unspill: loadByRegWidth(reg, sz), Spill: storeByRegWidth(reg, sz)})
  1297  		}
  1298  		v.Block.Func.RegArgs = nil
  1299  		ssagen.CheckArgReg(v)
  1300  	case ssa.OpAMD64LoweredGetClosurePtr:
  1301  		// Closure pointer is DX.
  1302  		ssagen.CheckLoweredGetClosurePtr(v)
  1303  	case ssa.OpAMD64LoweredGetG:
  1304  		if s.ABI == obj.ABIInternal {
  1305  			v.Fatalf("LoweredGetG should not appear in ABIInternal")
  1306  		}
  1307  		r := v.Reg()
  1308  		getgFromTLS(s, r)
  1309  	case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
  1310  		if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
  1311  			// zeroing X15 when entering ABIInternal from ABI0
  1312  			zeroX15(s)
  1313  			// set G register from TLS
  1314  			getgFromTLS(s, x86.REG_R14)
  1315  		}
  1316  		if v.Op == ssa.OpAMD64CALLtail {
  1317  			s.TailCall(v)
  1318  			break
  1319  		}
  1320  		s.Call(v)
  1321  		if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
  1322  			// zeroing X15 when returning to ABIInternal from an ABI0 call
  1323  			zeroX15(s)
  1324  			// set G register from TLS
  1325  			getgFromTLS(s, x86.REG_R14)
  1326  		}
  1327  	case ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter:
  1328  		s.Call(v)
  1329  
  1330  	case ssa.OpAMD64LoweredGetCallerPC:
  1331  		p := s.Prog(x86.AMOVQ)
  1332  		p.From.Type = obj.TYPE_MEM
  1333  		p.From.Offset = -8 // PC is stored 8 bytes below first parameter.
  1334  		p.From.Name = obj.NAME_PARAM
  1335  		p.To.Type = obj.TYPE_REG
  1336  		p.To.Reg = v.Reg()
  1337  
  1338  	case ssa.OpAMD64LoweredGetCallerSP:
  1339  		// caller's SP is the address of the first arg
  1340  		mov := x86.AMOVQ
  1341  		if types.PtrSize == 4 {
  1342  			mov = x86.AMOVL
  1343  		}
  1344  		p := s.Prog(mov)
  1345  		p.From.Type = obj.TYPE_ADDR
  1346  		p.From.Offset = -base.Ctxt.Arch.FixedFrameSize // 0 on amd64, just to be consistent with other architectures
  1347  		p.From.Name = obj.NAME_PARAM
  1348  		p.To.Type = obj.TYPE_REG
  1349  		p.To.Reg = v.Reg()
  1350  
  1351  	case ssa.OpAMD64LoweredWB:
  1352  		p := s.Prog(obj.ACALL)
  1353  		p.To.Type = obj.TYPE_MEM
  1354  		p.To.Name = obj.NAME_EXTERN
  1355  		// AuxInt encodes how many buffer entries we need.
  1356  		p.To.Sym = ir.Syms.GCWriteBarrier[v.AuxInt-1]
  1357  
  1358  	case ssa.OpAMD64LoweredPanicBoundsRR, ssa.OpAMD64LoweredPanicBoundsRC, ssa.OpAMD64LoweredPanicBoundsCR, ssa.OpAMD64LoweredPanicBoundsCC:
  1359  		// Compute the constant we put in the PCData entry for this call.
  1360  		code, signed := ssa.BoundsKind(v.AuxInt).Code()
  1361  		xIsReg := false
  1362  		yIsReg := false
  1363  		xVal := 0
  1364  		yVal := 0
  1365  		switch v.Op {
  1366  		case ssa.OpAMD64LoweredPanicBoundsRR:
  1367  			xIsReg = true
  1368  			xVal = int(v.Args[0].Reg() - x86.REG_AX)
  1369  			yIsReg = true
  1370  			yVal = int(v.Args[1].Reg() - x86.REG_AX)
  1371  		case ssa.OpAMD64LoweredPanicBoundsRC:
  1372  			xIsReg = true
  1373  			xVal = int(v.Args[0].Reg() - x86.REG_AX)
  1374  			c := v.Aux.(ssa.PanicBoundsC).C
  1375  			if c >= 0 && c <= abi.BoundsMaxConst {
  1376  				yVal = int(c)
  1377  			} else {
  1378  				// Move constant to a register
  1379  				yIsReg = true
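        				// Pick a register (numbered relative to AX) that does not collide with x's.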
  1380  				if yVal == xVal {
  1381  					yVal = 1
  1382  				}
  1383  				p := s.Prog(x86.AMOVQ)
  1384  				p.From.Type = obj.TYPE_CONST
  1385  				p.From.Offset = c
  1386  				p.To.Type = obj.TYPE_REG
  1387  				p.To.Reg = x86.REG_AX + int16(yVal)
  1388  			}
  1389  		case ssa.OpAMD64LoweredPanicBoundsCR:
  1390  			yIsReg = true
  1391  			yVal = int(v.Args[0].Reg() - x86.REG_AX)
  1392  			c := v.Aux.(ssa.PanicBoundsC).C
  1393  			if c >= 0 && c <= abi.BoundsMaxConst {
  1394  				xVal = int(c)
  1395  			} else {
  1396  				// Move constant to a register
  1397  				xIsReg = true
  1398  				if xVal == yVal {
  1399  					xVal = 1
  1400  				}
  1401  				p := s.Prog(x86.AMOVQ)
  1402  				p.From.Type = obj.TYPE_CONST
  1403  				p.From.Offset = c
  1404  				p.To.Type = obj.TYPE_REG
  1405  				p.To.Reg = x86.REG_AX + int16(xVal)
  1406  			}
  1407  		case ssa.OpAMD64LoweredPanicBoundsCC:
  1408  			c := v.Aux.(ssa.PanicBoundsCC).Cx
  1409  			if c >= 0 && c <= abi.BoundsMaxConst {
  1410  				xVal = int(c)
  1411  			} else {
  1412  				// Move constant to a register
  1413  				xIsReg = true
  1414  				p := s.Prog(x86.AMOVQ)
  1415  				p.From.Type = obj.TYPE_CONST
  1416  				p.From.Offset = c
  1417  				p.To.Type = obj.TYPE_REG
  1418  				p.To.Reg = x86.REG_AX + int16(xVal)
  1419  			}
  1420  			c = v.Aux.(ssa.PanicBoundsCC).Cy
  1421  			if c >= 0 && c <= abi.BoundsMaxConst {
  1422  				yVal = int(c)
  1423  			} else {
  1424  				// Move constant to a register
  1425  				yIsReg = true
  1426  				yVal = 1
  1427  				p := s.Prog(x86.AMOVQ)
  1428  				p.From.Type = obj.TYPE_CONST
  1429  				p.From.Offset = c
  1430  				p.To.Type = obj.TYPE_REG
  1431  				p.To.Reg = x86.REG_AX + int16(yVal)
  1432  			}
  1433  		}
  1434  		c := abi.BoundsEncode(code, signed, xIsReg, yIsReg, xVal, yVal)
  1435  
  1436  		p := s.Prog(obj.APCDATA)
  1437  		p.From.SetConst(abi.PCDATA_PanicBounds)
  1438  		p.To.SetConst(int64(c))
  1439  		p = s.Prog(obj.ACALL)
  1440  		p.To.Type = obj.TYPE_MEM
  1441  		p.To.Name = obj.NAME_EXTERN
  1442  		p.To.Sym = ir.Syms.PanicBounds
  1443  
  1444  	case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
  1445  		ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
  1446  		ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
  1447  		p := s.Prog(v.Op.Asm())
  1448  		p.To.Type = obj.TYPE_REG
  1449  		p.To.Reg = v.Reg()
  1450  
  1451  	case ssa.OpAMD64NEGLflags:
  1452  		p := s.Prog(v.Op.Asm())
  1453  		p.To.Type = obj.TYPE_REG
  1454  		p.To.Reg = v.Reg0()
  1455  
  1456  	case ssa.OpAMD64ADDQconstflags, ssa.OpAMD64ADDLconstflags:
  1457  		p := s.Prog(v.Op.Asm())
  1458  		p.From.Type = obj.TYPE_CONST
  1459  		p.From.Offset = v.AuxInt
  1460  		// Note: the inc/dec instructions do not modify
  1461  		// the carry flag like add $1 / sub $1 do.
  1462  		// We currently never use the CF/OF flags from
  1463  		// these instructions, so that is ok.
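        		// For example, ADDQconstflags with AuxInt == 1 is emitted below as INCQ
        		// with no constant operand, rather than ADDQ $1.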
  1464  		switch {
  1465  		case p.As == x86.AADDQ && p.From.Offset == 1:
  1466  			p.As = x86.AINCQ
  1467  			p.From.Type = obj.TYPE_NONE
  1468  		case p.As == x86.AADDQ && p.From.Offset == -1:
  1469  			p.As = x86.ADECQ
  1470  			p.From.Type = obj.TYPE_NONE
  1471  		case p.As == x86.AADDL && p.From.Offset == 1:
  1472  			p.As = x86.AINCL
  1473  			p.From.Type = obj.TYPE_NONE
  1474  		case p.As == x86.AADDL && p.From.Offset == -1:
  1475  			p.As = x86.ADECL
  1476  			p.From.Type = obj.TYPE_NONE
  1477  		}
  1478  		p.To.Type = obj.TYPE_REG
  1479  		p.To.Reg = v.Reg0()
  1480  
  1481  	case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
  1482  		p := s.Prog(v.Op.Asm())
  1483  		p.From.Type = obj.TYPE_REG
  1484  		p.From.Reg = v.Args[0].Reg()
  1485  		p.To.Type = obj.TYPE_REG
  1486  		switch v.Op {
  1487  		case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ:
  1488  			p.To.Reg = v.Reg0()
  1489  		case ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
  1490  			p.To.Reg = v.Reg()
  1491  		}
  1492  	case ssa.OpAMD64LoweredRound32F, ssa.OpAMD64LoweredRound64F:
  1493  		// input is already rounded
  1494  	case ssa.OpAMD64ROUNDSD:
  1495  		p := s.Prog(v.Op.Asm())
  1496  		val := v.AuxInt
  1497  		// 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc
  1498  		if val < 0 || val > 3 {
  1499  			v.Fatalf("Invalid rounding mode")
  1500  		}
  1501  		p.From.Offset = val
  1502  		p.From.Type = obj.TYPE_CONST
  1503  		p.AddRestSourceReg(v.Args[0].Reg())
  1504  		p.To.Type = obj.TYPE_REG
  1505  		p.To.Reg = v.Reg()
  1506  	case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL,
  1507  		ssa.OpAMD64TZCNTQ, ssa.OpAMD64TZCNTL,
  1508  		ssa.OpAMD64LZCNTQ, ssa.OpAMD64LZCNTL:
  1509  		if v.Args[0].Reg() != v.Reg() {
  1510  			// POPCNT/TZCNT/LZCNT have a false dependency on the destination register on Intel CPUs.
  1511  			// The TZCNT/LZCNT problem affects pre-Skylake models. See discussion at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62011#c7.
  1512  			// Xor register with itself to break the dependency.
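        			// XOR of a register with itself is a recognized zeroing idiom and does not
        			// itself create a dependency on the register's previous value.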
  1513  			opregreg(s, x86.AXORL, v.Reg(), v.Reg())
  1514  		}
  1515  		p := s.Prog(v.Op.Asm())
  1516  		p.From.Type = obj.TYPE_REG
  1517  		p.From.Reg = v.Args[0].Reg()
  1518  		p.To.Type = obj.TYPE_REG
  1519  		p.To.Reg = v.Reg()
  1520  
  1521  	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
  1522  		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
  1523  		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
  1524  		ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
  1525  		ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
  1526  		ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
  1527  		ssa.OpAMD64SETA, ssa.OpAMD64SETAE,
  1528  		ssa.OpAMD64SETO:
  1529  		p := s.Prog(v.Op.Asm())
  1530  		p.To.Type = obj.TYPE_REG
  1531  		p.To.Reg = v.Reg()
  1532  
  1533  	case ssa.OpAMD64SETEQstore, ssa.OpAMD64SETNEstore,
  1534  		ssa.OpAMD64SETLstore, ssa.OpAMD64SETLEstore,
  1535  		ssa.OpAMD64SETGstore, ssa.OpAMD64SETGEstore,
  1536  		ssa.OpAMD64SETBstore, ssa.OpAMD64SETBEstore,
  1537  		ssa.OpAMD64SETAstore, ssa.OpAMD64SETAEstore:
  1538  		p := s.Prog(v.Op.Asm())
  1539  		p.To.Type = obj.TYPE_MEM
  1540  		p.To.Reg = v.Args[0].Reg()
  1541  		ssagen.AddAux(&p.To, v)
  1542  
  1543  	case ssa.OpAMD64SETEQstoreidx1, ssa.OpAMD64SETNEstoreidx1,
  1544  		ssa.OpAMD64SETLstoreidx1, ssa.OpAMD64SETLEstoreidx1,
  1545  		ssa.OpAMD64SETGstoreidx1, ssa.OpAMD64SETGEstoreidx1,
  1546  		ssa.OpAMD64SETBstoreidx1, ssa.OpAMD64SETBEstoreidx1,
  1547  		ssa.OpAMD64SETAstoreidx1, ssa.OpAMD64SETAEstoreidx1:
  1548  		p := s.Prog(v.Op.Asm())
  1549  		memIdx(&p.To, v)
  1550  		ssagen.AddAux(&p.To, v)
  1551  
  1552  	case ssa.OpAMD64SETNEF:
  1553  		t := v.RegTmp()
  1554  		p := s.Prog(v.Op.Asm())
  1555  		p.To.Type = obj.TYPE_REG
  1556  		p.To.Reg = v.Reg()
  1557  		q := s.Prog(x86.ASETPS)
  1558  		q.To.Type = obj.TYPE_REG
  1559  		q.To.Reg = t
  1560  		// ORL avoids a partial register write and is smaller than ORQ, which the old compiler used.
  1561  		opregreg(s, x86.AORL, v.Reg(), t)
  1562  
  1563  	case ssa.OpAMD64SETEQF:
  1564  		t := v.RegTmp()
  1565  		p := s.Prog(v.Op.Asm())
  1566  		p.To.Type = obj.TYPE_REG
  1567  		p.To.Reg = v.Reg()
  1568  		q := s.Prog(x86.ASETPC)
  1569  		q.To.Type = obj.TYPE_REG
  1570  		q.To.Reg = t
  1571  		// ANDL avoids a partial register write and is smaller than ANDQ, which the old compiler used.
  1572  		opregreg(s, x86.AANDL, v.Reg(), t)
  1573  
  1574  	case ssa.OpAMD64InvertFlags:
  1575  		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
  1576  	case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT:
  1577  		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
  1578  	case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64:
  1579  		v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString())
  1580  	case ssa.OpAMD64REPSTOSQ:
  1581  		s.Prog(x86.AREP)
  1582  		s.Prog(x86.ASTOSQ)
  1583  	case ssa.OpAMD64REPMOVSQ:
  1584  		s.Prog(x86.AREP)
  1585  		s.Prog(x86.AMOVSQ)
  1586  	case ssa.OpAMD64LoweredNilCheck:
  1587  		// Issue a load which will fault if the input is nil.
  1588  		// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
  1589  		// Should we use the 3-byte TESTB $0, (reg) instead? It is larger
  1590  		// but it doesn't have false dependency on AX.
  1591  		// but it doesn't have a false dependency on AX.
  1592  		// That trades clobbering flags for clobbering a register.
  1593  		p := s.Prog(x86.ATESTB)
  1594  		p.From.Type = obj.TYPE_REG
  1595  		p.From.Reg = x86.REG_AX
  1596  		p.To.Type = obj.TYPE_MEM
  1597  		p.To.Reg = v.Args[0].Reg()
  1598  		if logopt.Enabled() {
  1599  			logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name)
  1600  		}
  1601  		if base.Debug.Nil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
  1602  			base.WarnfAt(v.Pos, "generated nil check")
  1603  		}
  1604  	case ssa.OpAMD64MOVBatomicload, ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload:
  1605  		p := s.Prog(v.Op.Asm())
  1606  		p.From.Type = obj.TYPE_MEM
  1607  		p.From.Reg = v.Args[0].Reg()
  1608  		ssagen.AddAux(&p.From, v)
  1609  		p.To.Type = obj.TYPE_REG
  1610  		p.To.Reg = v.Reg0()
  1611  	case ssa.OpAMD64XCHGB, ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ:
  1612  		p := s.Prog(v.Op.Asm())
  1613  		p.From.Type = obj.TYPE_REG
  1614  		p.From.Reg = v.Reg0()
  1615  		p.To.Type = obj.TYPE_MEM
  1616  		p.To.Reg = v.Args[1].Reg()
  1617  		ssagen.AddAux(&p.To, v)
  1618  	case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock:
  1619  		s.Prog(x86.ALOCK)
  1620  		p := s.Prog(v.Op.Asm())
  1621  		p.From.Type = obj.TYPE_REG
  1622  		p.From.Reg = v.Reg0()
  1623  		p.To.Type = obj.TYPE_MEM
  1624  		p.To.Reg = v.Args[1].Reg()
  1625  		ssagen.AddAux(&p.To, v)
  1626  	case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock:
  1627  		if v.Args[1].Reg() != x86.REG_AX {
  1628  			v.Fatalf("input[1] not in AX %s", v.LongString())
  1629  		}
  1630  		s.Prog(x86.ALOCK)
  1631  		p := s.Prog(v.Op.Asm())
  1632  		p.From.Type = obj.TYPE_REG
  1633  		p.From.Reg = v.Args[2].Reg()
  1634  		p.To.Type = obj.TYPE_MEM
  1635  		p.To.Reg = v.Args[0].Reg()
  1636  		ssagen.AddAux(&p.To, v)
  1637  		p = s.Prog(x86.ASETEQ)
  1638  		p.To.Type = obj.TYPE_REG
  1639  		p.To.Reg = v.Reg0()
  1640  	case ssa.OpAMD64ANDBlock, ssa.OpAMD64ANDLlock, ssa.OpAMD64ANDQlock, ssa.OpAMD64ORBlock, ssa.OpAMD64ORLlock, ssa.OpAMD64ORQlock:
  1641  		// Atomic memory operations that don't need to return the old value.
  1642  		s.Prog(x86.ALOCK)
  1643  		p := s.Prog(v.Op.Asm())
  1644  		p.From.Type = obj.TYPE_REG
  1645  		p.From.Reg = v.Args[1].Reg()
  1646  		p.To.Type = obj.TYPE_MEM
  1647  		p.To.Reg = v.Args[0].Reg()
  1648  		ssagen.AddAux(&p.To, v)
  1649  	case ssa.OpAMD64LoweredAtomicAnd64, ssa.OpAMD64LoweredAtomicOr64, ssa.OpAMD64LoweredAtomicAnd32, ssa.OpAMD64LoweredAtomicOr32:
  1650  		// Atomic memory operations that need to return the old value.
  1651  		// We need to do these with compare-and-exchange to get access to the old value.
  1652  		// loop:
  1653  		// MOVQ mask, tmp
  1654  		// MOVQ (addr), AX
  1655  		// ANDQ AX, tmp
  1656  		// LOCK CMPXCHGQ tmp, (addr) : note that AX is the implicit old value to compare against
  1657  		// JNE loop
  1658  		// : result in AX
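        		// The 32-bit variants emit the same loop using MOVL/ANDL/ORL and CMPXCHGL.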
  1659  		mov := x86.AMOVQ
  1660  		op := x86.AANDQ
  1661  		cmpxchg := x86.ACMPXCHGQ
  1662  		switch v.Op {
  1663  		case ssa.OpAMD64LoweredAtomicOr64:
  1664  			op = x86.AORQ
  1665  		case ssa.OpAMD64LoweredAtomicAnd32:
  1666  			mov = x86.AMOVL
  1667  			op = x86.AANDL
  1668  			cmpxchg = x86.ACMPXCHGL
  1669  		case ssa.OpAMD64LoweredAtomicOr32:
  1670  			mov = x86.AMOVL
  1671  			op = x86.AORL
  1672  			cmpxchg = x86.ACMPXCHGL
  1673  		}
  1674  		addr := v.Args[0].Reg()
  1675  		mask := v.Args[1].Reg()
  1676  		tmp := v.RegTmp()
  1677  		p1 := s.Prog(mov)
  1678  		p1.From.Type = obj.TYPE_REG
  1679  		p1.From.Reg = mask
  1680  		p1.To.Type = obj.TYPE_REG
  1681  		p1.To.Reg = tmp
  1682  		p2 := s.Prog(mov)
  1683  		p2.From.Type = obj.TYPE_MEM
  1684  		p2.From.Reg = addr
  1685  		ssagen.AddAux(&p2.From, v)
  1686  		p2.To.Type = obj.TYPE_REG
  1687  		p2.To.Reg = x86.REG_AX
  1688  		p3 := s.Prog(op)
  1689  		p3.From.Type = obj.TYPE_REG
  1690  		p3.From.Reg = x86.REG_AX
  1691  		p3.To.Type = obj.TYPE_REG
  1692  		p3.To.Reg = tmp
  1693  		s.Prog(x86.ALOCK)
  1694  		p5 := s.Prog(cmpxchg)
  1695  		p5.From.Type = obj.TYPE_REG
  1696  		p5.From.Reg = tmp
  1697  		p5.To.Type = obj.TYPE_MEM
  1698  		p5.To.Reg = addr
  1699  		ssagen.AddAux(&p5.To, v)
  1700  		p6 := s.Prog(x86.AJNE)
  1701  		p6.To.Type = obj.TYPE_BRANCH
  1702  		p6.To.SetTarget(p1)
  1703  	case ssa.OpAMD64PrefetchT0, ssa.OpAMD64PrefetchNTA:
  1704  		p := s.Prog(v.Op.Asm())
  1705  		p.From.Type = obj.TYPE_MEM
  1706  		p.From.Reg = v.Args[0].Reg()
  1707  	case ssa.OpClobber:
  1708  		p := s.Prog(x86.AMOVL)
  1709  		p.From.Type = obj.TYPE_CONST
  1710  		p.From.Offset = 0xdeaddead
  1711  		p.To.Type = obj.TYPE_MEM
  1712  		p.To.Reg = x86.REG_SP
  1713  		ssagen.AddAux(&p.To, v)
  1714  		p = s.Prog(x86.AMOVL)
  1715  		p.From.Type = obj.TYPE_CONST
  1716  		p.From.Offset = 0xdeaddead
  1717  		p.To.Type = obj.TYPE_MEM
  1718  		p.To.Reg = x86.REG_SP
  1719  		ssagen.AddAux(&p.To, v)
  1720  		p.To.Offset += 4
  1721  	case ssa.OpClobberReg:
  1722  		x := uint64(0xdeaddeaddeaddead)
  1723  		p := s.Prog(x86.AMOVQ)
  1724  		p.From.Type = obj.TYPE_CONST
  1725  		p.From.Offset = int64(x)
  1726  		p.To.Type = obj.TYPE_REG
  1727  		p.To.Reg = v.Reg()
  1728  
  1729  	// SIMD ops
  1730  	case ssa.OpAMD64VZEROUPPER, ssa.OpAMD64VZEROALL:
  1731  		s.Prog(v.Op.Asm())
  1732  
  1733  	case ssa.OpAMD64Zero128: // no code emitted
  1734  
  1735  	case ssa.OpAMD64Zero256, ssa.OpAMD64Zero512:
  1736  		p := s.Prog(v.Op.Asm())
  1737  		p.From.Type = obj.TYPE_REG
  1738  		p.From.Reg = simdReg(v)
  1739  		p.AddRestSourceReg(simdReg(v))
  1740  		p.To.Type = obj.TYPE_REG
  1741  		p.To.Reg = simdReg(v)
  1742  
  1743  	case ssa.OpAMD64VMOVSSf2v, ssa.OpAMD64VMOVSDf2v:
  1744  		// These are for initializing the least significant 32/64 bits of a SIMD register from a "float".
  1745  		p := s.Prog(v.Op.Asm())
  1746  		p.From.Type = obj.TYPE_REG
  1747  		p.From.Reg = v.Args[0].Reg()
  1748  		p.AddRestSourceReg(x86.REG_X15)
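        		// X15 is kept zero under the Go amd64 ABI (see zeroX15 and zero16 below),
        		// so the remaining bits of the destination come from a zero source.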
  1749  		p.To.Type = obj.TYPE_REG
  1750  		p.To.Reg = simdReg(v)
  1751  
  1752  	case ssa.OpAMD64VMOVQload, ssa.OpAMD64VMOVDload,
  1753  		ssa.OpAMD64VMOVSSload, ssa.OpAMD64VMOVSDload:
  1754  		p := s.Prog(v.Op.Asm())
  1755  		p.From.Type = obj.TYPE_MEM
  1756  		p.From.Reg = v.Args[0].Reg()
  1757  		ssagen.AddAux(&p.From, v)
  1758  		p.To.Type = obj.TYPE_REG
  1759  		p.To.Reg = simdReg(v)
  1760  
  1761  	case ssa.OpAMD64VMOVSSconst, ssa.OpAMD64VMOVSDconst:
  1762  		// for loading constants directly into SIMD registers
  1763  		x := simdReg(v)
  1764  		p := s.Prog(v.Op.Asm())
  1765  		p.From.Type = obj.TYPE_FCONST
  1766  		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
  1767  		p.To.Type = obj.TYPE_REG
  1768  		p.To.Reg = x
  1769  
  1770  	case ssa.OpAMD64VMOVD, ssa.OpAMD64VMOVQ:
  1771  		// These are for initializing the least significant 32/64 bits of a SIMD register from an "int".
  1772  		p := s.Prog(v.Op.Asm())
  1773  		p.From.Type = obj.TYPE_REG
  1774  		p.From.Reg = v.Args[0].Reg()
  1775  		p.To.Type = obj.TYPE_REG
  1776  		p.To.Reg = simdReg(v)
  1777  
  1778  	case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512,
  1779  		ssa.OpAMD64KMOVBload, ssa.OpAMD64KMOVWload, ssa.OpAMD64KMOVDload, ssa.OpAMD64KMOVQload:
  1780  		p := s.Prog(v.Op.Asm())
  1781  		p.From.Type = obj.TYPE_MEM
  1782  		p.From.Reg = v.Args[0].Reg()
  1783  		ssagen.AddAux(&p.From, v)
  1784  		p.To.Type = obj.TYPE_REG
  1785  		p.To.Reg = simdOrMaskReg(v)
  1786  	case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512,
  1787  		ssa.OpAMD64KMOVBstore, ssa.OpAMD64KMOVWstore, ssa.OpAMD64KMOVDstore, ssa.OpAMD64KMOVQstore:
  1788  		p := s.Prog(v.Op.Asm())
  1789  		p.From.Type = obj.TYPE_REG
  1790  		p.From.Reg = simdOrMaskReg(v.Args[1])
  1791  		p.To.Type = obj.TYPE_MEM
  1792  		p.To.Reg = v.Args[0].Reg()
  1793  		ssagen.AddAux(&p.To, v)
  1794  
  1795  	case ssa.OpAMD64VPMASK32load128, ssa.OpAMD64VPMASK64load128, ssa.OpAMD64VPMASK32load256, ssa.OpAMD64VPMASK64load256:
  1796  		p := s.Prog(v.Op.Asm())
  1797  		p.From.Type = obj.TYPE_MEM
  1798  		p.From.Reg = v.Args[0].Reg()
  1799  		ssagen.AddAux(&p.From, v)
  1800  		p.To.Type = obj.TYPE_REG
  1801  		p.To.Reg = simdReg(v)
  1802  		p.AddRestSourceReg(simdReg(v.Args[1])) // masking simd reg
  1803  
  1804  	case ssa.OpAMD64VPMASK32store128, ssa.OpAMD64VPMASK64store128, ssa.OpAMD64VPMASK32store256, ssa.OpAMD64VPMASK64store256:
  1805  		p := s.Prog(v.Op.Asm())
  1806  		p.From.Type = obj.TYPE_REG
  1807  		p.From.Reg = simdReg(v.Args[2])
  1808  		p.To.Type = obj.TYPE_MEM
  1809  		p.To.Reg = v.Args[0].Reg()
  1810  		ssagen.AddAux(&p.To, v)
  1811  		p.AddRestSourceReg(simdReg(v.Args[1])) // masking simd reg
  1812  
  1813  	case ssa.OpAMD64VPMASK64load512, ssa.OpAMD64VPMASK32load512, ssa.OpAMD64VPMASK16load512, ssa.OpAMD64VPMASK8load512:
  1814  		p := s.Prog(v.Op.Asm())
  1815  		p.From.Type = obj.TYPE_MEM
  1816  		p.From.Reg = v.Args[0].Reg()
  1817  		ssagen.AddAux(&p.From, v)
  1818  		p.To.Type = obj.TYPE_REG
  1819  		p.To.Reg = simdReg(v)
  1820  		p.AddRestSourceReg(v.Args[1].Reg()) // simd mask reg
  1821  		x86.ParseSuffix(p, "Z")             // must be zero if not in mask
  1822  
  1823  	case ssa.OpAMD64VPMASK64store512, ssa.OpAMD64VPMASK32store512, ssa.OpAMD64VPMASK16store512, ssa.OpAMD64VPMASK8store512:
  1824  		p := s.Prog(v.Op.Asm())
  1825  		p.From.Type = obj.TYPE_REG
  1826  		p.From.Reg = simdReg(v.Args[2])
  1827  		p.To.Type = obj.TYPE_MEM
  1828  		p.To.Reg = v.Args[0].Reg()
  1829  		ssagen.AddAux(&p.To, v)
  1830  		p.AddRestSourceReg(v.Args[1].Reg()) // simd mask reg
  1831  
  1832  	case ssa.OpAMD64VPMOVMToVec8x16,
  1833  		ssa.OpAMD64VPMOVMToVec8x32,
  1834  		ssa.OpAMD64VPMOVMToVec8x64,
  1835  		ssa.OpAMD64VPMOVMToVec16x8,
  1836  		ssa.OpAMD64VPMOVMToVec16x16,
  1837  		ssa.OpAMD64VPMOVMToVec16x32,
  1838  		ssa.OpAMD64VPMOVMToVec32x4,
  1839  		ssa.OpAMD64VPMOVMToVec32x8,
  1840  		ssa.OpAMD64VPMOVMToVec32x16,
  1841  		ssa.OpAMD64VPMOVMToVec64x2,
  1842  		ssa.OpAMD64VPMOVMToVec64x4,
  1843  		ssa.OpAMD64VPMOVMToVec64x8:
  1844  		p := s.Prog(v.Op.Asm())
  1845  		p.From.Type = obj.TYPE_REG
  1846  		p.From.Reg = v.Args[0].Reg()
  1847  		p.To.Type = obj.TYPE_REG
  1848  		p.To.Reg = simdReg(v)
  1849  
  1850  	case ssa.OpAMD64VPMOVVec8x16ToM,
  1851  		ssa.OpAMD64VPMOVVec8x32ToM,
  1852  		ssa.OpAMD64VPMOVVec8x64ToM,
  1853  		ssa.OpAMD64VPMOVVec16x8ToM,
  1854  		ssa.OpAMD64VPMOVVec16x16ToM,
  1855  		ssa.OpAMD64VPMOVVec16x32ToM,
  1856  		ssa.OpAMD64VPMOVVec32x4ToM,
  1857  		ssa.OpAMD64VPMOVVec32x8ToM,
  1858  		ssa.OpAMD64VPMOVVec32x16ToM,
  1859  		ssa.OpAMD64VPMOVVec64x2ToM,
  1860  		ssa.OpAMD64VPMOVVec64x4ToM,
  1861  		ssa.OpAMD64VPMOVVec64x8ToM,
  1862  		ssa.OpAMD64VPMOVMSKB128,
  1863  		ssa.OpAMD64VPMOVMSKB256,
  1864  		ssa.OpAMD64VMOVMSKPS128,
  1865  		ssa.OpAMD64VMOVMSKPS256,
  1866  		ssa.OpAMD64VMOVMSKPD128,
  1867  		ssa.OpAMD64VMOVMSKPD256:
  1868  		p := s.Prog(v.Op.Asm())
  1869  		p.From.Type = obj.TYPE_REG
  1870  		p.From.Reg = simdReg(v.Args[0])
  1871  		p.To.Type = obj.TYPE_REG
  1872  		p.To.Reg = v.Reg()
  1873  
  1874  	case ssa.OpAMD64KMOVQk, ssa.OpAMD64KMOVDk, ssa.OpAMD64KMOVWk, ssa.OpAMD64KMOVBk,
  1875  		ssa.OpAMD64KMOVQi, ssa.OpAMD64KMOVDi, ssa.OpAMD64KMOVWi, ssa.OpAMD64KMOVBi:
  1876  		// See also ssa.OpAMD64KMOVQload
  1877  		p := s.Prog(v.Op.Asm())
  1878  		p.From.Type = obj.TYPE_REG
  1879  		p.From.Reg = v.Args[0].Reg()
  1880  		p.To.Type = obj.TYPE_REG
  1881  		p.To.Reg = v.Reg()
  1882  	case ssa.OpAMD64VPTEST:
  1883  		// Some flag-setting instructions carry their second operand in the Prog's destination slot.
  1884  		// See also CMP[BWDQ].
  1885  		p := s.Prog(v.Op.Asm())
  1886  		p.From.Type = obj.TYPE_REG
  1887  		p.From.Reg = simdReg(v.Args[0])
  1888  		p.To.Type = obj.TYPE_REG
  1889  		p.To.Reg = simdReg(v.Args[1])
  1890  
  1891  	default:
  1892  		if !ssaGenSIMDValue(s, v) {
  1893  			v.Fatalf("genValue not implemented: %s", v.LongString())
  1894  		}
  1895  	}
  1896  }
  1897  
  1898  // zeroX15 zeroes the X15 register.
  1899  func zeroX15(s *ssagen.State) {
  1900  	opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
  1901  }
  1902  
  1903  // Example instruction: VRSQRTPS X1, X1
  1904  func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1905  	p := s.Prog(v.Op.Asm())
  1906  	p.From.Type = obj.TYPE_REG
  1907  	p.From.Reg = simdReg(v.Args[0])
  1908  	p.To.Type = obj.TYPE_REG
  1909  	p.To.Reg = simdReg(v)
  1910  	return p
  1911  }
  1912  
  1913  // Example instruction: VPSUBD X1, X2, X3
  1914  func simdV21(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1915  	p := s.Prog(v.Op.Asm())
  1916  	p.From.Type = obj.TYPE_REG
  1917  	// Vector register operands follow right-to-left order.
  1918  	// e.g. VPSUBD X1, X2, X3 means X3 = X2 - X1.
  1919  	p.From.Reg = simdReg(v.Args[1])
  1920  	p.AddRestSourceReg(simdReg(v.Args[0]))
  1921  	p.To.Type = obj.TYPE_REG
  1922  	p.To.Reg = simdReg(v)
  1923  	return p
  1924  }
  1925  
  1926  // simdVfpv handles shift operations.
  1927  // The 2nd arg (the shift count) is expected to be an XMM register and is used as-is.
  1928  // Example instruction: VPSLLQ Z1, X1, Z2
  1929  func simdVfpv(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1930  	p := s.Prog(v.Op.Asm())
  1931  	p.From.Type = obj.TYPE_REG
  1932  	// Vector register operands follow right-to-left order.
  1933  	// e.g. VPSUBD X1, X2, X3 means X3 = X2 - X1.
  1934  	p.From.Reg = v.Args[1].Reg()
  1935  	p.AddRestSourceReg(simdReg(v.Args[0]))
  1936  	p.To.Type = obj.TYPE_REG
  1937  	p.To.Reg = simdReg(v)
  1938  	return p
  1939  }
  1940  
  1941  // Example instruction: VPCMPEQW Z26, Z30, K4
  1942  func simdV2k(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1943  	p := s.Prog(v.Op.Asm())
  1944  	p.From.Type = obj.TYPE_REG
  1945  	p.From.Reg = simdReg(v.Args[1])
  1946  	p.AddRestSourceReg(simdReg(v.Args[0]))
  1947  	p.To.Type = obj.TYPE_REG
  1948  	p.To.Reg = maskReg(v)
  1949  	return p
  1950  }
  1951  
  1952  // Example instruction: VPMINUQ X21, X3, K3, X31
  1953  func simdV2kv(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1954  	p := s.Prog(v.Op.Asm())
  1955  	p.From.Type = obj.TYPE_REG
  1956  	p.From.Reg = simdReg(v.Args[1])
  1957  	p.AddRestSourceReg(simdReg(v.Args[0]))
  1958  	// This "simd*" series of functions assumes that
  1959  	// any "K" register serving as the write-mask
  1960  	// or "predicate" for predicated AVX512 instructions
  1961  	// sits at the end of the operand list.
  1962  	// TODO: verify this assumption.
  1963  	p.AddRestSourceReg(maskReg(v.Args[2]))
  1964  	p.To.Type = obj.TYPE_REG
  1965  	p.To.Reg = simdReg(v)
  1966  	return p
  1967  }
  1968  
  1969  // Example instruction: VPABSB X1, X2, K3 (masking merging)
  1970  func simdV2kvResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1971  	p := s.Prog(v.Op.Asm())
  1972  	p.From.Type = obj.TYPE_REG
  1973  	p.From.Reg = simdReg(v.Args[1])
  1974  	// This "simd*" series of functions assumes that
  1975  	// any "K" register serving as the write-mask
  1976  	// or "predicate" for predicated AVX512 instructions
  1977  	// sits at the end of the operand list.
  1978  	// TODO: verify this assumption.
  1979  	p.AddRestSourceReg(maskReg(v.Args[2]))
  1980  	p.To.Type = obj.TYPE_REG
  1981  	p.To.Reg = simdReg(v)
  1982  	return p
  1983  }
  1984  
  1985  // simdVfpkv handles masked shift operations.
  1986  // The 2nd arg (the shift count) is expected to be an XMM register and is used as-is.
  1987  // Example instruction: VPSLLQ Z1, X1, K1, Z2
  1988  func simdVfpkv(s *ssagen.State, v *ssa.Value) *obj.Prog {
  1989  	p := s.Prog(v.Op.Asm())
  1990  	p.From.Type = obj.TYPE_REG
  1991  	p.From.Reg = v.Args[1].Reg()
  1992  	p.AddRestSourceReg(simdReg(v.Args[0]))
  1993  	p.AddRestSourceReg(maskReg(v.Args[2]))
  1994  	p.To.Type = obj.TYPE_REG
  1995  	p.To.Reg = simdReg(v)
  1996  	return p
  1997  }
  1998  
  1999  // Example instruction: VPCMPEQW Z26, Z30, K1, K4
  2000  func simdV2kk(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2001  	p := s.Prog(v.Op.Asm())
  2002  	p.From.Type = obj.TYPE_REG
  2003  	p.From.Reg = simdReg(v.Args[1])
  2004  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2005  	p.AddRestSourceReg(maskReg(v.Args[2]))
  2006  	p.To.Type = obj.TYPE_REG
  2007  	p.To.Reg = maskReg(v)
  2008  	return p
  2009  }
  2010  
  2011  // Example instruction: VPOPCNTB X14, K4, X16
  2012  func simdVkv(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2013  	p := s.Prog(v.Op.Asm())
  2014  	p.From.Type = obj.TYPE_REG
  2015  	p.From.Reg = simdReg(v.Args[0])
  2016  	p.AddRestSourceReg(maskReg(v.Args[1]))
  2017  	p.To.Type = obj.TYPE_REG
  2018  	p.To.Reg = simdReg(v)
  2019  	return p
  2020  }
  2021  
  2022  // Example instruction: VROUNDPD $7, X2, X2
  2023  func simdV11Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2024  	p := s.Prog(v.Op.Asm())
  2025  	p.From.Offset = int64(v.AuxUInt8())
  2026  	p.From.Type = obj.TYPE_CONST
  2027  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2028  	p.To.Type = obj.TYPE_REG
  2029  	p.To.Reg = simdReg(v)
  2030  	return p
  2031  }
  2032  
  2033  // Example instruction: VREDUCEPD $126, X1, K3, X31
  2034  func simdVkvImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2035  	p := s.Prog(v.Op.Asm())
  2036  	p.From.Offset = int64(v.AuxUInt8())
  2037  	p.From.Type = obj.TYPE_CONST
  2038  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2039  	p.AddRestSourceReg(maskReg(v.Args[1]))
  2040  	p.To.Type = obj.TYPE_REG
  2041  	p.To.Reg = simdReg(v)
  2042  	return p
  2043  }
  2044  
  2045  // Example instruction: VCMPPS $7, X2, X9, X2
  2046  func simdV21Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2047  	p := s.Prog(v.Op.Asm())
  2048  	p.From.Offset = int64(v.AuxUInt8())
  2049  	p.From.Type = obj.TYPE_CONST
  2050  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2051  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2052  	p.To.Type = obj.TYPE_REG
  2053  	p.To.Reg = simdReg(v)
  2054  	return p
  2055  }
  2056  
  2057  // Example instruction: VPINSRB $3, DX, X0, X0
  2058  func simdVgpvImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2059  	p := s.Prog(v.Op.Asm())
  2060  	p.From.Offset = int64(v.AuxUInt8())
  2061  	p.From.Type = obj.TYPE_CONST
  2062  	p.AddRestSourceReg(v.Args[1].Reg())
  2063  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2064  	p.To.Type = obj.TYPE_REG
  2065  	p.To.Reg = simdReg(v)
  2066  	return p
  2067  }
  2068  
  2069  // Example instruction: VPCMPD $1, Z1, Z2, K1
  2070  func simdV2kImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2071  	p := s.Prog(v.Op.Asm())
  2072  	p.From.Offset = int64(v.AuxUInt8())
  2073  	p.From.Type = obj.TYPE_CONST
  2074  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2075  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2076  	p.To.Type = obj.TYPE_REG
  2077  	p.To.Reg = maskReg(v)
  2078  	return p
  2079  }
  2080  
  2081  // Example instruction: VPCMPD $1, Z1, Z2, K2, K1
  2082  func simdV2kkImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2083  	p := s.Prog(v.Op.Asm())
  2084  	p.From.Offset = int64(v.AuxUInt8())
  2085  	p.From.Type = obj.TYPE_CONST
  2086  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2087  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2088  	p.AddRestSourceReg(maskReg(v.Args[2]))
  2089  	p.To.Type = obj.TYPE_REG
  2090  	p.To.Reg = maskReg(v)
  2091  	return p
  2092  }
  2093  
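// simdV2kvImm8 handles ops taking an imm8, two vector sources, and a K write-mask,
// producing a vector result; compare simdV2kkImm8 above, which produces a mask result.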
  2094  func simdV2kvImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2095  	p := s.Prog(v.Op.Asm())
  2096  	p.From.Offset = int64(v.AuxUInt8())
  2097  	p.From.Type = obj.TYPE_CONST
  2098  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2099  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2100  	p.AddRestSourceReg(maskReg(v.Args[2]))
  2101  	p.To.Type = obj.TYPE_REG
  2102  	p.To.Reg = simdReg(v)
  2103  	return p
  2104  }
  2105  
  2106  // Example instruction: VFMADD213PD Z2, Z1, Z0
  2107  func simdV31ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2108  	p := s.Prog(v.Op.Asm())
  2109  	p.From.Type = obj.TYPE_REG
  2110  	p.From.Reg = simdReg(v.Args[2])
  2111  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2112  	p.To.Type = obj.TYPE_REG
  2113  	p.To.Reg = simdReg(v)
  2114  	return p
  2115  }
  2116  
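// simdV31ResultInArg0Imm8 is the imm8 form of simdV31ResultInArg0 above.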
  2117  func simdV31ResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2118  	p := s.Prog(v.Op.Asm())
  2119  	p.From.Offset = int64(v.AuxUInt8())
  2120  	p.From.Type = obj.TYPE_CONST
  2121  
  2122  	p.AddRestSourceReg(simdReg(v.Args[2]))
  2123  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2124  	// p.AddRestSourceReg(x86.REG_K0)
  2125  	p.To.Type = obj.TYPE_REG
  2126  	p.To.Reg = simdReg(v)
  2127  	return p
  2128  }
  2129  
  2130  // simdV31loadResultInArg0Imm8
  2131  // Example instruction:
  2132  // (VPTERNLOGD128load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem)
  2133  func simdV31loadResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2134  	sc := v.AuxValAndOff()
  2135  	p := s.Prog(v.Op.Asm())
  2136  
  2137  	p.From.Type = obj.TYPE_CONST
  2138  	p.From.Offset = sc.Val64()
  2139  
  2140  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[2].Reg()}
  2141  	ssagen.AddAux2(&m, v, sc.Off64())
  2142  	p.AddRestSource(m)
  2143  
  2144  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2145  	return p
  2146  }
  2147  
  2148  // Example instruction: VFMADD213PD Z2, Z1, K1, Z0
  2149  func simdV3kvResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2150  	p := s.Prog(v.Op.Asm())
  2151  	p.From.Type = obj.TYPE_REG
  2152  	p.From.Reg = simdReg(v.Args[2])
  2153  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2154  	p.AddRestSourceReg(maskReg(v.Args[3]))
  2155  	p.To.Type = obj.TYPE_REG
  2156  	p.To.Reg = simdReg(v)
  2157  	return p
  2158  }
  2159  
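// simdVgpImm8 handles ops taking an imm8 and a vector source and producing a
// general-purpose register result.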
  2160  func simdVgpImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2161  	p := s.Prog(v.Op.Asm())
  2162  	p.From.Offset = int64(v.AuxUInt8())
  2163  	p.From.Type = obj.TYPE_CONST
  2164  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2165  	p.To.Type = obj.TYPE_REG
  2166  	p.To.Reg = v.Reg()
  2167  	return p
  2168  }
  2169  
  2170  // Currently unused
  2171  func simdV31(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2172  	p := s.Prog(v.Op.Asm())
  2173  	p.From.Type = obj.TYPE_REG
  2174  	p.From.Reg = simdReg(v.Args[2])
  2175  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2176  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2177  	p.To.Type = obj.TYPE_REG
  2178  	p.To.Reg = simdReg(v)
  2179  	return p
  2180  }
  2181  
  2182  // Currently unused
  2183  func simdV3kv(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2184  	p := s.Prog(v.Op.Asm())
  2185  	p.From.Type = obj.TYPE_REG
  2186  	p.From.Reg = simdReg(v.Args[2])
  2187  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2188  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2189  	p.AddRestSourceReg(maskReg(v.Args[3]))
  2190  	p.To.Type = obj.TYPE_REG
  2191  	p.To.Reg = simdReg(v)
  2192  	return p
  2193  }
  2194  
  2195  // Example instruction: VRCP14PS (DI), K6, X22
  2196  func simdVkvload(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2197  	p := s.Prog(v.Op.Asm())
  2198  	p.From.Type = obj.TYPE_MEM
  2199  	p.From.Reg = v.Args[0].Reg()
  2200  	ssagen.AddAux(&p.From, v)
  2201  	p.AddRestSourceReg(maskReg(v.Args[1]))
  2202  	p.To.Type = obj.TYPE_REG
  2203  	p.To.Reg = simdReg(v)
  2204  	return p
  2205  }
  2206  
  2207  // Example instruction: VPSLLVD (DX), X7, X18
  2208  func simdV21load(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2209  	p := s.Prog(v.Op.Asm())
  2210  	p.From.Type = obj.TYPE_MEM
  2211  	p.From.Reg = v.Args[1].Reg()
  2212  	ssagen.AddAux(&p.From, v)
  2213  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2214  	p.To.Type = obj.TYPE_REG
  2215  	p.To.Reg = simdReg(v)
  2216  	return p
  2217  }
  2218  
  2219  // Example instruction: VPDPWSSD (SI), X24, X18
  2220  func simdV31loadResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2221  	p := s.Prog(v.Op.Asm())
  2222  	p.From.Type = obj.TYPE_MEM
  2223  	p.From.Reg = v.Args[2].Reg()
  2224  	ssagen.AddAux(&p.From, v)
  2225  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2226  	p.To.Type = obj.TYPE_REG
  2227  	p.To.Reg = simdReg(v)
  2228  	return p
  2229  }
  2230  
  2231  // Example instruction: VPDPWSSD (SI), X24, K1, X18
  2232  func simdV3kvloadResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2233  	p := s.Prog(v.Op.Asm())
  2234  	p.From.Type = obj.TYPE_MEM
  2235  	p.From.Reg = v.Args[2].Reg()
  2236  	ssagen.AddAux(&p.From, v)
  2237  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2238  	p.AddRestSourceReg(maskReg(v.Args[3]))
  2239  	p.To.Type = obj.TYPE_REG
  2240  	p.To.Reg = simdReg(v)
  2241  	return p
  2242  }
  2243  
  2244  // Example instruction: VPSLLVD (SI), X1, K1, X2
  2245  func simdV2kvload(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2246  	p := s.Prog(v.Op.Asm())
  2247  	p.From.Type = obj.TYPE_MEM
  2248  	p.From.Reg = v.Args[1].Reg()
  2249  	ssagen.AddAux(&p.From, v)
  2250  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2251  	p.AddRestSourceReg(maskReg(v.Args[2]))
  2252  	p.To.Type = obj.TYPE_REG
  2253  	p.To.Reg = simdReg(v)
  2254  	return p
  2255  }
  2256  
  2257  // Example instruction: VPCMPEQD (SI), X1, K1
  2258  func simdV2kload(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2259  	p := s.Prog(v.Op.Asm())
  2260  	p.From.Type = obj.TYPE_MEM
  2261  	p.From.Reg = v.Args[1].Reg()
  2262  	ssagen.AddAux(&p.From, v)
  2263  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2264  	p.To.Type = obj.TYPE_REG
  2265  	p.To.Reg = maskReg(v)
  2266  	return p
  2267  }
  2268  
  2269  // Example instruction: VCVTTPS2DQ (BX), X2
  2270  func simdV11load(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2271  	p := s.Prog(v.Op.Asm())
  2272  	p.From.Type = obj.TYPE_MEM
  2273  	p.From.Reg = v.Args[0].Reg()
  2274  	ssagen.AddAux(&p.From, v)
  2275  	p.To.Type = obj.TYPE_REG
  2276  	p.To.Reg = simdReg(v)
  2277  	return p
  2278  }
  2279  
  2280  // Example instruction: VPSHUFD $7, (BX), X11
  2281  func simdV11loadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2282  	sc := v.AuxValAndOff()
  2283  	p := s.Prog(v.Op.Asm())
  2284  	p.From.Type = obj.TYPE_CONST
  2285  	p.From.Offset = sc.Val64()
  2286  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
  2287  	ssagen.AddAux2(&m, v, sc.Off64())
  2288  	p.AddRestSource(m)
  2289  	p.To.Type = obj.TYPE_REG
  2290  	p.To.Reg = simdReg(v)
  2291  	return p
  2292  }
  2293  
  2294  // Example instruction: VPRORD $81, -15(R14), K7, Y1
  2295  func simdVkvloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2296  	sc := v.AuxValAndOff()
  2297  	p := s.Prog(v.Op.Asm())
  2298  	p.From.Type = obj.TYPE_CONST
  2299  	p.From.Offset = sc.Val64()
  2300  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
  2301  	ssagen.AddAux2(&m, v, sc.Off64())
  2302  	p.AddRestSource(m)
  2303  	p.AddRestSourceReg(maskReg(v.Args[1]))
  2304  	p.To.Type = obj.TYPE_REG
  2305  	p.To.Reg = simdReg(v)
  2306  	return p
  2307  }
  2308  
  2309  // Example instruction: VPSHLDD $82, 7(SI), Y21, Y3
  2310  func simdV21loadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2311  	sc := v.AuxValAndOff()
  2312  	p := s.Prog(v.Op.Asm())
  2313  	p.From.Type = obj.TYPE_CONST
  2314  	p.From.Offset = sc.Val64()
  2315  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
  2316  	ssagen.AddAux2(&m, v, sc.Off64())
  2317  	p.AddRestSource(m)
  2318  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2319  	p.To.Type = obj.TYPE_REG
  2320  	p.To.Reg = simdReg(v)
  2321  	return p
  2322  }
  2323  
  2324  // Example instruction: VCMPPS $81, -7(DI), Y16, K3
  2325  func simdV2kloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2326  	sc := v.AuxValAndOff()
  2327  	p := s.Prog(v.Op.Asm())
  2328  	p.From.Type = obj.TYPE_CONST
  2329  	p.From.Offset = sc.Val64()
  2330  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
  2331  	ssagen.AddAux2(&m, v, sc.Off64())
  2332  	p.AddRestSource(m)
  2333  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2334  	p.To.Type = obj.TYPE_REG
  2335  	p.To.Reg = maskReg(v)
  2336  	return p
  2337  }
  2338  
  2339  // Example instruction: VCMPPS $81, -7(DI), Y16, K1, K3
  2340  func simdV2kkloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2341  	sc := v.AuxValAndOff()
  2342  	p := s.Prog(v.Op.Asm())
  2343  	p.From.Type = obj.TYPE_CONST
  2344  	p.From.Offset = sc.Val64()
  2345  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
  2346  	ssagen.AddAux2(&m, v, sc.Off64())
  2347  	p.AddRestSource(m)
  2348  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2349  	p.AddRestSourceReg(maskReg(v.Args[2]))
  2350  	p.To.Type = obj.TYPE_REG
  2351  	p.To.Reg = maskReg(v)
  2352  	return p
  2353  }
  2354  
  2355  // Example instruction: VGF2P8AFFINEINVQB $64, -17(BP), X31, K3, X26
  2356  func simdV2kvloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2357  	sc := v.AuxValAndOff()
  2358  	p := s.Prog(v.Op.Asm())
  2359  	p.From.Type = obj.TYPE_CONST
  2360  	p.From.Offset = sc.Val64()
  2361  	m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
  2362  	ssagen.AddAux2(&m, v, sc.Off64())
  2363  	p.AddRestSource(m)
  2364  	p.AddRestSourceReg(simdReg(v.Args[0]))
  2365  	p.AddRestSourceReg(maskReg(v.Args[2]))
  2366  	p.To.Type = obj.TYPE_REG
  2367  	p.To.Reg = simdReg(v)
  2368  	return p
  2369  }
  2370  
  2371  // Example instruction: SHA1NEXTE X2, X2
  2372  func simdV21ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2373  	p := s.Prog(v.Op.Asm())
  2374  	p.From.Type = obj.TYPE_REG
  2375  	p.From.Reg = simdReg(v.Args[1])
  2376  	p.To.Type = obj.TYPE_REG
  2377  	p.To.Reg = simdReg(v)
  2378  	return p
  2379  }
  2380  
  2381  // Example instruction: SHA1RNDS4 $1, X2, X2
  2382  func simdV21ResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2383  	p := s.Prog(v.Op.Asm())
  2384  	p.From.Offset = int64(v.AuxUInt8())
  2385  	p.From.Type = obj.TYPE_CONST
  2386  	p.AddRestSourceReg(simdReg(v.Args[1]))
  2387  	p.To.Type = obj.TYPE_REG
  2388  	p.To.Reg = simdReg(v)
  2389  	return p
  2390  }
  2391  
  2392  // Example instruction: SHA256RNDS2 X0, X11, X2
  2393  func simdV31x0AtIn2ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
  2394  	return simdV31ResultInArg0(s, v)
  2395  }
  2396  
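// blockJump maps a conditional block kind to its branch instruction (asm),
// which targets Succs[0] when the condition holds, and the inverted branch
// (invasm), used when Succs[0] is the fallthrough successor.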
  2397  var blockJump = [...]struct {
  2398  	asm, invasm obj.As
  2399  }{
  2400  	ssa.BlockAMD64EQ:  {x86.AJEQ, x86.AJNE},
  2401  	ssa.BlockAMD64NE:  {x86.AJNE, x86.AJEQ},
  2402  	ssa.BlockAMD64LT:  {x86.AJLT, x86.AJGE},
  2403  	ssa.BlockAMD64GE:  {x86.AJGE, x86.AJLT},
  2404  	ssa.BlockAMD64LE:  {x86.AJLE, x86.AJGT},
  2405  	ssa.BlockAMD64GT:  {x86.AJGT, x86.AJLE},
  2406  	ssa.BlockAMD64OS:  {x86.AJOS, x86.AJOC},
  2407  	ssa.BlockAMD64OC:  {x86.AJOC, x86.AJOS},
  2408  	ssa.BlockAMD64ULT: {x86.AJCS, x86.AJCC},
  2409  	ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS},
  2410  	ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS},
  2411  	ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI},
  2412  	ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS},
  2413  	ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC},
  2414  }
  2415  
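// eqfJumps and nefJumps describe the two-jump sequences used for the floating-point
// equality blocks (EQF/NEF), which must also consult the parity flag so that
// unordered (NaN) comparisons branch to the correct successor.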
  2416  var eqfJumps = [2][2]ssagen.IndexJump{
  2417  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0]
  2418  	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1]
  2419  }
  2420  var nefJumps = [2][2]ssagen.IndexJump{
  2421  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0]
  2422  	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1]
  2423  }
  2424  
  2425  func ssaGenBlock(s *ssagen.State, b, next *ssa.Block) {
  2426  	switch b.Kind {
  2427  	case ssa.BlockPlain, ssa.BlockDefer:
  2428  		if b.Succs[0].Block() != next {
  2429  			p := s.Prog(obj.AJMP)
  2430  			p.To.Type = obj.TYPE_BRANCH
  2431  			s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()})
  2432  		}
  2433  	case ssa.BlockExit, ssa.BlockRetJmp:
  2434  	case ssa.BlockRet:
  2435  		s.Prog(obj.ARET)
  2436  
  2437  	case ssa.BlockAMD64EQF:
  2438  		s.CombJump(b, next, &eqfJumps)
  2439  
  2440  	case ssa.BlockAMD64NEF:
  2441  		s.CombJump(b, next, &nefJumps)
  2442  
  2443  	case ssa.BlockAMD64EQ, ssa.BlockAMD64NE,
  2444  		ssa.BlockAMD64LT, ssa.BlockAMD64GE,
  2445  		ssa.BlockAMD64LE, ssa.BlockAMD64GT,
  2446  		ssa.BlockAMD64OS, ssa.BlockAMD64OC,
  2447  		ssa.BlockAMD64ULT, ssa.BlockAMD64UGT,
  2448  		ssa.BlockAMD64ULE, ssa.BlockAMD64UGE:
  2449  		jmp := blockJump[b.Kind]
  2450  		switch next {
  2451  		case b.Succs[0].Block():
  2452  			s.Br(jmp.invasm, b.Succs[1].Block())
  2453  		case b.Succs[1].Block():
  2454  			s.Br(jmp.asm, b.Succs[0].Block())
  2455  		default:
  2456  			if b.Likely != ssa.BranchUnlikely {
  2457  				s.Br(jmp.asm, b.Succs[0].Block())
  2458  				s.Br(obj.AJMP, b.Succs[1].Block())
  2459  			} else {
  2460  				s.Br(jmp.invasm, b.Succs[1].Block())
  2461  				s.Br(obj.AJMP, b.Succs[0].Block())
  2462  			}
  2463  		}
  2464  
  2465  	case ssa.BlockAMD64JUMPTABLE:
  2466  		// JMP      *(TABLE)(INDEX*8)
  2467  		p := s.Prog(obj.AJMP)
  2468  		p.To.Type = obj.TYPE_MEM
  2469  		p.To.Reg = b.Controls[1].Reg()
  2470  		p.To.Index = b.Controls[0].Reg()
  2471  		p.To.Scale = 8
  2472  		// Save jump tables for later resolution of the target blocks.
  2473  		s.JumpTables = append(s.JumpTables, b)
  2474  
  2475  	default:
  2476  		b.Fatalf("branch not implemented: %s", b.LongString())
  2477  	}
  2478  }
  2479  
  2480  func loadRegResult(s *ssagen.State, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
  2481  	p := s.Prog(loadByRegWidth(reg, t.Size()))
  2482  	p.From.Type = obj.TYPE_MEM
  2483  	p.From.Name = obj.NAME_AUTO
  2484  	p.From.Sym = n.Linksym()
  2485  	p.From.Offset = n.FrameOffset() + off
  2486  	p.To.Type = obj.TYPE_REG
  2487  	p.To.Reg = reg
  2488  	return p
  2489  }
  2490  
  2491  func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
  2492  	p = pp.Append(p, storeByRegWidth(reg, t.Size()), obj.TYPE_REG, reg, 0, obj.TYPE_MEM, 0, n.FrameOffset()+off)
  2493  	p.To.Name = obj.NAME_PARAM
  2494  	p.To.Sym = n.Linksym()
  2495  	p.Pos = p.Pos.WithNotStmt()
  2496  	return p
  2497  }
  2498  
  2499  // zero16 zeroes 16 bytes at reg+off.
  2500  func zero16(s *ssagen.State, reg int16, off int64) {
  2501  	//   MOVUPS  X15, off(ptrReg)
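        	// X15 is the ABI-designated zero register (see zeroX15), so this stores 16 zero bytes.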
  2502  	p := s.Prog(x86.AMOVUPS)
  2503  	p.From.Type = obj.TYPE_REG
  2504  	p.From.Reg = x86.REG_X15
  2505  	p.To.Type = obj.TYPE_MEM
  2506  	p.To.Reg = reg
  2507  	p.To.Offset = off
  2508  }
  2509  
  2510  // move16 moves 16 bytes from src+off to dst+off using the temporary register tmp.
  2511  func move16(s *ssagen.State, src, dst, tmp int16, off int64) {
  2512  	//   MOVUPS  off(srcReg), tmpReg
  2513  	//   MOVUPS  tmpReg, off(dstReg)
  2514  	p := s.Prog(x86.AMOVUPS)
  2515  	p.From.Type = obj.TYPE_MEM
  2516  	p.From.Reg = src
  2517  	p.From.Offset = off
  2518  	p.To.Type = obj.TYPE_REG
  2519  	p.To.Reg = tmp
  2520  	p = s.Prog(x86.AMOVUPS)
  2521  	p.From.Type = obj.TYPE_REG
  2522  	p.From.Reg = tmp
  2523  	p.To.Type = obj.TYPE_MEM
  2524  	p.To.Reg = dst
  2525  	p.To.Offset = off
  2526  }
  2527  
  2528  // simdReg returns the vector register assigned to v, resized to v's width.
  2529  // XXX maybe make this part of v.Reg? On the other hand, it is architecture-specific.
  2530  func simdReg(v *ssa.Value) int16 {
  2531  	t := v.Type
  2532  	if !t.IsSIMD() {
  2533  		base.Fatalf("simdReg: not a simd type; v=%s, b=b%d, f=%s", v.LongString(), v.Block.ID, v.Block.Func.Name)
  2534  	}
  2535  	return simdRegBySize(v.Reg(), t.Size())
  2536  }
  2537  
  2538  func simdRegBySize(reg int16, size int64) int16 {
  2539  	switch size {
  2540  	case 16:
  2541  		return reg
  2542  	case 32:
  2543  		return reg + (x86.REG_Y0 - x86.REG_X0)
  2544  	case 64:
  2545  		return reg + (x86.REG_Z0 - x86.REG_X0)
  2546  	}
  2547  	panic("simdRegBySize: bad size")
  2548  }
  2549  
  2550  // maskReg returns the K (mask) register assigned to v.
  2551  func maskReg(v *ssa.Value) int16 {
  2552  	t := v.Type
  2553  	if !t.IsSIMD() {
  2554  		base.Fatalf("maskReg: not a simd type; v=%s, b=b%d, f=%s", v.LongString(), v.Block.ID, v.Block.Func.Name)
  2555  	}
  2556  	switch t.Size() {
  2557  	case 8:
  2558  		return v.Reg()
  2559  	}
  2560  	panic("unreachable")
  2561  }
  2562  
  2563  // simdOrMaskReg returns the K (mask) or vector register assigned to v, depending on its size.
  2564  func simdOrMaskReg(v *ssa.Value) int16 {
  2565  	t := v.Type
  2566  	if t.Size() <= 8 {
  2567  		return maskReg(v)
  2568  	}
  2569  	return simdReg(v)
  2570  }
  2571  
  2572  // simdCheckRegOnly is used for shift operations only.
  2573  // regalloc will issue OpCopy with an incorrect type, but the assigned
  2574  // register should be correct; this function merely sanity-checks that
  2575  // the register falls in the expected range.
  2576  func simdCheckRegOnly(v *ssa.Value, regStart, regEnd int16) int16 {
  2577  	if v.Reg() > regEnd || v.Reg() < regStart {
  2578  		panic("simdCheckRegOnly: not the desired register")
  2579  	}
  2580  	return v.Reg()
  2581  }
  2582  
