Source file src/cmd/compile/internal/ssa/regalloc.go

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Register allocation.
     6  //
     7  // We use a version of a linear scan register allocator. We treat the
     8  // whole function as a single long basic block and run through
     9  // it using a greedy register allocator. Then all merge edges
    10  // (those targeting a block with len(Preds)>1) are processed to
    11  // shuffle data into the place that the target of the edge expects.
    12  //
    13  // The greedy allocator moves values into registers just before they
    14  // are used, spills registers only when necessary, and spills the
    15  // value whose next use is farthest in the future.
    16  //
    17  // The register allocator requires that a block is not scheduled until
    18  // at least one of its predecessors have been scheduled. The most recent
    19  // such predecessor provides the starting register state for a block.
    20  //
    21  // It also requires that there are no critical edges (critical =
    22  // comes from a block with >1 successor and goes to a block with >1
    23  // predecessor).  This makes it easy to add fixup code on merge edges -
    24  // the source of a merge edge has only one successor, so we can add
    25  // fixup code to the end of that block.
    26  
    27  // Spilling
    28  //
    29  // During the normal course of the allocator, we might throw a still-live
    30  // value out of all registers. When that value is subsequently used, we must
    31  // load it from a slot on the stack. We must also issue an instruction to
    32  // initialize that stack location with a copy of v.
    33  //
    34  // pre-regalloc:
    35  //   (1) v = Op ...
    36  //   (2) x = Op ...
    37  //   (3) ... = Op v ...
    38  //
    39  // post-regalloc:
    40  //   (1) v = Op ...    : AX // computes v, store result in AX
    41  //       s = StoreReg v     // spill v to a stack slot
    42  //   (2) x = Op ...    : AX // some other op uses AX
    43  //       c = LoadReg s : CX // restore v from stack slot
    44  //   (3) ... = Op c ...     // use the restored value
    45  //
    46  // Allocation occurs normally until we reach (3) and we realize we have
    47  // a use of v and it isn't in any register. At that point, we allocate
    48  // a spill (a StoreReg) for v. We can't determine the correct place for
    49  // the spill at this point, so we allocate the spill as blockless initially.
    50  // The restore is then generated to load v back into a register so it can
    51  // be used. Subsequent uses of v will use the restored value c instead.
    52  //
    53  // What remains is the question of where to schedule the spill.
    54  // During allocation, we keep track of the dominator of all restores of v.
    55  // The spill of v must dominate that block. The spill must also be issued at
    56  // a point where v is still in a register.
    57  //
    58  // To find the right place, start at b, the block which dominates all restores.
    59  //  - If b is v.Block, then issue the spill right after v.
    60  //    It is known to be in a register at that point, and dominates any restores.
    61  //  - Otherwise, if v is in a register at the start of b,
    62  //    put the spill of v at the start of b.
    63  //  - Otherwise, set b = immediate dominator of b, and repeat.
    64  //
    65  // Phi values are special, as always. We define two kinds of phis, those
    66  // where the merge happens in a register (a "register" phi) and those where
    67  // the merge happens in a stack location (a "stack" phi).
    68  //
    69  // A register phi must have the phi and all of its inputs allocated to the
    70  // same register. Register phis are spilled similarly to regular ops.
    71  //
    72  // A stack phi must have the phi and all of its inputs allocated to the same
    73  // stack location. Stack phis start out life already spilled - each phi
    74  // input must be a store (using StoreReg) at the end of the corresponding
    75  // predecessor block.
    76  //     b1: y = ... : AX        b2: z = ... : BX
    77  //         y2 = StoreReg y         z2 = StoreReg z
    78  //         goto b3                 goto b3
    79  //     b3: x = phi(y2, z2)
    80  // The stack allocator knows that StoreReg args of stack-allocated phis
    81  // must be allocated to the same stack slot as the phi that uses them.
    82  // x is now a spilled value and a restore must appear before its first use.
    83  
    84  // TODO
    85  
    86  // Use an affinity graph to mark two values which should use the
    87  // same register. This affinity graph will be used to prefer certain
    88  // registers for allocation. This affinity helps eliminate moves that
    89  // are required for phi implementations and helps generate allocations
    90  // for 2-register architectures.
    91  
    92  // Note: regalloc generates a not-quite-SSA output. If we have:
    93  //
    94  //             b1: x = ... : AX
    95  //                 x2 = StoreReg x
    96  //                 ... AX gets reused for something else ...
    97  //                 if ... goto b3 else b4
    98  //
    99  //   b3: x3 = LoadReg x2 : BX       b4: x4 = LoadReg x2 : CX
   100  //       ... use x3 ...                 ... use x4 ...
   101  //
   102  //             b2: ... use x3 ...
   103  //
   104  // If b3 is the primary predecessor of b2, then we use x3 in b2 and
   105  // add a x4:CX->BX copy at the end of b4.
   106  // But the definition of x3 doesn't dominate b2.  We should really
   107  // insert an extra phi at the start of b2 (x5=phi(x3,x4):BX) to keep
   108  // SSA form. For now, we ignore this problem as remaining in strict
   109  // SSA form isn't needed after regalloc. We'll just leave the use
   110  // of x3 not dominated by the definition of x3, and the CX->BX copy
   111  // will have no use (so don't run deadcode after regalloc!).
   112  // TODO: maybe we should introduce these extra phis?
   113  
   114  package ssa
   115  
   116  import (
   117  	"cmd/compile/internal/base"
   118  	"cmd/compile/internal/ir"
   119  	"cmd/compile/internal/types"
   120  	"cmd/internal/src"
   121  	"cmd/internal/sys"
   122  	"cmp"
   123  	"fmt"
   124  	"internal/buildcfg"
   125  	"math"
   126  	"math/bits"
   127  	"slices"
   128  	"unsafe"
   129  )
   130  
   131  const (
   132  	moveSpills = iota
   133  	logSpills
   134  	regDebug
   135  	stackDebug
   136  )
   137  
   138  // distance is a measure of how far into the future values are used.
   139  // distance is measured in units of instructions.
   140  const (
   141  	likelyDistance   = 1
   142  	normalDistance   = 10
   143  	unlikelyDistance = 100
   144  )
   145  
   146  // regalloc performs register allocation on f. It sets f.RegAlloc
   147  // to the resulting allocation.
   148  func regalloc(f *Func) {
   149  	var s regAllocState
   150  	s.init(f)
   151  	s.regalloc(f)
   152  	s.close()
   153  }
   154  
   155  type register uint8
   156  
   157  const noRegister register = 255
   158  
   159  // For bulk initializing
   160  var noRegisters [32]register = [32]register{
   161  	noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister,
   162  	noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister,
   163  	noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister,
   164  	noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister,
   165  }
   166  
   167  // A regMask encodes a set of machine registers.
   168  // TODO: regMask -> regSet?
   169  type regMask uint64
   170  
   171  func (m regMask) String() string {
   172  	s := ""
   173  	for r := register(0); m != 0; r++ {
   174  		if m>>r&1 == 0 {
   175  			continue
   176  		}
   177  		m &^= regMask(1) << r
   178  		if s != "" {
   179  			s += " "
   180  		}
   181  		s += fmt.Sprintf("r%d", r)
   182  	}
   183  	return s
   184  }
   185  
   186  func (m regMask) contains(r register) bool {
   187  	return m>>r&1 != 0
   188  }
   189  
   190  func (s *regAllocState) RegMaskString(m regMask) string {
   191  	str := ""
   192  	for r := register(0); m != 0; r++ {
   193  		if m>>r&1 == 0 {
   194  			continue
   195  		}
   196  		m &^= regMask(1) << r
   197  		if str != "" {
   198  			str += " "
   199  		}
   200  		str += s.registers[r].String()
   201  	}
   202  	return str
   203  }
   204  
   205  // countRegs returns the number of set bits in the register mask.
   206  func countRegs(r regMask) int {
   207  	return bits.OnesCount64(uint64(r))
   208  }
   209  
   210  // pickReg picks an arbitrary register from the register mask.
   211  func pickReg(r regMask) register {
   212  	if r == 0 {
   213  		panic("can't pick a register from an empty set")
   214  	}
   215  	// pick the lowest one
   216  	return register(bits.TrailingZeros64(uint64(r)))
   217  }
   218  
   219  type use struct {
   220  	// distance from start of the block to a use of a value
   221  	//   dist == 0                 used by first instruction in block
   222  	//   dist == len(b.Values)-1   used by last instruction in block
   223  	//   dist == len(b.Values)     used by block's control value
   224  	//   dist  > len(b.Values)     used by a subsequent block
   225  	dist int32
   226  	pos  src.XPos // source position of the use
   227  	next *use     // linked list of uses of a value in nondecreasing dist order
   228  }
   229  
   230  // A valState records the register allocation state for a (pre-regalloc) value.
   231  type valState struct {
   232  	regs              regMask // the set of registers holding a Value (usually just one)
   233  	uses              *use    // list of uses in this block
   234  	spill             *Value  // spilled copy of the Value (if any)
   235  	restoreMin        int32   // minimum of all restores' blocks' sdom.entry
   236  	restoreMax        int32   // maximum of all restores' blocks' sdom.exit
   237  	needReg           bool    // cached value of !v.Type.IsMemory() && !v.Type.IsVoid() && !.v.Type.IsFlags()
   238  	rematerializeable bool    // cached value of v.rematerializeable()
   239  }
   240  
   241  type regState struct {
   242  	v *Value // Original (preregalloc) Value stored in this register.
   243  	c *Value // A Value equal to v which is currently in a register.  Might be v or a copy of it.
   244  	// If a register is unused, v==c==nil
   245  }
   246  
   247  type regAllocState struct {
   248  	f *Func
   249  
   250  	sdom        SparseTree
   251  	registers   []Register
   252  	numRegs     register
   253  	SPReg       register
   254  	SBReg       register
   255  	GReg        register
   256  	ZeroIntReg  register
   257  	allocatable regMask
   258  
   259  	// live values at the end of each block.  live[b.ID] is a list of value IDs
   260  	// which are live at the end of b, together with a count of how many instructions
   261  	// forward to the next use.
   262  	live [][]liveInfo
   263  	// desired register assignments at the end of each block.
   264  	// Note that this is a static map computed before allocation occurs. Dynamic
   265  	// register desires (from partially completed allocations) will trump
   266  	// this information.
   267  	desired []desiredState
   268  
   269  	// current state of each (preregalloc) Value
   270  	values []valState
   271  
   272  	// ID of SP, SB values
   273  	sp, sb ID
   274  
   275  	// For each Value, map from its value ID back to the
   276  	// preregalloc Value it was derived from.
   277  	orig []*Value
   278  
   279  	// current state of each register.
   280  	// Includes only registers in allocatable.
   281  	regs []regState
   282  
   283  	// registers that contain values which can't be kicked out
   284  	nospill regMask
   285  
   286  	// mask of registers currently in use
   287  	used regMask
   288  
   289  	// mask of registers used since the start of the current block
   290  	usedSinceBlockStart regMask
   291  
   292  	// mask of registers used in the current instruction
   293  	tmpused regMask
   294  
   295  	// current block we're working on
   296  	curBlock *Block
   297  
   298  	// cache of use records
   299  	freeUseRecords *use
   300  
   301  	// endRegs[blockid] is the register state at the end of each block.
   302  	// encoded as a set of endReg records.
   303  	endRegs [][]endReg
   304  
   305  	// startRegs[blockid] is the register state at the start of merge blocks.
   306  	// saved state does not include the state of phi ops in the block.
   307  	startRegs [][]startReg
   308  
   309  	// startRegsMask is a mask of the registers in startRegs[curBlock.ID].
   310  	// Registers dropped from startRegsMask are later synchronoized back to
   311  	// startRegs by dropping from there as well.
   312  	startRegsMask regMask
   313  
   314  	// spillLive[blockid] is the set of live spills at the end of each block
   315  	spillLive [][]ID
   316  
   317  	// a set of copies we generated to move things around, and
   318  	// whether it is used in shuffle. Unused copies will be deleted.
   319  	copies map[*Value]bool
   320  
   321  	loopnest *loopnest
   322  
   323  	// choose a good order in which to visit blocks for allocation purposes.
   324  	visitOrder []*Block
   325  
   326  	// blockOrder[b.ID] corresponds to the index of block b in visitOrder.
   327  	blockOrder []int32
   328  
   329  	// whether to insert instructions that clobber dead registers at call sites
   330  	doClobber bool
   331  
   332  	// For each instruction index in a basic block, the index of the next call
   333  	// at or after that instruction index.
   334  	// If there is no next call, returns maxInt32.
   335  	// nextCall for a call instruction points to itself.
   336  	// (Indexes and results are pre-regalloc.)
   337  	nextCall []int32
   338  
   339  	// Index of the instruction we're currently working on.
   340  	// Index is expressed in terms of the pre-regalloc b.Values list.
   341  	curIdx int
   342  }
   343  
   344  type endReg struct {
   345  	r register
   346  	v *Value // pre-regalloc value held in this register (TODO: can we use ID here?)
   347  	c *Value // cached version of the value
   348  }
   349  
   350  type startReg struct {
   351  	r   register
   352  	v   *Value   // pre-regalloc value needed in this register
   353  	c   *Value   // cached version of the value
   354  	pos src.XPos // source position of use of this register
   355  }
   356  
   357  // freeReg frees up register r. Any current user of r is kicked out.
   358  func (s *regAllocState) freeReg(r register) {
   359  	if !s.allocatable.contains(r) && !s.isGReg(r) {
   360  		return
   361  	}
   362  	v := s.regs[r].v
   363  	if v == nil {
   364  		s.f.Fatalf("tried to free an already free register %d\n", r)
   365  	}
   366  
   367  	// Mark r as unused.
   368  	if s.f.pass.debug > regDebug {
   369  		fmt.Printf("freeReg %s (dump %s/%s)\n", &s.registers[r], v, s.regs[r].c)
   370  	}
   371  	s.regs[r] = regState{}
   372  	s.values[v.ID].regs &^= regMask(1) << r
   373  	s.used &^= regMask(1) << r
   374  }
   375  
   376  // freeRegs frees up all registers listed in m.
   377  func (s *regAllocState) freeRegs(m regMask) {
   378  	for m&s.used != 0 {
   379  		s.freeReg(pickReg(m & s.used))
   380  	}
   381  }
   382  
   383  // clobberRegs inserts instructions that clobber registers listed in m.
   384  func (s *regAllocState) clobberRegs(m regMask) {
   385  	m &= s.allocatable & s.f.Config.gpRegMask // only integer register can contain pointers, only clobber them
   386  	for m != 0 {
   387  		r := pickReg(m)
   388  		m &^= 1 << r
   389  		x := s.curBlock.NewValue0(src.NoXPos, OpClobberReg, types.TypeVoid)
   390  		s.f.setHome(x, &s.registers[r])
   391  	}
   392  }
   393  
   394  // setOrig records that c's original value is the same as
   395  // v's original value.
   396  func (s *regAllocState) setOrig(c *Value, v *Value) {
   397  	if int(c.ID) >= cap(s.orig) {
   398  		x := s.f.Cache.allocValueSlice(int(c.ID) + 1)
   399  		copy(x, s.orig)
   400  		s.f.Cache.freeValueSlice(s.orig)
   401  		s.orig = x
   402  	}
   403  	for int(c.ID) >= len(s.orig) {
   404  		s.orig = append(s.orig, nil)
   405  	}
   406  	if s.orig[c.ID] != nil {
   407  		s.f.Fatalf("orig value set twice %s %s", c, v)
   408  	}
   409  	s.orig[c.ID] = s.orig[v.ID]
   410  }
   411  
   412  // assignReg assigns register r to hold c, a copy of v.
   413  // r must be unused.
   414  func (s *regAllocState) assignReg(r register, v *Value, c *Value) {
   415  	if s.f.pass.debug > regDebug {
   416  		fmt.Printf("assignReg %s %s/%s\n", &s.registers[r], v, c)
   417  	}
   418  	// Allocate v to r.
   419  	s.values[v.ID].regs |= regMask(1) << r
   420  	s.f.setHome(c, &s.registers[r])
   421  
   422  	// Allocate r to v.
   423  	if !s.allocatable.contains(r) && !s.isGReg(r) {
   424  		return
   425  	}
   426  	if s.regs[r].v != nil {
   427  		s.f.Fatalf("tried to assign register %d to %s/%s but it is already used by %s", r, v, c, s.regs[r].v)
   428  	}
   429  	s.regs[r] = regState{v, c}
   430  	s.used |= regMask(1) << r
   431  }
   432  
   433  // allocReg chooses a register from the set of registers in mask.
   434  // If there is no unused register, a Value will be kicked out of
   435  // a register to make room.
   436  func (s *regAllocState) allocReg(mask regMask, v *Value) register {
   437  	if v.OnWasmStack {
   438  		return noRegister
   439  	}
   440  
   441  	mask &= s.allocatable
   442  	mask &^= s.nospill
   443  	if mask == 0 {
   444  		s.f.Fatalf("no register available for %s", v.LongString())
   445  	}
   446  
   447  	// Pick an unused register if one is available.
   448  	if mask&^s.used != 0 {
   449  		r := pickReg(mask &^ s.used)
   450  		s.usedSinceBlockStart |= regMask(1) << r
   451  		return r
   452  	}
   453  
   454  	// Pick a value to spill. Spill the value with the
   455  	// farthest-in-the-future use.
   456  	// TODO: Prefer registers with already spilled Values?
   457  	// TODO: Modify preference using affinity graph.
   458  	// TODO: if a single value is in multiple registers, spill one of them
   459  	// before spilling a value in just a single register.
   460  
   461  	// Find a register to spill. We spill the register containing the value
   462  	// whose next use is as far in the future as possible.
   463  	// https://en.wikipedia.org/wiki/Page_replacement_algorithm#The_theoretically_optimal_page_replacement_algorithm
   464  	var r register
   465  	maxuse := int32(-1)
   466  	for t := register(0); t < s.numRegs; t++ {
   467  		if mask>>t&1 == 0 {
   468  			continue
   469  		}
   470  		v := s.regs[t].v
   471  		if n := s.values[v.ID].uses.dist; n > maxuse {
   472  			// v's next use is farther in the future than any value
   473  			// we've seen so far. A new best spill candidate.
   474  			r = t
   475  			maxuse = n
   476  		}
   477  	}
   478  	if maxuse == -1 {
   479  		s.f.Fatalf("couldn't find register to spill")
   480  	}
   481  
   482  	if s.f.Config.ctxt.Arch.Arch == sys.ArchWasm {
   483  		// TODO(neelance): In theory this should never happen, because all wasm registers are equal.
   484  		// So if there is still a free register, the allocation should have picked that one in the first place instead of
   485  		// trying to kick some other value out. In practice, this case does happen and it breaks the stack optimization.
   486  		s.freeReg(r)
   487  		return r
   488  	}
   489  
   490  	// Try to move it around before kicking out, if there is a free register.
   491  	// We generate a Copy and record it. It will be deleted if never used.
   492  	v2 := s.regs[r].v
   493  	m := s.compatRegs(v2.Type) &^ s.used &^ s.tmpused &^ (regMask(1) << r)
   494  	if m != 0 && !s.values[v2.ID].rematerializeable && countRegs(s.values[v2.ID].regs) == 1 {
   495  		s.usedSinceBlockStart |= regMask(1) << r
   496  		r2 := pickReg(m)
   497  		c := s.curBlock.NewValue1(v2.Pos, OpCopy, v2.Type, s.regs[r].c)
   498  		s.copies[c] = false
   499  		if s.f.pass.debug > regDebug {
   500  			fmt.Printf("copy %s to %s : %s\n", v2, c, &s.registers[r2])
   501  		}
   502  		s.setOrig(c, v2)
   503  		s.assignReg(r2, v2, c)
   504  	}
   505  
   506  	// If the evicted register isn't used between the start of the block
   507  	// and now then there is no reason to even request it on entry. We can
   508  	// drop from startRegs in that case.
   509  	if s.usedSinceBlockStart&(regMask(1)<<r) == 0 {
   510  		if s.startRegsMask&(regMask(1)<<r) == 1 {
   511  			if s.f.pass.debug > regDebug {
   512  				fmt.Printf("dropped from startRegs: %s\n", &s.registers[r])
   513  			}
   514  			s.startRegsMask &^= regMask(1) << r
   515  		}
   516  	}
   517  
   518  	s.freeReg(r)
   519  	s.usedSinceBlockStart |= regMask(1) << r
   520  	return r
   521  }
   522  
   523  // makeSpill returns a Value which represents the spilled value of v.
   524  // b is the block in which the spill is used.
   525  func (s *regAllocState) makeSpill(v *Value, b *Block) *Value {
   526  	vi := &s.values[v.ID]
   527  	if vi.spill != nil {
   528  		// Final block not known - keep track of subtree where restores reside.
   529  		vi.restoreMin = min(vi.restoreMin, s.sdom[b.ID].entry)
   530  		vi.restoreMax = max(vi.restoreMax, s.sdom[b.ID].exit)
   531  		return vi.spill
   532  	}
   533  	// Make a spill for v. We don't know where we want
   534  	// to put it yet, so we leave it blockless for now.
   535  	spill := s.f.newValueNoBlock(OpStoreReg, v.Type, v.Pos)
   536  	// We also don't know what the spill's arg will be.
   537  	// Leave it argless for now.
   538  	s.setOrig(spill, v)
   539  	vi.spill = spill
   540  	vi.restoreMin = s.sdom[b.ID].entry
   541  	vi.restoreMax = s.sdom[b.ID].exit
   542  	return spill
   543  }
   544  
   545  // allocValToReg allocates v to a register selected from regMask and
   546  // returns the register copy of v. Any previous user is kicked out and spilled
   547  // (if necessary). Load code is added at the current pc. If nospill is set the
   548  // allocated register is marked nospill so the assignment cannot be
   549  // undone until the caller allows it by clearing nospill. Returns a
   550  // *Value which is either v or a copy of v allocated to the chosen register.
   551  func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, pos src.XPos) *Value {
   552  	if s.f.Config.ctxt.Arch.Arch == sys.ArchWasm && v.rematerializeable() {
   553  		c := v.copyIntoWithXPos(s.curBlock, pos)
   554  		c.OnWasmStack = true
   555  		s.setOrig(c, v)
   556  		return c
   557  	}
   558  	if v.OnWasmStack {
   559  		return v
   560  	}
   561  
   562  	vi := &s.values[v.ID]
   563  	pos = pos.WithNotStmt()
   564  	// Check if v is already in a requested register.
   565  	if mask&vi.regs != 0 {
   566  		mask &= vi.regs
   567  		r := pickReg(mask)
   568  		if mask.contains(s.SPReg) {
   569  			// Prefer the stack pointer if it is allowed.
   570  			// (Needed because the op might have an Aux symbol
   571  			// that needs SP as its base.)
   572  			r = s.SPReg
   573  		}
   574  		if !s.allocatable.contains(r) {
   575  			return v // v is in a fixed register
   576  		}
   577  		if s.regs[r].v != v || s.regs[r].c == nil {
   578  			panic("bad register state")
   579  		}
   580  		if nospill {
   581  			s.nospill |= regMask(1) << r
   582  		}
   583  		s.usedSinceBlockStart |= regMask(1) << r
   584  		return s.regs[r].c
   585  	}
   586  
   587  	var r register
   588  	// If nospill is set, the value is used immediately, so it can live on the WebAssembly stack.
   589  	onWasmStack := nospill && s.f.Config.ctxt.Arch.Arch == sys.ArchWasm
   590  	if !onWasmStack {
   591  		// Allocate a register.
   592  		r = s.allocReg(mask, v)
   593  	}
   594  
   595  	// Allocate v to the new register.
   596  	var c *Value
   597  	if vi.regs != 0 {
   598  		// Copy from a register that v is already in.
   599  		var current *Value
   600  		if vi.regs&^s.allocatable != 0 {
   601  			// v is in a fixed register, prefer that
   602  			current = v
   603  		} else {
   604  			r2 := pickReg(vi.regs)
   605  			if s.regs[r2].v != v {
   606  				panic("bad register state")
   607  			}
   608  			current = s.regs[r2].c
   609  			s.usedSinceBlockStart |= regMask(1) << r2
   610  		}
   611  		c = s.curBlock.NewValue1(pos, OpCopy, v.Type, current)
   612  	} else if v.rematerializeable() {
   613  		// Rematerialize instead of loading from the spill location.
   614  		c = v.copyIntoWithXPos(s.curBlock, pos)
   615  		// We need to consider its output mask and potentially issue a Copy
   616  		// if there are register mask conflicts.
   617  		// This currently happens for the SIMD package only between GP and FP
   618  		// register. Because Intel's vector extension can put integer value into
   619  		// FP, which is seen as a vector. Example instruction: VPSLL[BWDQ]
   620  		// Because GP and FP masks do not overlap, mask & outputMask == 0
   621  		// detects this situation thoroughly.
   622  		sourceMask := s.regspec(c).outputs[0].regs
   623  		if mask&sourceMask == 0 && !onWasmStack {
   624  			s.setOrig(c, v)
   625  			s.assignReg(s.allocReg(sourceMask, v), v, c)
   626  			// v.Type for the new OpCopy is likely wrong and it might delay the problem
   627  			// until ssa to asm lowering, which might need the types to generate the right
   628  			// assembly for OpCopy. For Intel's GP to FP move, it happens to be that
   629  			// MOV instruction has such a variant so it happens to be right.
   630  			// But it's unclear for other architectures or situations, and the problem
   631  			// might be exposed when the assembler sees illegal instructions.
   632  			// Right now make we still pick v.Type, because at least its size should be correct
   633  			// for the rematerialization case the amd64 SIMD package exposed.
   634  			// TODO: We might need to figure out a way to find the correct type or make
   635  			// the asm lowering use reg info only for OpCopy.
   636  			c = s.curBlock.NewValue1(pos, OpCopy, v.Type, c)
   637  		}
   638  	} else {
   639  		// Load v from its spill location.
   640  		spill := s.makeSpill(v, s.curBlock)
   641  		if s.f.pass.debug > logSpills {
   642  			s.f.Warnl(vi.spill.Pos, "load spill for %v from %v", v, spill)
   643  		}
   644  		c = s.curBlock.NewValue1(pos, OpLoadReg, v.Type, spill)
   645  	}
   646  
   647  	s.setOrig(c, v)
   648  
   649  	if onWasmStack {
   650  		c.OnWasmStack = true
   651  		return c
   652  	}
   653  
   654  	s.assignReg(r, v, c)
   655  	if c.Op == OpLoadReg && s.isGReg(r) {
   656  		s.f.Fatalf("allocValToReg.OpLoadReg targeting g: " + c.LongString())
   657  	}
   658  	if nospill {
   659  		s.nospill |= regMask(1) << r
   660  	}
   661  	return c
   662  }
   663  
   664  // isLeaf reports whether f performs any calls.
   665  func isLeaf(f *Func) bool {
   666  	for _, b := range f.Blocks {
   667  		for _, v := range b.Values {
   668  			if v.Op.IsCall() && !v.Op.IsTailCall() {
   669  				// tail call is not counted as it does not save the return PC or need a frame
   670  				return false
   671  			}
   672  		}
   673  	}
   674  	return true
   675  }
   676  
   677  // needRegister reports whether v needs a register.
   678  func (v *Value) needRegister() bool {
   679  	return !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && !v.Type.IsTuple()
   680  }
   681  
   682  func (s *regAllocState) init(f *Func) {
   683  	s.f = f
   684  	s.f.RegAlloc = s.f.Cache.locs[:0]
   685  	s.registers = f.Config.registers
   686  	if nr := len(s.registers); nr == 0 || nr > int(noRegister) || nr > int(unsafe.Sizeof(regMask(0))*8) {
   687  		s.f.Fatalf("bad number of registers: %d", nr)
   688  	} else {
   689  		s.numRegs = register(nr)
   690  	}
   691  	// Locate SP, SB, and g registers.
   692  	s.SPReg = noRegister
   693  	s.SBReg = noRegister
   694  	s.GReg = noRegister
   695  	s.ZeroIntReg = noRegister
   696  	for r := register(0); r < s.numRegs; r++ {
   697  		switch s.registers[r].String() {
   698  		case "SP":
   699  			s.SPReg = r
   700  		case "SB":
   701  			s.SBReg = r
   702  		case "g":
   703  			s.GReg = r
   704  		case "ZERO": // TODO: arch-specific?
   705  			s.ZeroIntReg = r
   706  		}
   707  	}
   708  	// Make sure we found all required registers.
   709  	switch noRegister {
   710  	case s.SPReg:
   711  		s.f.Fatalf("no SP register found")
   712  	case s.SBReg:
   713  		s.f.Fatalf("no SB register found")
   714  	case s.GReg:
   715  		if f.Config.hasGReg {
   716  			s.f.Fatalf("no g register found")
   717  		}
   718  	}
   719  
   720  	// Figure out which registers we're allowed to use.
   721  	s.allocatable = s.f.Config.gpRegMask | s.f.Config.fpRegMask | s.f.Config.specialRegMask
   722  	s.allocatable &^= 1 << s.SPReg
   723  	s.allocatable &^= 1 << s.SBReg
   724  	if s.f.Config.hasGReg {
   725  		s.allocatable &^= 1 << s.GReg
   726  	}
   727  	if s.ZeroIntReg != noRegister {
   728  		s.allocatable &^= 1 << s.ZeroIntReg
   729  	}
   730  	if buildcfg.FramePointerEnabled && s.f.Config.FPReg >= 0 {
   731  		s.allocatable &^= 1 << uint(s.f.Config.FPReg)
   732  	}
   733  	if s.f.Config.LinkReg != -1 {
   734  		if isLeaf(f) {
   735  			// Leaf functions don't save/restore the link register.
   736  			s.allocatable &^= 1 << uint(s.f.Config.LinkReg)
   737  		}
   738  	}
   739  	if s.f.Config.ctxt.Flag_dynlink {
   740  		switch s.f.Config.arch {
   741  		case "386":
   742  			// nothing to do.
   743  			// Note that for Flag_shared (position independent code)
   744  			// we do need to be careful, but that carefulness is hidden
   745  			// in the rewrite rules so we always have a free register
   746  			// available for global load/stores. See _gen/386.rules (search for Flag_shared).
   747  		case "amd64":
   748  			s.allocatable &^= 1 << 15 // R15
   749  		case "arm":
   750  			s.allocatable &^= 1 << 9 // R9
   751  		case "arm64":
   752  			// nothing to do
   753  		case "loong64": // R2 (aka TP) already reserved.
   754  			// nothing to do
   755  		case "ppc64le": // R2 already reserved.
   756  			// nothing to do
   757  		case "riscv64": // X3 (aka GP) and X4 (aka TP) already reserved.
   758  			// nothing to do
   759  		case "s390x":
   760  			s.allocatable &^= 1 << 11 // R11
   761  		default:
   762  			s.f.fe.Fatalf(src.NoXPos, "arch %s not implemented", s.f.Config.arch)
   763  		}
   764  	}
   765  
   766  	// Linear scan register allocation can be influenced by the order in which blocks appear.
   767  	// Decouple the register allocation order from the generated block order.
   768  	// This also creates an opportunity for experiments to find a better order.
   769  	s.visitOrder = layoutRegallocOrder(f)
   770  
   771  	// Compute block order. This array allows us to distinguish forward edges
   772  	// from backward edges and compute how far they go.
   773  	s.blockOrder = make([]int32, f.NumBlocks())
   774  	for i, b := range s.visitOrder {
   775  		s.blockOrder[b.ID] = int32(i)
   776  	}
   777  
   778  	s.regs = make([]regState, s.numRegs)
   779  	nv := f.NumValues()
   780  	if cap(s.f.Cache.regallocValues) >= nv {
   781  		s.f.Cache.regallocValues = s.f.Cache.regallocValues[:nv]
   782  	} else {
   783  		s.f.Cache.regallocValues = make([]valState, nv)
   784  	}
   785  	s.values = s.f.Cache.regallocValues
   786  	s.orig = s.f.Cache.allocValueSlice(nv)
   787  	s.copies = make(map[*Value]bool)
   788  	for _, b := range s.visitOrder {
   789  		for _, v := range b.Values {
   790  			if v.needRegister() {
   791  				s.values[v.ID].needReg = true
   792  				s.values[v.ID].rematerializeable = v.rematerializeable()
   793  				s.orig[v.ID] = v
   794  			}
   795  			// Note: needReg is false for values returning Tuple types.
   796  			// Instead, we mark the corresponding Selects as needReg.
   797  		}
   798  	}
   799  	s.computeLive()
   800  
   801  	s.endRegs = make([][]endReg, f.NumBlocks())
   802  	s.startRegs = make([][]startReg, f.NumBlocks())
   803  	s.spillLive = make([][]ID, f.NumBlocks())
   804  	s.sdom = f.Sdom()
   805  
   806  	// wasm: Mark instructions that can be optimized to have their values only on the WebAssembly stack.
   807  	if f.Config.ctxt.Arch.Arch == sys.ArchWasm {
   808  		canLiveOnStack := f.newSparseSet(f.NumValues())
   809  		defer f.retSparseSet(canLiveOnStack)
   810  		for _, b := range f.Blocks {
   811  			// New block. Clear candidate set.
   812  			canLiveOnStack.clear()
   813  			for _, c := range b.ControlValues() {
   814  				if c.Uses == 1 && !opcodeTable[c.Op].generic {
   815  					canLiveOnStack.add(c.ID)
   816  				}
   817  			}
   818  			// Walking backwards.
   819  			for i := len(b.Values) - 1; i >= 0; i-- {
   820  				v := b.Values[i]
   821  				if canLiveOnStack.contains(v.ID) {
   822  					v.OnWasmStack = true
   823  				} else {
   824  					// Value can not live on stack. Values are not allowed to be reordered, so clear candidate set.
   825  					canLiveOnStack.clear()
   826  				}
   827  				for _, arg := range v.Args {
   828  					// Value can live on the stack if:
   829  					// - it is only used once
   830  					// - it is used in the same basic block
   831  					// - it is not a "mem" value
   832  					// - it is a WebAssembly op
   833  					if arg.Uses == 1 && arg.Block == v.Block && !arg.Type.IsMemory() && !opcodeTable[arg.Op].generic {
   834  						canLiveOnStack.add(arg.ID)
   835  					}
   836  				}
   837  			}
   838  		}
   839  	}
   840  
   841  	// The clobberdeadreg experiment inserts code to clobber dead registers
   842  	// at call sites.
   843  	// Ignore huge functions to avoid doing too much work.
   844  	if base.Flag.ClobberDeadReg && len(s.f.Blocks) <= 10000 {
   845  		// TODO: honor GOCLOBBERDEADHASH, or maybe GOSSAHASH.
   846  		s.doClobber = true
   847  	}
   848  }
   849  
   850  func (s *regAllocState) close() {
   851  	s.f.Cache.freeValueSlice(s.orig)
   852  }
   853  
   854  // Adds a use record for id at distance dist from the start of the block.
   855  // All calls to addUse must happen with nonincreasing dist.
   856  func (s *regAllocState) addUse(id ID, dist int32, pos src.XPos) {
   857  	r := s.freeUseRecords
   858  	if r != nil {
   859  		s.freeUseRecords = r.next
   860  	} else {
   861  		r = &use{}
   862  	}
   863  	r.dist = dist
   864  	r.pos = pos
   865  	r.next = s.values[id].uses
   866  	s.values[id].uses = r
   867  	if r.next != nil && dist > r.next.dist {
   868  		s.f.Fatalf("uses added in wrong order")
   869  	}
   870  }
   871  
   872  // advanceUses advances the uses of v's args from the state before v to the state after v.
   873  // Any values which have no more uses are deallocated from registers.
   874  func (s *regAllocState) advanceUses(v *Value) {
   875  	for _, a := range v.Args {
   876  		if !s.values[a.ID].needReg {
   877  			continue
   878  		}
   879  		ai := &s.values[a.ID]
   880  		r := ai.uses
   881  		ai.uses = r.next
   882  		if r.next == nil || (!opcodeTable[a.Op].fixedReg && r.next.dist > s.nextCall[s.curIdx]) {
   883  			// Value is dead (or is not used again until after a call), free all registers that hold it.
   884  			s.freeRegs(ai.regs)
   885  		}
   886  		r.next = s.freeUseRecords
   887  		s.freeUseRecords = r
   888  	}
   889  	s.dropIfUnused(v)
   890  }
   891  
   892  // Drop v from registers if it isn't used again, or its only uses are after
   893  // a call instruction.
   894  func (s *regAllocState) dropIfUnused(v *Value) {
   895  	if !s.values[v.ID].needReg {
   896  		return
   897  	}
   898  	vi := &s.values[v.ID]
   899  	r := vi.uses
   900  	nextCall := s.nextCall[s.curIdx]
   901  	if opcodeTable[v.Op].call {
   902  		if s.curIdx == len(s.nextCall)-1 {
   903  			nextCall = math.MaxInt32
   904  		} else {
   905  			nextCall = s.nextCall[s.curIdx+1]
   906  		}
   907  	}
   908  	if r == nil || (!opcodeTable[v.Op].fixedReg && r.dist > nextCall) {
   909  		s.freeRegs(vi.regs)
   910  	}
   911  }
   912  
   913  // liveAfterCurrentInstruction reports whether v is live after
   914  // the current instruction is completed.  v must be used by the
   915  // current instruction.
   916  func (s *regAllocState) liveAfterCurrentInstruction(v *Value) bool {
   917  	u := s.values[v.ID].uses
   918  	if u == nil {
   919  		panic(fmt.Errorf("u is nil, v = %s, s.values[v.ID] = %v", v.LongString(), s.values[v.ID]))
   920  	}
   921  	d := u.dist
   922  	for u != nil && u.dist == d {
   923  		u = u.next
   924  	}
   925  	return u != nil && u.dist > d
   926  }
   927  
   928  // Sets the state of the registers to that encoded in regs.
   929  func (s *regAllocState) setState(regs []endReg) {
   930  	s.freeRegs(s.used)
   931  	for _, x := range regs {
   932  		s.assignReg(x.r, x.v, x.c)
   933  	}
   934  }
   935  
   936  // compatRegs returns the set of registers which can store a type t.
   937  func (s *regAllocState) compatRegs(t *types.Type) regMask {
   938  	var m regMask
   939  	if t.IsTuple() || t.IsFlags() {
   940  		return 0
   941  	}
   942  	if t.IsSIMD() {
   943  		if t.Size() > 8 {
   944  			return s.f.Config.fpRegMask & s.allocatable
   945  		} else {
   946  			// K mask
   947  			return s.f.Config.gpRegMask & s.allocatable
   948  		}
   949  	}
   950  	if t.IsFloat() || t == types.TypeInt128 {
   951  		if t.Kind() == types.TFLOAT32 && s.f.Config.fp32RegMask != 0 {
   952  			m = s.f.Config.fp32RegMask
   953  		} else if t.Kind() == types.TFLOAT64 && s.f.Config.fp64RegMask != 0 {
   954  			m = s.f.Config.fp64RegMask
   955  		} else {
   956  			m = s.f.Config.fpRegMask
   957  		}
   958  	} else {
   959  		m = s.f.Config.gpRegMask
   960  	}
   961  	return m & s.allocatable
   962  }
   963  
   964  // regspec returns the regInfo for operation op.
   965  func (s *regAllocState) regspec(v *Value) regInfo {
   966  	op := v.Op
   967  	if op == OpConvert {
   968  		// OpConvert is a generic op, so it doesn't have a
   969  		// register set in the static table. It can use any
   970  		// allocatable integer register.
   971  		m := s.allocatable & s.f.Config.gpRegMask
   972  		return regInfo{inputs: []inputInfo{{regs: m}}, outputs: []outputInfo{{regs: m}}}
   973  	}
   974  	if op == OpArgIntReg {
   975  		reg := v.Block.Func.Config.intParamRegs[v.AuxInt8()]
   976  		return regInfo{outputs: []outputInfo{{regs: 1 << uint(reg)}}}
   977  	}
   978  	if op == OpArgFloatReg {
   979  		reg := v.Block.Func.Config.floatParamRegs[v.AuxInt8()]
   980  		return regInfo{outputs: []outputInfo{{regs: 1 << uint(reg)}}}
   981  	}
   982  	if op.IsCall() {
   983  		if ac, ok := v.Aux.(*AuxCall); ok && ac.reg != nil {
   984  			return *ac.Reg(&opcodeTable[op].reg, s.f.Config)
   985  		}
   986  	}
   987  	if op == OpMakeResult && s.f.OwnAux.reg != nil {
   988  		return *s.f.OwnAux.ResultReg(s.f.Config)
   989  	}
   990  	return opcodeTable[op].reg
   991  }
   992  
   993  func (s *regAllocState) isGReg(r register) bool {
   994  	return s.f.Config.hasGReg && s.GReg == r
   995  }
   996  
   997  // Dummy value used to represent the value being held in a temporary register.
   998  var tmpVal Value
   999  
  1000  func (s *regAllocState) regalloc(f *Func) {
  1001  	regValLiveSet := f.newSparseSet(f.NumValues()) // set of values that may be live in register
  1002  	defer f.retSparseSet(regValLiveSet)
  1003  	var oldSched []*Value
  1004  	var phis []*Value
  1005  	var phiRegs []register
  1006  	var args []*Value
  1007  
  1008  	// Data structure used for computing desired registers.
  1009  	var desired desiredState
  1010  	desiredSecondReg := map[ID][4]register{} // desired register allocation for 2nd part of a tuple
  1011  
  1012  	// Desired registers for inputs & outputs for each instruction in the block.
  1013  	type dentry struct {
  1014  		out [4]register    // desired output registers
  1015  		in  [3][4]register // desired input registers (for inputs 0,1, and 2)
  1016  	}
  1017  	var dinfo []dentry
  1018  
  1019  	if f.Entry != f.Blocks[0] {
  1020  		f.Fatalf("entry block must be first")
  1021  	}
  1022  
  1023  	for _, b := range s.visitOrder {
  1024  		if s.f.pass.debug > regDebug {
  1025  			fmt.Printf("Begin processing block %v\n", b)
  1026  		}
  1027  		s.curBlock = b
  1028  		s.startRegsMask = 0
  1029  		s.usedSinceBlockStart = 0
  1030  		clear(desiredSecondReg)
  1031  
  1032  		// Initialize regValLiveSet and uses fields for this block.
  1033  		// Walk backwards through the block doing liveness analysis.
  1034  		regValLiveSet.clear()
  1035  		if s.live != nil {
  1036  			for _, e := range s.live[b.ID] {
  1037  				s.addUse(e.ID, int32(len(b.Values))+e.dist, e.pos) // pseudo-uses from beyond end of block
  1038  				regValLiveSet.add(e.ID)
  1039  			}
  1040  		}
  1041  		for _, v := range b.ControlValues() {
  1042  			if s.values[v.ID].needReg {
  1043  				s.addUse(v.ID, int32(len(b.Values)), b.Pos) // pseudo-use by control values
  1044  				regValLiveSet.add(v.ID)
  1045  			}
  1046  		}
  1047  		if cap(s.nextCall) < len(b.Values) {
  1048  			c := cap(s.nextCall)
  1049  			s.nextCall = append(s.nextCall[:c], make([]int32, len(b.Values)-c)...)
  1050  		} else {
  1051  			s.nextCall = s.nextCall[:len(b.Values)]
  1052  		}
  1053  		var nextCall int32 = math.MaxInt32
  1054  		for i := len(b.Values) - 1; i >= 0; i-- {
  1055  			v := b.Values[i]
  1056  			regValLiveSet.remove(v.ID)
  1057  			if v.Op == OpPhi {
  1058  				// Remove v from the live set, but don't add
  1059  				// any inputs. This is the state the len(b.Preds)>1
  1060  				// case below desires; it wants to process phis specially.
  1061  				s.nextCall[i] = nextCall
  1062  				continue
  1063  			}
  1064  			if opcodeTable[v.Op].call {
  1065  				// Function call clobbers all the registers but SP and SB.
  1066  				regValLiveSet.clear()
  1067  				if s.sp != 0 && s.values[s.sp].uses != nil {
  1068  					regValLiveSet.add(s.sp)
  1069  				}
  1070  				if s.sb != 0 && s.values[s.sb].uses != nil {
  1071  					regValLiveSet.add(s.sb)
  1072  				}
  1073  				nextCall = int32(i)
  1074  			}
  1075  			for _, a := range v.Args {
  1076  				if !s.values[a.ID].needReg {
  1077  					continue
  1078  				}
  1079  				s.addUse(a.ID, int32(i), v.Pos)
  1080  				regValLiveSet.add(a.ID)
  1081  			}
  1082  			s.nextCall[i] = nextCall
  1083  		}
  1084  		if s.f.pass.debug > regDebug {
  1085  			fmt.Printf("use distances for %s\n", b)
  1086  			for i := range s.values {
  1087  				vi := &s.values[i]
  1088  				u := vi.uses
  1089  				if u == nil {
  1090  					continue
  1091  				}
  1092  				fmt.Printf("  v%d:", i)
  1093  				for u != nil {
  1094  					fmt.Printf(" %d", u.dist)
  1095  					u = u.next
  1096  				}
  1097  				fmt.Println()
  1098  			}
  1099  		}
  1100  
  1101  		// Make a copy of the block schedule so we can generate a new one in place.
  1102  		// We make a separate copy for phis and regular values.
  1103  		nphi := 0
  1104  		for _, v := range b.Values {
  1105  			if v.Op != OpPhi {
  1106  				break
  1107  			}
  1108  			nphi++
  1109  		}
  1110  		phis = append(phis[:0], b.Values[:nphi]...)
  1111  		oldSched = append(oldSched[:0], b.Values[nphi:]...)
  1112  		b.Values = b.Values[:0]
  1113  
  1114  		// Initialize start state of block.
  1115  		if b == f.Entry {
  1116  			// Regalloc state is empty to start.
  1117  			if nphi > 0 {
  1118  				f.Fatalf("phis in entry block")
  1119  			}
  1120  		} else if len(b.Preds) == 1 {
  1121  			// Start regalloc state with the end state of the previous block.
  1122  			s.setState(s.endRegs[b.Preds[0].b.ID])
  1123  			if nphi > 0 {
  1124  				f.Fatalf("phis in single-predecessor block")
  1125  			}
  1126  			// Drop any values which are no longer live.
  1127  			// This may happen because at the end of p, a value may be
  1128  			// live but only used by some other successor of p.
  1129  			for r := register(0); r < s.numRegs; r++ {
  1130  				v := s.regs[r].v
  1131  				if v != nil && !regValLiveSet.contains(v.ID) {
  1132  					s.freeReg(r)
  1133  				}
  1134  			}
  1135  		} else {
  1136  			// This is the complicated case. We have more than one predecessor,
  1137  			// which means we may have Phi ops.
  1138  
  1139  			// Start with the final register state of the predecessor with least spill values.
  1140  			// This is based on the following points:
  1141  			// 1, The less spill value indicates that the register pressure of this path is smaller,
  1142  			//    so the values of this block are more likely to be allocated to registers.
  1143  			// 2, Avoid the predecessor that contains the function call, because the predecessor that
  1144  			//    contains the function call usually generates a lot of spills and lose the previous
  1145  			//    allocation state.
  1146  			// TODO: Improve this part. At least the size of endRegs of the predecessor also has
  1147  			// an impact on the code size and compiler speed. But it is not easy to find a simple
  1148  			// and efficient method that combines multiple factors.
  1149  			idx := -1
  1150  			for i, p := range b.Preds {
  1151  				// If the predecessor has not been visited yet, skip it because its end state
  1152  				// (redRegs and spillLive) has not been computed yet.
  1153  				pb := p.b
  1154  				if s.blockOrder[pb.ID] >= s.blockOrder[b.ID] {
  1155  					continue
  1156  				}
  1157  				if idx == -1 {
  1158  					idx = i
  1159  					continue
  1160  				}
  1161  				pSel := b.Preds[idx].b
  1162  				if len(s.spillLive[pb.ID]) < len(s.spillLive[pSel.ID]) {
  1163  					idx = i
  1164  				} else if len(s.spillLive[pb.ID]) == len(s.spillLive[pSel.ID]) {
  1165  					// Use a bit of likely information. After critical pass, pb and pSel must
  1166  					// be plain blocks, so check edge pb->pb.Preds instead of edge pb->b.
  1167  					// TODO: improve the prediction of the likely predecessor. The following
  1168  					// method is only suitable for the simplest cases. For complex cases,
  1169  					// the prediction may be inaccurate, but this does not affect the
  1170  					// correctness of the program.
  1171  					// According to the layout algorithm, the predecessor with the
  1172  					// smaller blockOrder is the true branch, and the test results show
  1173  					// that it is better to choose the predecessor with a smaller
  1174  					// blockOrder than no choice.
  1175  					if pb.likelyBranch() && !pSel.likelyBranch() || s.blockOrder[pb.ID] < s.blockOrder[pSel.ID] {
  1176  						idx = i
  1177  					}
  1178  				}
  1179  			}
  1180  			if idx < 0 {
  1181  				f.Fatalf("bad visitOrder, no predecessor of %s has been visited before it", b)
  1182  			}
  1183  			p := b.Preds[idx].b
  1184  			s.setState(s.endRegs[p.ID])
  1185  
  1186  			if s.f.pass.debug > regDebug {
  1187  				fmt.Printf("starting merge block %s with end state of %s:\n", b, p)
  1188  				for _, x := range s.endRegs[p.ID] {
  1189  					fmt.Printf("  %s: orig:%s cache:%s\n", &s.registers[x.r], x.v, x.c)
  1190  				}
  1191  			}
  1192  
  1193  			// Decide on registers for phi ops. Use the registers determined
  1194  			// by the primary predecessor if we can.
  1195  			// TODO: pick best of (already processed) predecessors?
  1196  			// Majority vote? Deepest nesting level?
  1197  			phiRegs = phiRegs[:0]
  1198  			var phiUsed regMask
  1199  
  1200  			for _, v := range phis {
  1201  				if !s.values[v.ID].needReg {
  1202  					phiRegs = append(phiRegs, noRegister)
  1203  					continue
  1204  				}
  1205  				a := v.Args[idx]
  1206  				// Some instructions target not-allocatable registers.
  1207  				// They're not suitable for further (phi-function) allocation.
  1208  				m := s.values[a.ID].regs &^ phiUsed & s.allocatable
  1209  				if m != 0 {
  1210  					r := pickReg(m)
  1211  					phiUsed |= regMask(1) << r
  1212  					phiRegs = append(phiRegs, r)
  1213  				} else {
  1214  					phiRegs = append(phiRegs, noRegister)
  1215  				}
  1216  			}
  1217  
  1218  			// Second pass - deallocate all in-register phi inputs.
  1219  			for i, v := range phis {
  1220  				if !s.values[v.ID].needReg {
  1221  					continue
  1222  				}
  1223  				a := v.Args[idx]
  1224  				r := phiRegs[i]
  1225  				if r == noRegister {
  1226  					continue
  1227  				}
  1228  				if regValLiveSet.contains(a.ID) {
  1229  					// Input value is still live (it is used by something other than Phi).
  1230  					// Try to move it around before kicking out, if there is a free register.
  1231  					// We generate a Copy in the predecessor block and record it. It will be
  1232  					// deleted later if never used.
  1233  					//
  1234  					// Pick a free register. At this point some registers used in the predecessor
  1235  					// block may have been deallocated. Those are the ones used for Phis. Exclude
  1236  					// them (and they are not going to be helpful anyway).
  1237  					m := s.compatRegs(a.Type) &^ s.used &^ phiUsed
  1238  					if m != 0 && !s.values[a.ID].rematerializeable && countRegs(s.values[a.ID].regs) == 1 {
  1239  						r2 := pickReg(m)
  1240  						c := p.NewValue1(a.Pos, OpCopy, a.Type, s.regs[r].c)
  1241  						s.copies[c] = false
  1242  						if s.f.pass.debug > regDebug {
  1243  							fmt.Printf("copy %s to %s : %s\n", a, c, &s.registers[r2])
  1244  						}
  1245  						s.setOrig(c, a)
  1246  						s.assignReg(r2, a, c)
  1247  						s.endRegs[p.ID] = append(s.endRegs[p.ID], endReg{r2, a, c})
  1248  					}
  1249  				}
  1250  				s.freeReg(r)
  1251  			}
  1252  
  1253  			// Copy phi ops into new schedule.
  1254  			b.Values = append(b.Values, phis...)
  1255  
  1256  			// Third pass - pick registers for phis whose input
  1257  			// was not in a register in the primary predecessor.
  1258  			for i, v := range phis {
  1259  				if !s.values[v.ID].needReg {
  1260  					continue
  1261  				}
  1262  				if phiRegs[i] != noRegister {
  1263  					continue
  1264  				}
  1265  				m := s.compatRegs(v.Type) &^ phiUsed &^ s.used
  1266  				// If one of the other inputs of v is in a register, and the register is available,
  1267  				// select this register, which can save some unnecessary copies.
  1268  				for i, pe := range b.Preds {
  1269  					if i == idx {
  1270  						continue
  1271  					}
  1272  					ri := noRegister
  1273  					for _, er := range s.endRegs[pe.b.ID] {
  1274  						if er.v == s.orig[v.Args[i].ID] {
  1275  							ri = er.r
  1276  							break
  1277  						}
  1278  					}
  1279  					if ri != noRegister && m>>ri&1 != 0 {
  1280  						m = regMask(1) << ri
  1281  						break
  1282  					}
  1283  				}
  1284  				if m != 0 {
  1285  					r := pickReg(m)
  1286  					phiRegs[i] = r
  1287  					phiUsed |= regMask(1) << r
  1288  				}
  1289  			}
  1290  
  1291  			// Set registers for phis. Add phi spill code.
  1292  			for i, v := range phis {
  1293  				if !s.values[v.ID].needReg {
  1294  					continue
  1295  				}
  1296  				r := phiRegs[i]
  1297  				if r == noRegister {
  1298  					// stack-based phi
  1299  					// Spills will be inserted in all the predecessors below.
  1300  					s.values[v.ID].spill = v // v starts life spilled
  1301  					continue
  1302  				}
  1303  				// register-based phi
  1304  				s.assignReg(r, v, v)
  1305  			}
  1306  
  1307  			// Deallocate any values which are no longer live. Phis are excluded.
  1308  			for r := register(0); r < s.numRegs; r++ {
  1309  				if phiUsed>>r&1 != 0 {
  1310  					continue
  1311  				}
  1312  				v := s.regs[r].v
  1313  				if v != nil && !regValLiveSet.contains(v.ID) {
  1314  					s.freeReg(r)
  1315  				}
  1316  			}
  1317  
  1318  			// Save the starting state for use by merge edges.
  1319  			// We append to a stack allocated variable that we'll
  1320  			// later copy into s.startRegs in one fell swoop, to save
  1321  			// on allocations.
  1322  			regList := make([]startReg, 0, 32)
  1323  			for r := register(0); r < s.numRegs; r++ {
  1324  				v := s.regs[r].v
  1325  				if v == nil {
  1326  					continue
  1327  				}
  1328  				if phiUsed>>r&1 != 0 {
  1329  					// Skip registers that phis used, we'll handle those
  1330  					// specially during merge edge processing.
  1331  					continue
  1332  				}
  1333  				regList = append(regList, startReg{r, v, s.regs[r].c, s.values[v.ID].uses.pos})
  1334  				s.startRegsMask |= regMask(1) << r
  1335  			}
  1336  			s.startRegs[b.ID] = make([]startReg, len(regList))
  1337  			copy(s.startRegs[b.ID], regList)
  1338  
  1339  			if s.f.pass.debug > regDebug {
  1340  				fmt.Printf("after phis\n")
  1341  				for _, x := range s.startRegs[b.ID] {
  1342  					fmt.Printf("  %s: v%d\n", &s.registers[x.r], x.v.ID)
  1343  				}
  1344  			}
  1345  		}
  1346  
  1347  		// Drop phis from registers if they immediately go dead.
  1348  		for i, v := range phis {
  1349  			s.curIdx = i
  1350  			s.dropIfUnused(v)
  1351  		}
  1352  
  1353  		// Allocate space to record the desired registers for each value.
  1354  		if l := len(oldSched); cap(dinfo) < l {
  1355  			dinfo = make([]dentry, l)
  1356  		} else {
  1357  			dinfo = dinfo[:l]
  1358  			clear(dinfo)
  1359  		}
  1360  
  1361  		// Load static desired register info at the end of the block.
  1362  		if s.desired != nil {
  1363  			desired.copy(&s.desired[b.ID])
  1364  		}
  1365  
  1366  		// Check actual assigned registers at the start of the next block(s).
  1367  		// Dynamically assigned registers will trump the static
  1368  		// desired registers computed during liveness analysis.
  1369  		// Note that we do this phase after startRegs is set above, so that
  1370  		// we get the right behavior for a block which branches to itself.
  1371  		for _, e := range b.Succs {
  1372  			succ := e.b
  1373  			// TODO: prioritize likely successor?
  1374  			for _, x := range s.startRegs[succ.ID] {
  1375  				desired.add(x.v.ID, x.r)
  1376  			}
  1377  			// Process phi ops in succ.
  1378  			pidx := e.i
  1379  			for _, v := range succ.Values {
  1380  				if v.Op != OpPhi {
  1381  					break
  1382  				}
  1383  				if !s.values[v.ID].needReg {
  1384  					continue
  1385  				}
  1386  				rp, ok := s.f.getHome(v.ID).(*Register)
  1387  				if !ok {
  1388  					// If v is not assigned a register, pick a register assigned to one of v's inputs.
  1389  					// Hopefully v will get assigned that register later.
  1390  					// If the inputs have allocated register information, add it to desired,
  1391  					// which may reduce spill or copy operations when the register is available.
  1392  					for _, a := range v.Args {
  1393  						rp, ok = s.f.getHome(a.ID).(*Register)
  1394  						if ok {
  1395  							break
  1396  						}
  1397  					}
  1398  					if !ok {
  1399  						continue
  1400  					}
  1401  				}
  1402  				desired.add(v.Args[pidx].ID, register(rp.num))
  1403  			}
  1404  		}
  1405  		// Walk values backwards computing desired register info.
  1406  		// See computeDesired for more comments.
  1407  		for i := len(oldSched) - 1; i >= 0; i-- {
  1408  			v := oldSched[i]
  1409  			prefs := desired.remove(v.ID)
  1410  			regspec := s.regspec(v)
  1411  			desired.clobber(regspec.clobbers)
  1412  			for _, j := range regspec.inputs {
  1413  				if countRegs(j.regs) != 1 {
  1414  					continue
  1415  				}
  1416  				desired.clobber(j.regs)
  1417  				desired.add(v.Args[j.idx].ID, pickReg(j.regs))
  1418  			}
  1419  			if opcodeTable[v.Op].resultInArg0 || v.Op == OpAMD64ADDQconst || v.Op == OpAMD64ADDLconst || v.Op == OpSelect0 {
  1420  				if opcodeTable[v.Op].commutative {
  1421  					desired.addList(v.Args[1].ID, prefs)
  1422  				}
  1423  				desired.addList(v.Args[0].ID, prefs)
  1424  			}
  1425  			// Save desired registers for this value.
  1426  			dinfo[i].out = prefs
  1427  			for j, a := range v.Args {
  1428  				if j >= len(dinfo[i].in) {
  1429  					break
  1430  				}
  1431  				dinfo[i].in[j] = desired.get(a.ID)
  1432  			}
  1433  			if v.Op == OpSelect1 && prefs[0] != noRegister {
  1434  				// Save desired registers of select1 for
  1435  				// use by the tuple generating instruction.
  1436  				desiredSecondReg[v.Args[0].ID] = prefs
  1437  			}
  1438  		}
  1439  
  1440  		// Process all the non-phi values.
  1441  		for idx, v := range oldSched {
  1442  			s.curIdx = nphi + idx
  1443  			tmpReg := noRegister
  1444  			if s.f.pass.debug > regDebug {
  1445  				fmt.Printf("  processing %s\n", v.LongString())
  1446  			}
  1447  			regspec := s.regspec(v)
  1448  			if v.Op == OpPhi {
  1449  				f.Fatalf("phi %s not at start of block", v)
  1450  			}
  1451  			if opcodeTable[v.Op].fixedReg {
  1452  				switch v.Op {
  1453  				case OpSP:
  1454  					s.assignReg(s.SPReg, v, v)
  1455  					s.sp = v.ID
  1456  				case OpSB:
  1457  					s.assignReg(s.SBReg, v, v)
  1458  					s.sb = v.ID
  1459  				case OpARM64ZERO, OpLOONG64ZERO, OpMIPS64ZERO:
  1460  					s.assignReg(s.ZeroIntReg, v, v)
  1461  				case OpAMD64Zero128, OpAMD64Zero256, OpAMD64Zero512:
  1462  					regspec := s.regspec(v)
  1463  					m := regspec.outputs[0].regs
  1464  					if countRegs(m) != 1 {
  1465  						f.Fatalf("bad fixed-register op %s", v)
  1466  					}
  1467  					s.assignReg(pickReg(m), v, v)
  1468  				default:
  1469  					f.Fatalf("unknown fixed-register op %s", v)
  1470  				}
  1471  				b.Values = append(b.Values, v)
  1472  				s.advanceUses(v)
  1473  				continue
  1474  			}
  1475  			if v.Op == OpSelect0 || v.Op == OpSelect1 || v.Op == OpSelectN {
  1476  				if s.values[v.ID].needReg {
  1477  					if v.Op == OpSelectN {
  1478  						s.assignReg(register(s.f.getHome(v.Args[0].ID).(LocResults)[int(v.AuxInt)].(*Register).num), v, v)
  1479  					} else {
  1480  						var i = 0
  1481  						if v.Op == OpSelect1 {
  1482  							i = 1
  1483  						}
  1484  						s.assignReg(register(s.f.getHome(v.Args[0].ID).(LocPair)[i].(*Register).num), v, v)
  1485  					}
  1486  				}
  1487  				b.Values = append(b.Values, v)
  1488  				s.advanceUses(v)
  1489  				continue
  1490  			}
  1491  			if v.Op == OpGetG && s.f.Config.hasGReg {
  1492  				// use hardware g register
  1493  				if s.regs[s.GReg].v != nil {
  1494  					s.freeReg(s.GReg) // kick out the old value
  1495  				}
  1496  				s.assignReg(s.GReg, v, v)
  1497  				b.Values = append(b.Values, v)
  1498  				s.advanceUses(v)
  1499  				continue
  1500  			}
  1501  			if v.Op == OpArg {
  1502  				// Args are "pre-spilled" values. We don't allocate
  1503  				// any register here. We just set up the spill pointer to
  1504  				// point at itself and any later user will restore it to use it.
  1505  				s.values[v.ID].spill = v
  1506  				b.Values = append(b.Values, v)
  1507  				s.advanceUses(v)
  1508  				continue
  1509  			}
  1510  			if v.Op == OpKeepAlive {
  1511  				// Make sure the argument to v is still live here.
  1512  				s.advanceUses(v)
  1513  				a := v.Args[0]
  1514  				vi := &s.values[a.ID]
  1515  				if vi.regs == 0 && !vi.rematerializeable {
  1516  					// Use the spill location.
  1517  					// This forces later liveness analysis to make the
  1518  					// value live at this point.
  1519  					v.SetArg(0, s.makeSpill(a, b))
  1520  				} else if _, ok := a.Aux.(*ir.Name); ok && vi.rematerializeable {
  1521  					// Rematerializeable value with a *ir.Name. This is the address of
  1522  					// a stack object (e.g. an LEAQ). Keep the object live.
  1523  					// Change it to VarLive, which is what plive expects for locals.
  1524  					v.Op = OpVarLive
  1525  					v.SetArgs1(v.Args[1])
  1526  					v.Aux = a.Aux
  1527  				} else {
  1528  					// In-register and rematerializeable values are already live.
  1529  					// These are typically rematerializeable constants like nil,
  1530  					// or values of a variable that were modified since the last call.
  1531  					v.Op = OpCopy
  1532  					v.SetArgs1(v.Args[1])
  1533  				}
  1534  				b.Values = append(b.Values, v)
  1535  				continue
  1536  			}
  1537  			if len(regspec.inputs) == 0 && len(regspec.outputs) == 0 {
  1538  				// No register allocation required (or none specified yet)
  1539  				if s.doClobber && v.Op.IsCall() {
  1540  					s.clobberRegs(regspec.clobbers)
  1541  				}
  1542  				s.freeRegs(regspec.clobbers)
  1543  				b.Values = append(b.Values, v)
  1544  				s.advanceUses(v)
  1545  				continue
  1546  			}
  1547  
  1548  			if s.values[v.ID].rematerializeable {
  1549  				// Value is rematerializeable, don't issue it here.
  1550  				// It will get issued just before each use (see
  1551  				// allocValueToReg).
  1552  				for _, a := range v.Args {
  1553  					a.Uses--
  1554  				}
  1555  				s.advanceUses(v)
  1556  				continue
  1557  			}
  1558  
  1559  			if s.f.pass.debug > regDebug {
  1560  				fmt.Printf("value %s\n", v.LongString())
  1561  				fmt.Printf("  out:")
  1562  				for _, r := range dinfo[idx].out {
  1563  					if r != noRegister {
  1564  						fmt.Printf(" %s", &s.registers[r])
  1565  					}
  1566  				}
  1567  				fmt.Println()
  1568  				for i := 0; i < len(v.Args) && i < 3; i++ {
  1569  					fmt.Printf("  in%d:", i)
  1570  					for _, r := range dinfo[idx].in[i] {
  1571  						if r != noRegister {
  1572  							fmt.Printf(" %s", &s.registers[r])
  1573  						}
  1574  					}
  1575  					fmt.Println()
  1576  				}
  1577  			}
  1578  
  1579  			// Move arguments to registers.
  1580  			// First, if an arg must be in a specific register and it is already
  1581  			// in place, keep it.
  1582  			args = append(args[:0], make([]*Value, len(v.Args))...)
  1583  			for i, a := range v.Args {
  1584  				if !s.values[a.ID].needReg {
  1585  					args[i] = a
  1586  				}
  1587  			}
  1588  			for _, i := range regspec.inputs {
  1589  				mask := i.regs
  1590  				if countRegs(mask) == 1 && mask&s.values[v.Args[i.idx].ID].regs != 0 {
  1591  					args[i.idx] = s.allocValToReg(v.Args[i.idx], mask, true, v.Pos)
  1592  				}
  1593  			}
  1594  			// Then, if an arg must be in a specific register and that
  1595  			// register is free, allocate that one. Otherwise when processing
  1596  			// another input we may kick a value into the free register, which
  1597  			// then will be kicked out again.
  1598  			// This is a common case for passing-in-register arguments for
  1599  			// function calls.
  1600  			for {
  1601  				freed := false
  1602  				for _, i := range regspec.inputs {
  1603  					if args[i.idx] != nil {
  1604  						continue // already allocated
  1605  					}
  1606  					mask := i.regs
  1607  					if countRegs(mask) == 1 && mask&^s.used != 0 {
  1608  						args[i.idx] = s.allocValToReg(v.Args[i.idx], mask, true, v.Pos)
  1609  						// If the input is in other registers that will be clobbered by v,
  1610  						// or the input is dead, free the registers. This may make room
  1611  						// for other inputs.
  1612  						oldregs := s.values[v.Args[i.idx].ID].regs
  1613  						if oldregs&^regspec.clobbers == 0 || !s.liveAfterCurrentInstruction(v.Args[i.idx]) {
  1614  							s.freeRegs(oldregs &^ mask &^ s.nospill)
  1615  							freed = true
  1616  						}
  1617  					}
  1618  				}
  1619  				if !freed {
  1620  					break
  1621  				}
  1622  			}
  1623  			// Last, allocate remaining ones, in an ordering defined
  1624  			// by the register specification (most constrained first).
  1625  			for _, i := range regspec.inputs {
  1626  				if args[i.idx] != nil {
  1627  					continue // already allocated
  1628  				}
  1629  				mask := i.regs
  1630  				if mask&s.values[v.Args[i.idx].ID].regs == 0 {
  1631  					// Need a new register for the input.
  1632  					mask &= s.allocatable
  1633  					mask &^= s.nospill
  1634  					// Used desired register if available.
  1635  					if i.idx < 3 {
  1636  						for _, r := range dinfo[idx].in[i.idx] {
  1637  							if r != noRegister && (mask&^s.used)>>r&1 != 0 {
  1638  								// Desired register is allowed and unused.
  1639  								mask = regMask(1) << r
  1640  								break
  1641  							}
  1642  						}
  1643  					}
  1644  					// Avoid registers we're saving for other values.
  1645  					if mask&^desired.avoid != 0 {
  1646  						mask &^= desired.avoid
  1647  					}
  1648  				}
  1649  				if mask&s.values[v.Args[i.idx].ID].regs&(1<<s.SPReg) != 0 {
  1650  					// Prefer SP register. This ensures that local variables
  1651  					// use SP as their base register (instead of a copy of the
  1652  					// stack pointer living in another register). See issue 74836.
  1653  					mask = 1 << s.SPReg
  1654  				}
  1655  				args[i.idx] = s.allocValToReg(v.Args[i.idx], mask, true, v.Pos)
  1656  			}
  1657  
  1658  			// If the output clobbers the input register, make sure we have
  1659  			// at least two copies of the input register so we don't
  1660  			// have to reload the value from the spill location.
  1661  			if opcodeTable[v.Op].resultInArg0 {
  1662  				var m regMask
  1663  				if !s.liveAfterCurrentInstruction(v.Args[0]) {
  1664  					// arg0 is dead.  We can clobber its register.
  1665  					goto ok
  1666  				}
  1667  				if opcodeTable[v.Op].commutative && !s.liveAfterCurrentInstruction(v.Args[1]) {
  1668  					args[0], args[1] = args[1], args[0]
  1669  					goto ok
  1670  				}
  1671  				if s.values[v.Args[0].ID].rematerializeable {
  1672  					// We can rematerialize the input, don't worry about clobbering it.
  1673  					goto ok
  1674  				}
  1675  				if opcodeTable[v.Op].commutative && s.values[v.Args[1].ID].rematerializeable {
  1676  					args[0], args[1] = args[1], args[0]
  1677  					goto ok
  1678  				}
  1679  				if countRegs(s.values[v.Args[0].ID].regs) >= 2 {
  1680  					// we have at least 2 copies of arg0.  We can afford to clobber one.
  1681  					goto ok
  1682  				}
  1683  				if opcodeTable[v.Op].commutative && countRegs(s.values[v.Args[1].ID].regs) >= 2 {
  1684  					args[0], args[1] = args[1], args[0]
  1685  					goto ok
  1686  				}
  1687  
  1688  				// We can't overwrite arg0 (or arg1, if commutative).  So we
  1689  				// need to make a copy of an input so we have a register we can modify.
  1690  
  1691  				// Possible new registers to copy into.
  1692  				m = s.compatRegs(v.Args[0].Type) &^ s.used
  1693  				if m == 0 {
  1694  					// No free registers.  In this case we'll just clobber
  1695  					// an input and future uses of that input must use a restore.
  1696  					// TODO(khr): We should really do this like allocReg does it,
  1697  					// spilling the value with the most distant next use.
  1698  					goto ok
  1699  				}
  1700  
  1701  				// Try to move an input to the desired output, if allowed.
  1702  				for _, r := range dinfo[idx].out {
  1703  					if r != noRegister && (m&regspec.outputs[0].regs)>>r&1 != 0 {
  1704  						m = regMask(1) << r
  1705  						args[0] = s.allocValToReg(v.Args[0], m, true, v.Pos)
  1706  						// Note: we update args[0] so the instruction will
  1707  						// use the register copy we just made.
  1708  						goto ok
  1709  					}
  1710  				}
  1711  				// Try to copy input to its desired location & use its old
  1712  				// location as the result register.
  1713  				for _, r := range dinfo[idx].in[0] {
  1714  					if r != noRegister && m>>r&1 != 0 {
  1715  						m = regMask(1) << r
  1716  						c := s.allocValToReg(v.Args[0], m, true, v.Pos)
  1717  						s.copies[c] = false
  1718  						// Note: no update to args[0] so the instruction will
  1719  						// use the original copy.
  1720  						goto ok
  1721  					}
  1722  				}
  1723  				if opcodeTable[v.Op].commutative {
  1724  					for _, r := range dinfo[idx].in[1] {
  1725  						if r != noRegister && m>>r&1 != 0 {
  1726  							m = regMask(1) << r
  1727  							c := s.allocValToReg(v.Args[1], m, true, v.Pos)
  1728  							s.copies[c] = false
  1729  							args[0], args[1] = args[1], args[0]
  1730  							goto ok
  1731  						}
  1732  					}
  1733  				}
  1734  
  1735  				// Avoid future fixed uses if we can.
  1736  				if m&^desired.avoid != 0 {
  1737  					m &^= desired.avoid
  1738  				}
  1739  				// Save input 0 to a new register so we can clobber it.
  1740  				c := s.allocValToReg(v.Args[0], m, true, v.Pos)
  1741  				s.copies[c] = false
  1742  
  1743  				// Normally we use the register of the old copy of input 0 as the target.
  1744  				// However, if input 0 is already in its desired register then we use
  1745  				// the register of the new copy instead.
  1746  				if regspec.outputs[0].regs>>s.f.getHome(c.ID).(*Register).num&1 != 0 {
  1747  					if rp, ok := s.f.getHome(args[0].ID).(*Register); ok {
  1748  						r := register(rp.num)
  1749  						for _, r2 := range dinfo[idx].in[0] {
  1750  							if r == r2 {
  1751  								args[0] = c
  1752  								break
  1753  							}
  1754  						}
  1755  					}
  1756  				}
  1757  			}
  1758  		ok:
  1759  			for i := 0; i < 2; i++ {
  1760  				if !(i == 0 && regspec.clobbersArg0 || i == 1 && regspec.clobbersArg1) {
  1761  					continue
  1762  				}
  1763  				if !s.liveAfterCurrentInstruction(v.Args[i]) {
  1764  					// arg is dead.  We can clobber its register.
  1765  					continue
  1766  				}
  1767  				if s.values[v.Args[i].ID].rematerializeable {
  1768  					// We can rematerialize the input, don't worry about clobbering it.
  1769  					continue
  1770  				}
  1771  				if countRegs(s.values[v.Args[i].ID].regs) >= 2 {
  1772  					// We have at least 2 copies of arg.  We can afford to clobber one.
  1773  					continue
  1774  				}
  1775  				// Possible new registers to copy into.
  1776  				m := s.compatRegs(v.Args[i].Type) &^ s.used
  1777  				if m == 0 {
  1778  					// No free registers.  In this case we'll just clobber the
  1779  					// input and future uses of that input must use a restore.
  1780  					// TODO(khr): We should really do this like allocReg does it,
  1781  					// spilling the value with the most distant next use.
  1782  					continue
  1783  				}
  1784  				// Copy input to a different register that won't be clobbered.
  1785  				c := s.allocValToReg(v.Args[i], m, true, v.Pos)
  1786  				s.copies[c] = false
  1787  			}
  1788  
  1789  			// Pick a temporary register if needed.
  1790  			// It should be distinct from all the input registers, so we
  1791  			// allocate it after all the input registers, but before
  1792  			// the input registers are freed via advanceUses below.
  1793  			// (Not all instructions need that distinct part, but it is conservative.)
  1794  			// We also ensure it is not any of the single-choice output registers.
  1795  			if opcodeTable[v.Op].needIntTemp {
  1796  				m := s.allocatable & s.f.Config.gpRegMask
  1797  				for _, out := range regspec.outputs {
  1798  					if countRegs(out.regs) == 1 {
  1799  						m &^= out.regs
  1800  					}
  1801  				}
  1802  				if m&^desired.avoid&^s.nospill != 0 {
  1803  					m &^= desired.avoid
  1804  				}
  1805  				tmpReg = s.allocReg(m, &tmpVal)
  1806  				s.nospill |= regMask(1) << tmpReg
  1807  				s.tmpused |= regMask(1) << tmpReg
  1808  			}
  1809  
  1810  			if regspec.clobbersArg0 {
  1811  				s.freeReg(register(s.f.getHome(args[0].ID).(*Register).num))
  1812  			}
  1813  			if regspec.clobbersArg1 && !(regspec.clobbersArg0 && s.f.getHome(args[0].ID) == s.f.getHome(args[1].ID)) {
  1814  				s.freeReg(register(s.f.getHome(args[1].ID).(*Register).num))
  1815  			}
  1816  
  1817  			// Now that all args are in regs, we're ready to issue the value itself.
  1818  			// Before we pick a register for the output value, allow input registers
  1819  			// to be deallocated. We do this here so that the output can use the
  1820  			// same register as a dying input.
  1821  			if !opcodeTable[v.Op].resultNotInArgs {
  1822  				s.tmpused = s.nospill
  1823  				s.nospill = 0
  1824  				s.advanceUses(v) // frees any registers holding args that are no longer live
  1825  			}
  1826  
  1827  			// Dump any registers which will be clobbered
  1828  			if s.doClobber && v.Op.IsCall() {
  1829  				// clobber registers that are marked as clobber in regmask, but
  1830  				// don't clobber inputs.
  1831  				s.clobberRegs(regspec.clobbers &^ s.tmpused &^ s.nospill)
  1832  			}
  1833  			s.freeRegs(regspec.clobbers)
  1834  			s.tmpused |= regspec.clobbers
  1835  
  1836  			// Pick registers for outputs.
  1837  			{
  1838  				outRegs := noRegisters // TODO if this is costly, hoist and clear incrementally below.
  1839  				maxOutIdx := -1
  1840  				var used regMask
  1841  				if tmpReg != noRegister {
  1842  					// Ensure output registers are distinct from the temporary register.
  1843  					// (Not all instructions need that distinct part, but it is conservative.)
  1844  					used |= regMask(1) << tmpReg
  1845  				}
  1846  				for _, out := range regspec.outputs {
  1847  					if out.regs == 0 {
  1848  						continue
  1849  					}
  1850  					mask := out.regs & s.allocatable &^ used
  1851  					if mask == 0 {
  1852  						s.f.Fatalf("can't find any output register %s", v.LongString())
  1853  					}
  1854  					if opcodeTable[v.Op].resultInArg0 && out.idx == 0 {
  1855  						if !opcodeTable[v.Op].commutative {
  1856  							// Output must use the same register as input 0.
  1857  							r := register(s.f.getHome(args[0].ID).(*Register).num)
  1858  							if mask>>r&1 == 0 {
  1859  								s.f.Fatalf("resultInArg0 value's input %v cannot be an output of %s", s.f.getHome(args[0].ID).(*Register), v.LongString())
  1860  							}
  1861  							mask = regMask(1) << r
  1862  						} else {
  1863  							// Output must use the same register as input 0 or 1.
  1864  							r0 := register(s.f.getHome(args[0].ID).(*Register).num)
  1865  							r1 := register(s.f.getHome(args[1].ID).(*Register).num)
  1866  							// Check r0 and r1 for desired output register.
  1867  							found := false
  1868  							for _, r := range dinfo[idx].out {
  1869  								if (r == r0 || r == r1) && (mask&^s.used)>>r&1 != 0 {
  1870  									mask = regMask(1) << r
  1871  									found = true
  1872  									if r == r1 {
  1873  										args[0], args[1] = args[1], args[0]
  1874  									}
  1875  									break
  1876  								}
  1877  							}
  1878  							if !found {
  1879  								// Neither are desired, pick r0.
  1880  								mask = regMask(1) << r0
  1881  							}
  1882  						}
  1883  					}
  1884  					if out.idx == 0 { // desired registers only apply to the first element of a tuple result
  1885  						for _, r := range dinfo[idx].out {
  1886  							if r != noRegister && (mask&^s.used)>>r&1 != 0 {
  1887  								// Desired register is allowed and unused.
  1888  								mask = regMask(1) << r
  1889  								break
  1890  							}
  1891  						}
  1892  					}
  1893  					if out.idx == 1 {
  1894  						if prefs, ok := desiredSecondReg[v.ID]; ok {
  1895  							for _, r := range prefs {
  1896  								if r != noRegister && (mask&^s.used)>>r&1 != 0 {
  1897  									// Desired register is allowed and unused.
  1898  									mask = regMask(1) << r
  1899  									break
  1900  								}
  1901  							}
  1902  						}
  1903  					}
  1904  					// Avoid registers we're saving for other values.
  1905  					if mask&^desired.avoid&^s.nospill&^s.used != 0 {
  1906  						mask &^= desired.avoid
  1907  					}
  1908  					r := s.allocReg(mask, v)
  1909  					if out.idx > maxOutIdx {
  1910  						maxOutIdx = out.idx
  1911  					}
  1912  					outRegs[out.idx] = r
  1913  					used |= regMask(1) << r
  1914  					s.tmpused |= regMask(1) << r
  1915  				}
  1916  				// Record register choices
  1917  				if v.Type.IsTuple() {
  1918  					var outLocs LocPair
  1919  					if r := outRegs[0]; r != noRegister {
  1920  						outLocs[0] = &s.registers[r]
  1921  					}
  1922  					if r := outRegs[1]; r != noRegister {
  1923  						outLocs[1] = &s.registers[r]
  1924  					}
  1925  					s.f.setHome(v, outLocs)
  1926  					// Note that subsequent SelectX instructions will do the assignReg calls.
  1927  				} else if v.Type.IsResults() {
  1928  					// preallocate outLocs to the right size, which is maxOutIdx+1
  1929  					outLocs := make(LocResults, maxOutIdx+1, maxOutIdx+1)
  1930  					for i := 0; i <= maxOutIdx; i++ {
  1931  						if r := outRegs[i]; r != noRegister {
  1932  							outLocs[i] = &s.registers[r]
  1933  						}
  1934  					}
  1935  					s.f.setHome(v, outLocs)
  1936  				} else {
  1937  					if r := outRegs[0]; r != noRegister {
  1938  						s.assignReg(r, v, v)
  1939  					}
  1940  				}
  1941  				if tmpReg != noRegister {
  1942  					// Remember the temp register allocation, if any.
  1943  					if s.f.tempRegs == nil {
  1944  						s.f.tempRegs = map[ID]*Register{}
  1945  					}
  1946  					s.f.tempRegs[v.ID] = &s.registers[tmpReg]
  1947  				}
  1948  			}
  1949  
  1950  			// deallocate dead args, if we have not done so
  1951  			if opcodeTable[v.Op].resultNotInArgs {
  1952  				s.nospill = 0
  1953  				s.advanceUses(v) // frees any registers holding args that are no longer live
  1954  			}
  1955  			s.tmpused = 0
  1956  
  1957  			// Issue the Value itself.
  1958  			for i, a := range args {
  1959  				v.SetArg(i, a) // use register version of arguments
  1960  			}
  1961  			b.Values = append(b.Values, v)
  1962  			s.dropIfUnused(v)
  1963  		}
  1964  
  1965  		// Copy the control values - we need this so we can reduce the
  1966  		// uses property of these values later.
  1967  		controls := append(make([]*Value, 0, 2), b.ControlValues()...)
  1968  
  1969  		// Load control values into registers.
  1970  		for i, v := range b.ControlValues() {
  1971  			if !s.values[v.ID].needReg {
  1972  				continue
  1973  			}
  1974  			if s.f.pass.debug > regDebug {
  1975  				fmt.Printf("  processing control %s\n", v.LongString())
  1976  			}
  1977  			// We assume that a control input can be passed in any
  1978  			// type-compatible register. If this turns out not to be true,
  1979  			// we'll need to introduce a regspec for a block's control value.
  1980  			b.ReplaceControl(i, s.allocValToReg(v, s.compatRegs(v.Type), false, b.Pos))
  1981  		}
  1982  
  1983  		// Reduce the uses of the control values once registers have been loaded.
  1984  		// This loop is equivalent to the advanceUses method.
  1985  		for _, v := range controls {
  1986  			vi := &s.values[v.ID]
  1987  			if !vi.needReg {
  1988  				continue
  1989  			}
  1990  			// Remove this use from the uses list.
  1991  			u := vi.uses
  1992  			vi.uses = u.next
  1993  			if u.next == nil {
  1994  				s.freeRegs(vi.regs) // value is dead
  1995  			}
  1996  			u.next = s.freeUseRecords
  1997  			s.freeUseRecords = u
  1998  		}
  1999  
  2000  		// If we are approaching a merge point and we are the primary
  2001  		// predecessor of it, find live values that we use soon after
  2002  		// the merge point and promote them to registers now.
  2003  		if len(b.Succs) == 1 {
  2004  			if s.f.Config.hasGReg && s.regs[s.GReg].v != nil {
  2005  				s.freeReg(s.GReg) // Spill value in G register before any merge.
  2006  			}
  2007  			if s.blockOrder[b.ID] > s.blockOrder[b.Succs[0].b.ID] {
  2008  				// No point if we've already regalloc'd the destination.
  2009  				goto badloop
  2010  			}
  2011  			// For this to be worthwhile, the loop must have no calls in it.
  2012  			top := b.Succs[0].b
  2013  			loop := s.loopnest.b2l[top.ID]
  2014  			if loop == nil || loop.header != top || loop.containsUnavoidableCall {
  2015  				goto badloop
  2016  			}
  2017  
  2018  			// Look into target block, find Phi arguments that come from b.
  2019  			phiArgs := regValLiveSet // reuse this space
  2020  			phiArgs.clear()
  2021  			for _, v := range b.Succs[0].b.Values {
  2022  				if v.Op == OpPhi {
  2023  					phiArgs.add(v.Args[b.Succs[0].i].ID)
  2024  				}
  2025  			}
  2026  
  2027  			// Get mask of all registers that might be used soon in the destination.
  2028  			// We don't want to kick values out of these registers, but we will
  2029  			// kick out an unlikely-to-be-used value for a likely-to-be-used one.
  2030  			var likelyUsedRegs regMask
  2031  			for _, live := range s.live[b.ID] {
  2032  				if live.dist < unlikelyDistance {
  2033  					likelyUsedRegs |= s.values[live.ID].regs
  2034  				}
  2035  			}
  2036  			// Promote values we're going to use soon in the destination to registers.
  2037  			// Note that this iterates nearest-use first, as we sorted
  2038  			// live lists by distance in computeLive.
  2039  			for _, live := range s.live[b.ID] {
  2040  				if live.dist >= unlikelyDistance {
  2041  					// Don't preload anything live after the loop.
  2042  					continue
  2043  				}
  2044  				vid := live.ID
  2045  				vi := &s.values[vid]
  2046  				v := s.orig[vid]
  2047  				if phiArgs.contains(vid) {
  2048  					// A phi argument needs its value in a regular register,
  2049  					// as returned by compatRegs. Being in a fixed register
  2050  					// (e.g. the zero register) or being easily
  2051  					// rematerializeable isn't enough.
  2052  					if vi.regs&s.compatRegs(v.Type) != 0 {
  2053  						continue
  2054  					}
  2055  				} else {
  2056  					if vi.regs != 0 {
  2057  						continue
  2058  					}
  2059  					if vi.rematerializeable {
  2060  						// TODO: maybe we should not skip rematerializeable
  2061  						// values here. One rematerialization outside the loop
  2062  						// is better than N in the loop. But rematerializations
  2063  						// are cheap, and spilling another value may not be.
  2064  						// And we don't want to materialize the zero register
  2065  						// into a different register when it is just the
  2066  						// argument to a store.
  2067  						continue
  2068  					}
  2069  				}
  2070  				if vi.rematerializeable && s.f.Config.ctxt.Arch.Arch == sys.ArchWasm {
  2071  					continue
  2072  				}
  2073  				// Registers we could load v into.
  2074  				// Don't kick out other likely-used values.
  2075  				m := s.compatRegs(v.Type) &^ likelyUsedRegs
  2076  				if m == 0 {
  2077  					// To many likely-used values to give them all a register.
  2078  					continue
  2079  				}
  2080  
  2081  				// Used desired register if available.
  2082  			outerloop:
  2083  				for _, e := range desired.entries {
  2084  					if e.ID != v.ID {
  2085  						continue
  2086  					}
  2087  					for _, r := range e.regs {
  2088  						if r != noRegister && m>>r&1 != 0 {
  2089  							m = regMask(1) << r
  2090  							break outerloop
  2091  						}
  2092  					}
  2093  				}
  2094  				if m&^desired.avoid != 0 {
  2095  					m &^= desired.avoid
  2096  				}
  2097  				s.allocValToReg(v, m, false, b.Pos)
  2098  				likelyUsedRegs |= s.values[v.ID].regs
  2099  			}
  2100  		}
  2101  	badloop:
  2102  		;
  2103  
  2104  		// Save end-of-block register state.
  2105  		// First count how many, this cuts allocations in half.
  2106  		k := 0
  2107  		for r := register(0); r < s.numRegs; r++ {
  2108  			v := s.regs[r].v
  2109  			if v == nil {
  2110  				continue
  2111  			}
  2112  			k++
  2113  		}
  2114  		regList := make([]endReg, 0, k)
  2115  		for r := register(0); r < s.numRegs; r++ {
  2116  			v := s.regs[r].v
  2117  			if v == nil {
  2118  				continue
  2119  			}
  2120  			regList = append(regList, endReg{r, v, s.regs[r].c})
  2121  		}
  2122  		s.endRegs[b.ID] = regList
  2123  
  2124  		if checkEnabled {
  2125  			regValLiveSet.clear()
  2126  			if s.live != nil {
  2127  				for _, x := range s.live[b.ID] {
  2128  					regValLiveSet.add(x.ID)
  2129  				}
  2130  			}
  2131  			for r := register(0); r < s.numRegs; r++ {
  2132  				v := s.regs[r].v
  2133  				if v == nil {
  2134  					continue
  2135  				}
  2136  				if !regValLiveSet.contains(v.ID) {
  2137  					s.f.Fatalf("val %s is in reg but not live at end of %s", v, b)
  2138  				}
  2139  			}
  2140  		}
  2141  
  2142  		// If a value is live at the end of the block and
  2143  		// isn't in a register, generate a use for the spill location.
  2144  		// We need to remember this information so that
  2145  		// the liveness analysis in stackalloc is correct.
  2146  		if s.live != nil {
  2147  			for _, e := range s.live[b.ID] {
  2148  				vi := &s.values[e.ID]
  2149  				if vi.regs != 0 {
  2150  					// in a register, we'll use that source for the merge.
  2151  					continue
  2152  				}
  2153  				if vi.rematerializeable {
  2154  					// we'll rematerialize during the merge.
  2155  					continue
  2156  				}
  2157  				if s.f.pass.debug > regDebug {
  2158  					fmt.Printf("live-at-end spill for %s at %s\n", s.orig[e.ID], b)
  2159  				}
  2160  				spill := s.makeSpill(s.orig[e.ID], b)
  2161  				s.spillLive[b.ID] = append(s.spillLive[b.ID], spill.ID)
  2162  			}
  2163  
  2164  			// Clear any final uses.
  2165  			// All that is left should be the pseudo-uses added for values which
  2166  			// are live at the end of b.
  2167  			for _, e := range s.live[b.ID] {
  2168  				u := s.values[e.ID].uses
  2169  				if u == nil {
  2170  					f.Fatalf("live at end, no uses v%d", e.ID)
  2171  				}
  2172  				if u.next != nil {
  2173  					f.Fatalf("live at end, too many uses v%d", e.ID)
  2174  				}
  2175  				s.values[e.ID].uses = nil
  2176  				u.next = s.freeUseRecords
  2177  				s.freeUseRecords = u
  2178  			}
  2179  		}
  2180  
  2181  		// allocReg may have dropped registers from startRegsMask that
  2182  		// aren't actually needed in startRegs. Synchronize back to
  2183  		// startRegs.
  2184  		//
  2185  		// This must be done before placing spills, which will look at
  2186  		// startRegs to decide if a block is a valid block for a spill.
  2187  		if c := countRegs(s.startRegsMask); c != len(s.startRegs[b.ID]) {
  2188  			regs := make([]startReg, 0, c)
  2189  			for _, sr := range s.startRegs[b.ID] {
  2190  				if s.startRegsMask&(regMask(1)<<sr.r) == 0 {
  2191  					continue
  2192  				}
  2193  				regs = append(regs, sr)
  2194  			}
  2195  			s.startRegs[b.ID] = regs
  2196  		}
  2197  	}
  2198  
  2199  	// Decide where the spills we generated will go.
  2200  	s.placeSpills()
  2201  
  2202  	// Anything that didn't get a register gets a stack location here.
  2203  	// (StoreReg, stack-based phis, inputs, ...)
  2204  	stacklive := stackalloc(s.f, s.spillLive)
  2205  
  2206  	// Fix up all merge edges.
  2207  	s.shuffle(stacklive)
  2208  
  2209  	// Erase any copies we never used.
  2210  	// Also, an unused copy might be the only use of another copy,
  2211  	// so continue erasing until we reach a fixed point.
  2212  	for {
  2213  		progress := false
  2214  		for c, used := range s.copies {
  2215  			if !used && c.Uses == 0 {
  2216  				if s.f.pass.debug > regDebug {
  2217  					fmt.Printf("delete copied value %s\n", c.LongString())
  2218  				}
  2219  				c.resetArgs()
  2220  				f.freeValue(c)
  2221  				delete(s.copies, c)
  2222  				progress = true
  2223  			}
  2224  		}
  2225  		if !progress {
  2226  			break
  2227  		}
  2228  	}
  2229  
  2230  	for _, b := range s.visitOrder {
  2231  		i := 0
  2232  		for _, v := range b.Values {
  2233  			if v.Op == OpInvalid {
  2234  				continue
  2235  			}
  2236  			b.Values[i] = v
  2237  			i++
  2238  		}
  2239  		b.Values = b.Values[:i]
  2240  	}
  2241  }
  2242  
  2243  func (s *regAllocState) placeSpills() {
  2244  	mustBeFirst := func(op Op) bool {
  2245  		return op.isLoweredGetClosurePtr() || op == OpPhi || op == OpArgIntReg || op == OpArgFloatReg
  2246  	}
  2247  
  2248  	// Start maps block IDs to the list of spills
  2249  	// that go at the start of the block (but after any phis).
  2250  	start := map[ID][]*Value{}
  2251  	// After maps value IDs to the list of spills
  2252  	// that go immediately after that value ID.
  2253  	after := map[ID][]*Value{}
  2254  
  2255  	for i := range s.values {
  2256  		vi := s.values[i]
  2257  		spill := vi.spill
  2258  		if spill == nil {
  2259  			continue
  2260  		}
  2261  		if spill.Block != nil {
  2262  			// Some spills are already fully set up,
  2263  			// like OpArgs and stack-based phis.
  2264  			continue
  2265  		}
  2266  		v := s.orig[i]
  2267  
  2268  		// Walk down the dominator tree looking for a good place to
  2269  		// put the spill of v.  At the start "best" is the best place
  2270  		// we have found so far.
  2271  		// TODO: find a way to make this O(1) without arbitrary cutoffs.
  2272  		if v == nil {
  2273  			panic(fmt.Errorf("nil v, s.orig[%d], vi = %v, spill = %s", i, vi, spill.LongString()))
  2274  		}
  2275  		best := v.Block
  2276  		bestArg := v
  2277  		var bestDepth int16
  2278  		if s.loopnest != nil && s.loopnest.b2l[best.ID] != nil {
  2279  			bestDepth = s.loopnest.b2l[best.ID].depth
  2280  		}
  2281  		b := best
  2282  		const maxSpillSearch = 100
  2283  		for i := 0; i < maxSpillSearch; i++ {
  2284  			// Find the child of b in the dominator tree which
  2285  			// dominates all restores.
  2286  			p := b
  2287  			b = nil
  2288  			for c := s.sdom.Child(p); c != nil && i < maxSpillSearch; c, i = s.sdom.Sibling(c), i+1 {
  2289  				if s.sdom[c.ID].entry <= vi.restoreMin && s.sdom[c.ID].exit >= vi.restoreMax {
  2290  					// c also dominates all restores.  Walk down into c.
  2291  					b = c
  2292  					break
  2293  				}
  2294  			}
  2295  			if b == nil {
  2296  				// Ran out of blocks which dominate all restores.
  2297  				break
  2298  			}
  2299  
  2300  			var depth int16
  2301  			if s.loopnest != nil && s.loopnest.b2l[b.ID] != nil {
  2302  				depth = s.loopnest.b2l[b.ID].depth
  2303  			}
  2304  			if depth > bestDepth {
  2305  				// Don't push the spill into a deeper loop.
  2306  				continue
  2307  			}
  2308  
  2309  			// If v is in a register at the start of b, we can
  2310  			// place the spill here (after the phis).
  2311  			if len(b.Preds) == 1 {
  2312  				for _, e := range s.endRegs[b.Preds[0].b.ID] {
  2313  					if e.v == v {
  2314  						// Found a better spot for the spill.
  2315  						best = b
  2316  						bestArg = e.c
  2317  						bestDepth = depth
  2318  						break
  2319  					}
  2320  				}
  2321  			} else {
  2322  				for _, e := range s.startRegs[b.ID] {
  2323  					if e.v == v {
  2324  						// Found a better spot for the spill.
  2325  						best = b
  2326  						bestArg = e.c
  2327  						bestDepth = depth
  2328  						break
  2329  					}
  2330  				}
  2331  			}
  2332  		}
  2333  
  2334  		// Put the spill in the best block we found.
  2335  		spill.Block = best
  2336  		spill.AddArg(bestArg)
  2337  		if best == v.Block && !mustBeFirst(v.Op) {
  2338  			// Place immediately after v.
  2339  			after[v.ID] = append(after[v.ID], spill)
  2340  		} else {
  2341  			// Place at the start of best block.
  2342  			start[best.ID] = append(start[best.ID], spill)
  2343  		}
  2344  	}
  2345  
  2346  	// Insert spill instructions into the block schedules.
  2347  	var oldSched []*Value
  2348  	for _, b := range s.visitOrder {
  2349  		nfirst := 0
  2350  		for _, v := range b.Values {
  2351  			if !mustBeFirst(v.Op) {
  2352  				break
  2353  			}
  2354  			nfirst++
  2355  		}
  2356  		oldSched = append(oldSched[:0], b.Values[nfirst:]...)
  2357  		b.Values = b.Values[:nfirst]
  2358  		b.Values = append(b.Values, start[b.ID]...)
  2359  		for _, v := range oldSched {
  2360  			b.Values = append(b.Values, v)
  2361  			b.Values = append(b.Values, after[v.ID]...)
  2362  		}
  2363  	}
  2364  }
  2365  
  2366  // shuffle fixes up all the merge edges (those going into blocks of indegree > 1).
  2367  func (s *regAllocState) shuffle(stacklive [][]ID) {
  2368  	var e edgeState
  2369  	e.s = s
  2370  	e.cache = map[ID][]*Value{}
  2371  	e.contents = map[Location]contentRecord{}
  2372  	if s.f.pass.debug > regDebug {
  2373  		fmt.Printf("shuffle %s\n", s.f.Name)
  2374  		fmt.Println(s.f.String())
  2375  	}
  2376  
  2377  	for _, b := range s.visitOrder {
  2378  		if len(b.Preds) <= 1 {
  2379  			continue
  2380  		}
  2381  		e.b = b
  2382  		for i, edge := range b.Preds {
  2383  			p := edge.b
  2384  			e.p = p
  2385  			e.setup(i, s.endRegs[p.ID], s.startRegs[b.ID], stacklive[p.ID])
  2386  			e.process()
  2387  		}
  2388  	}
  2389  
  2390  	if s.f.pass.debug > regDebug {
  2391  		fmt.Printf("post shuffle %s\n", s.f.Name)
  2392  		fmt.Println(s.f.String())
  2393  	}
  2394  }
  2395  
  2396  type edgeState struct {
  2397  	s    *regAllocState
  2398  	p, b *Block // edge goes from p->b.
  2399  
  2400  	// for each pre-regalloc value, a list of equivalent cached values
  2401  	cache      map[ID][]*Value
  2402  	cachedVals []ID // (superset of) keys of the above map, for deterministic iteration
  2403  
  2404  	// map from location to the value it contains
  2405  	contents map[Location]contentRecord
  2406  
  2407  	// desired destination locations
  2408  	destinations []dstRecord
  2409  	extra        []dstRecord
  2410  
  2411  	usedRegs              regMask // registers currently holding something
  2412  	uniqueRegs            regMask // registers holding the only copy of a value
  2413  	finalRegs             regMask // registers holding final target
  2414  	rematerializeableRegs regMask // registers that hold rematerializeable values
  2415  }
  2416  
  2417  type contentRecord struct {
  2418  	vid   ID       // pre-regalloc value
  2419  	c     *Value   // cached value
  2420  	final bool     // this is a satisfied destination
  2421  	pos   src.XPos // source position of use of the value
  2422  }
  2423  
  2424  type dstRecord struct {
  2425  	loc    Location // register or stack slot
  2426  	vid    ID       // pre-regalloc value it should contain
  2427  	splice **Value  // place to store reference to the generating instruction
  2428  	pos    src.XPos // source position of use of this location
  2429  }
  2430  
  2431  // setup initializes the edge state for shuffling.
  2432  func (e *edgeState) setup(idx int, srcReg []endReg, dstReg []startReg, stacklive []ID) {
  2433  	if e.s.f.pass.debug > regDebug {
  2434  		fmt.Printf("edge %s->%s\n", e.p, e.b)
  2435  	}
  2436  
  2437  	// Clear state.
  2438  	clear(e.cache)
  2439  	e.cachedVals = e.cachedVals[:0]
  2440  	clear(e.contents)
  2441  	e.usedRegs = 0
  2442  	e.uniqueRegs = 0
  2443  	e.finalRegs = 0
  2444  	e.rematerializeableRegs = 0
  2445  
  2446  	// Live registers can be sources.
  2447  	for _, x := range srcReg {
  2448  		e.set(&e.s.registers[x.r], x.v.ID, x.c, false, src.NoXPos) // don't care the position of the source
  2449  	}
  2450  	// So can all of the spill locations.
  2451  	for _, spillID := range stacklive {
  2452  		v := e.s.orig[spillID]
  2453  		spill := e.s.values[v.ID].spill
  2454  		if !e.s.sdom.IsAncestorEq(spill.Block, e.p) {
  2455  			// Spills were placed that only dominate the uses found
  2456  			// during the first regalloc pass. The edge fixup code
  2457  			// can't use a spill location if the spill doesn't dominate
  2458  			// the edge.
  2459  			// We are guaranteed that if the spill doesn't dominate this edge,
  2460  			// then the value is available in a register (because we called
  2461  			// makeSpill for every value not in a register at the start
  2462  			// of an edge).
  2463  			continue
  2464  		}
  2465  		e.set(e.s.f.getHome(spillID), v.ID, spill, false, src.NoXPos) // don't care the position of the source
  2466  	}
  2467  
  2468  	// Figure out all the destinations we need.
  2469  	dsts := e.destinations[:0]
  2470  	for _, x := range dstReg {
  2471  		dsts = append(dsts, dstRecord{&e.s.registers[x.r], x.v.ID, nil, x.pos})
  2472  	}
  2473  	// Phis need their args to end up in a specific location.
  2474  	for _, v := range e.b.Values {
  2475  		if v.Op != OpPhi {
  2476  			break
  2477  		}
  2478  		loc := e.s.f.getHome(v.ID)
  2479  		if loc == nil {
  2480  			continue
  2481  		}
  2482  		dsts = append(dsts, dstRecord{loc, v.Args[idx].ID, &v.Args[idx], v.Pos})
  2483  	}
  2484  	e.destinations = dsts
  2485  
  2486  	if e.s.f.pass.debug > regDebug {
  2487  		for _, vid := range e.cachedVals {
  2488  			a := e.cache[vid]
  2489  			for _, c := range a {
  2490  				fmt.Printf("src %s: v%d cache=%s\n", e.s.f.getHome(c.ID), vid, c)
  2491  			}
  2492  		}
  2493  		for _, d := range e.destinations {
  2494  			fmt.Printf("dst %s: v%d\n", d.loc, d.vid)
  2495  		}
  2496  	}
  2497  }
  2498  
  2499  // process generates code to move all the values to the right destination locations.
  2500  func (e *edgeState) process() {
  2501  	dsts := e.destinations
  2502  
  2503  	// Process the destinations until they are all satisfied.
  2504  	for len(dsts) > 0 {
  2505  		i := 0
  2506  		for _, d := range dsts {
  2507  			if !e.processDest(d.loc, d.vid, d.splice, d.pos) {
  2508  				// Failed - save for next iteration.
  2509  				dsts[i] = d
  2510  				i++
  2511  			}
  2512  		}
  2513  		if i < len(dsts) {
  2514  			// Made some progress. Go around again.
  2515  			dsts = dsts[:i]
  2516  
  2517  			// Append any extras destinations we generated.
  2518  			dsts = append(dsts, e.extra...)
  2519  			e.extra = e.extra[:0]
  2520  			continue
  2521  		}
  2522  
  2523  		// We made no progress. That means that any
  2524  		// remaining unsatisfied moves are in simple cycles.
  2525  		// For example, A -> B -> C -> D -> A.
  2526  		//   A ----> B
  2527  		//   ^       |
  2528  		//   |       |
  2529  		//   |       v
  2530  		//   D <---- C
  2531  
  2532  		// To break the cycle, we pick an unused register, say R,
  2533  		// and put a copy of B there.
  2534  		//   A ----> B
  2535  		//   ^       |
  2536  		//   |       |
  2537  		//   |       v
  2538  		//   D <---- C <---- R=copyofB
  2539  		// When we resume the outer loop, the A->B move can now proceed,
  2540  		// and eventually the whole cycle completes.
  2541  
  2542  		// Copy any cycle location to a temp register. This duplicates
  2543  		// one of the cycle entries, allowing the just duplicated value
  2544  		// to be overwritten and the cycle to proceed.
  2545  		d := dsts[0]
  2546  		loc := d.loc
  2547  		vid := e.contents[loc].vid
  2548  		c := e.contents[loc].c
  2549  		r := e.findRegFor(c.Type)
  2550  		if e.s.f.pass.debug > regDebug {
  2551  			fmt.Printf("breaking cycle with v%d in %s:%s\n", vid, loc, c)
  2552  		}
  2553  		e.erase(r)
  2554  		pos := d.pos.WithNotStmt()
  2555  		if _, isReg := loc.(*Register); isReg {
  2556  			c = e.p.NewValue1(pos, OpCopy, c.Type, c)
  2557  		} else {
  2558  			c = e.p.NewValue1(pos, OpLoadReg, c.Type, c)
  2559  		}
  2560  		e.set(r, vid, c, false, pos)
  2561  		if c.Op == OpLoadReg && e.s.isGReg(register(r.(*Register).num)) {
  2562  			e.s.f.Fatalf("process.OpLoadReg targeting g: " + c.LongString())
  2563  		}
  2564  	}
  2565  }
  2566  
  2567  // processDest generates code to put value vid into location loc. Returns true
  2568  // if progress was made.
  2569  func (e *edgeState) processDest(loc Location, vid ID, splice **Value, pos src.XPos) bool {
  2570  	pos = pos.WithNotStmt()
  2571  	occupant := e.contents[loc]
  2572  	if occupant.vid == vid {
  2573  		// Value is already in the correct place.
  2574  		e.contents[loc] = contentRecord{vid, occupant.c, true, pos}
  2575  		if splice != nil {
  2576  			(*splice).Uses--
  2577  			*splice = occupant.c
  2578  			occupant.c.Uses++
  2579  		}
  2580  		// Note: if splice==nil then c will appear dead. This is
  2581  		// non-SSA formed code, so be careful after this pass not to run
  2582  		// deadcode elimination.
  2583  		if _, ok := e.s.copies[occupant.c]; ok {
  2584  			// The copy at occupant.c was used to avoid spill.
  2585  			e.s.copies[occupant.c] = true
  2586  		}
  2587  		return true
  2588  	}
  2589  
  2590  	// Check if we're allowed to clobber the destination location.
  2591  	if len(e.cache[occupant.vid]) == 1 && !e.s.values[occupant.vid].rematerializeable && !opcodeTable[e.s.orig[occupant.vid].Op].fixedReg {
  2592  		// We can't overwrite the last copy
  2593  		// of a value that needs to survive.
  2594  		return false
  2595  	}
  2596  
  2597  	// Copy from a source of v, register preferred.
  2598  	v := e.s.orig[vid]
  2599  	var c *Value
  2600  	var src Location
  2601  	if e.s.f.pass.debug > regDebug {
  2602  		fmt.Printf("moving v%d to %s\n", vid, loc)
  2603  		fmt.Printf("sources of v%d:", vid)
  2604  	}
  2605  	if opcodeTable[v.Op].fixedReg {
  2606  		c = v
  2607  		src = e.s.f.getHome(v.ID)
  2608  	} else {
  2609  		for _, w := range e.cache[vid] {
  2610  			h := e.s.f.getHome(w.ID)
  2611  			if e.s.f.pass.debug > regDebug {
  2612  				fmt.Printf(" %s:%s", h, w)
  2613  			}
  2614  			_, isreg := h.(*Register)
  2615  			if src == nil || isreg {
  2616  				c = w
  2617  				src = h
  2618  			}
  2619  		}
  2620  	}
  2621  	if e.s.f.pass.debug > regDebug {
  2622  		if src != nil {
  2623  			fmt.Printf(" [use %s]\n", src)
  2624  		} else {
  2625  			fmt.Printf(" [no source]\n")
  2626  		}
  2627  	}
  2628  	_, dstReg := loc.(*Register)
  2629  
  2630  	// Pre-clobber destination. This avoids the
  2631  	// following situation:
  2632  	//   - v is currently held in R0 and stacktmp0.
  2633  	//   - We want to copy stacktmp1 to stacktmp0.
  2634  	//   - We choose R0 as the temporary register.
  2635  	// During the copy, both R0 and stacktmp0 are
  2636  	// clobbered, losing both copies of v. Oops!
  2637  	// Erasing the destination early means R0 will not
  2638  	// be chosen as the temp register, as it will then
  2639  	// be the last copy of v.
  2640  	e.erase(loc)
  2641  	var x *Value
  2642  	if c == nil || e.s.values[vid].rematerializeable {
  2643  		if !e.s.values[vid].rematerializeable {
  2644  			e.s.f.Fatalf("can't find source for %s->%s: %s\n", e.p, e.b, v.LongString())
  2645  		}
  2646  		if dstReg {
  2647  			// We want to rematerialize v into a register that is incompatible with v's op's register mask.
  2648  			// Instead of setting the wrong register for the rematerialized v, we should find the right register
  2649  			// for it and emit an additional copy to move to the desired register.
  2650  			// For #70451.
  2651  			if e.s.regspec(v).outputs[0].regs&regMask(1<<register(loc.(*Register).num)) == 0 {
  2652  				_, srcReg := src.(*Register)
  2653  				if srcReg {
  2654  					// It exists in a valid register already, so just copy it to the desired register
  2655  					// If src is a Register, c must have already been set.
  2656  					x = e.p.NewValue1(pos, OpCopy, c.Type, c)
  2657  				} else {
  2658  					// We need a tmp register
  2659  					x = v.copyInto(e.p)
  2660  					r := e.findRegFor(x.Type)
  2661  					e.erase(r)
  2662  					// Rematerialize to the tmp register
  2663  					e.set(r, vid, x, false, pos)
  2664  					// Copy from tmp to the desired register
  2665  					x = e.p.NewValue1(pos, OpCopy, x.Type, x)
  2666  				}
  2667  			} else {
  2668  				x = v.copyInto(e.p)
  2669  			}
  2670  		} else {
  2671  			// Rematerialize into stack slot. Need a free
  2672  			// register to accomplish this.
  2673  			r := e.findRegFor(v.Type)
  2674  			e.erase(r)
  2675  			x = v.copyIntoWithXPos(e.p, pos)
  2676  			e.set(r, vid, x, false, pos)
  2677  			// Make sure we spill with the size of the slot, not the
  2678  			// size of x (which might be wider due to our dropping
  2679  			// of narrowing conversions).
  2680  			x = e.p.NewValue1(pos, OpStoreReg, loc.(LocalSlot).Type, x)
  2681  		}
  2682  	} else {
  2683  		// Emit move from src to dst.
  2684  		_, srcReg := src.(*Register)
  2685  		if srcReg {
  2686  			if dstReg {
  2687  				x = e.p.NewValue1(pos, OpCopy, c.Type, c)
  2688  			} else {
  2689  				x = e.p.NewValue1(pos, OpStoreReg, loc.(LocalSlot).Type, c)
  2690  			}
  2691  		} else {
  2692  			if dstReg {
  2693  				x = e.p.NewValue1(pos, OpLoadReg, c.Type, c)
  2694  			} else {
  2695  				// mem->mem. Use temp register.
  2696  				r := e.findRegFor(c.Type)
  2697  				e.erase(r)
  2698  				t := e.p.NewValue1(pos, OpLoadReg, c.Type, c)
  2699  				e.set(r, vid, t, false, pos)
  2700  				x = e.p.NewValue1(pos, OpStoreReg, loc.(LocalSlot).Type, t)
  2701  			}
  2702  		}
  2703  	}
  2704  	e.set(loc, vid, x, true, pos)
  2705  	if x.Op == OpLoadReg && e.s.isGReg(register(loc.(*Register).num)) {
  2706  		e.s.f.Fatalf("processDest.OpLoadReg targeting g: " + x.LongString())
  2707  	}
  2708  	if splice != nil {
  2709  		(*splice).Uses--
  2710  		*splice = x
  2711  		x.Uses++
  2712  	}
  2713  	return true
  2714  }
  2715  
  2716  // set changes the contents of location loc to hold the given value and its cached representative.
  2717  func (e *edgeState) set(loc Location, vid ID, c *Value, final bool, pos src.XPos) {
  2718  	e.s.f.setHome(c, loc)
  2719  	e.contents[loc] = contentRecord{vid, c, final, pos}
  2720  	a := e.cache[vid]
  2721  	if len(a) == 0 {
  2722  		e.cachedVals = append(e.cachedVals, vid)
  2723  	}
  2724  	a = append(a, c)
  2725  	e.cache[vid] = a
  2726  	if r, ok := loc.(*Register); ok {
  2727  		if e.usedRegs&(regMask(1)<<uint(r.num)) != 0 {
  2728  			e.s.f.Fatalf("%v is already set (v%d/%v)", r, vid, c)
  2729  		}
  2730  		e.usedRegs |= regMask(1) << uint(r.num)
  2731  		if final {
  2732  			e.finalRegs |= regMask(1) << uint(r.num)
  2733  		}
  2734  		if len(a) == 1 {
  2735  			e.uniqueRegs |= regMask(1) << uint(r.num)
  2736  		}
  2737  		if len(a) == 2 {
  2738  			if t, ok := e.s.f.getHome(a[0].ID).(*Register); ok {
  2739  				e.uniqueRegs &^= regMask(1) << uint(t.num)
  2740  			}
  2741  		}
  2742  		if e.s.values[vid].rematerializeable {
  2743  			e.rematerializeableRegs |= regMask(1) << uint(r.num)
  2744  		}
  2745  	}
  2746  	if e.s.f.pass.debug > regDebug {
  2747  		fmt.Printf("%s\n", c.LongString())
  2748  		fmt.Printf("v%d now available in %s:%s\n", vid, loc, c)
  2749  	}
  2750  }
  2751  
  2752  // erase removes any user of loc.
  2753  func (e *edgeState) erase(loc Location) {
  2754  	cr := e.contents[loc]
  2755  	if cr.c == nil {
  2756  		return
  2757  	}
  2758  	vid := cr.vid
  2759  
  2760  	if cr.final {
  2761  		// Add a destination to move this value back into place.
  2762  		// Make sure it gets added to the tail of the destination queue
  2763  		// so we make progress on other moves first.
  2764  		e.extra = append(e.extra, dstRecord{loc, cr.vid, nil, cr.pos})
  2765  	}
  2766  
  2767  	// Remove c from the list of cached values.
  2768  	a := e.cache[vid]
  2769  	for i, c := range a {
  2770  		if e.s.f.getHome(c.ID) == loc {
  2771  			if e.s.f.pass.debug > regDebug {
  2772  				fmt.Printf("v%d no longer available in %s:%s\n", vid, loc, c)
  2773  			}
  2774  			a[i], a = a[len(a)-1], a[:len(a)-1]
  2775  			break
  2776  		}
  2777  	}
  2778  	e.cache[vid] = a
  2779  
  2780  	// Update register masks.
  2781  	if r, ok := loc.(*Register); ok {
  2782  		e.usedRegs &^= regMask(1) << uint(r.num)
  2783  		if cr.final {
  2784  			e.finalRegs &^= regMask(1) << uint(r.num)
  2785  		}
  2786  		e.rematerializeableRegs &^= regMask(1) << uint(r.num)
  2787  	}
  2788  	if len(a) == 1 {
  2789  		if r, ok := e.s.f.getHome(a[0].ID).(*Register); ok {
  2790  			e.uniqueRegs |= regMask(1) << uint(r.num)
  2791  		}
  2792  	}
  2793  }
  2794  
  2795  // findRegFor finds a register we can use to make a temp copy of type typ.
  2796  func (e *edgeState) findRegFor(typ *types.Type) Location {
  2797  	// Which registers are possibilities.
  2798  	m := e.s.compatRegs(typ)
  2799  
  2800  	// Pick a register. In priority order:
  2801  	// 1) an unused register
  2802  	// 2) a non-unique register not holding a final value
  2803  	// 3) a non-unique register
  2804  	// 4) a register holding a rematerializeable value
  2805  	x := m &^ e.usedRegs
  2806  	if x != 0 {
  2807  		return &e.s.registers[pickReg(x)]
  2808  	}
  2809  	x = m &^ e.uniqueRegs &^ e.finalRegs
  2810  	if x != 0 {
  2811  		return &e.s.registers[pickReg(x)]
  2812  	}
  2813  	x = m &^ e.uniqueRegs
  2814  	if x != 0 {
  2815  		return &e.s.registers[pickReg(x)]
  2816  	}
  2817  	x = m & e.rematerializeableRegs
  2818  	if x != 0 {
  2819  		return &e.s.registers[pickReg(x)]
  2820  	}
  2821  
  2822  	// No register is available.
  2823  	// Pick a register to spill.
  2824  	for _, vid := range e.cachedVals {
  2825  		a := e.cache[vid]
  2826  		for _, c := range a {
  2827  			if r, ok := e.s.f.getHome(c.ID).(*Register); ok && m>>uint(r.num)&1 != 0 {
  2828  				if !c.rematerializeable() {
  2829  					x := e.p.NewValue1(c.Pos, OpStoreReg, c.Type, c)
  2830  					// Allocate a temp location to spill a register to.
  2831  					t := LocalSlot{N: e.s.f.NewLocal(c.Pos, c.Type), Type: c.Type}
  2832  					// TODO: reuse these slots. They'll need to be erased first.
  2833  					e.set(t, vid, x, false, c.Pos)
  2834  					if e.s.f.pass.debug > regDebug {
  2835  						fmt.Printf("  SPILL %s->%s %s\n", r, t, x.LongString())
  2836  					}
  2837  				}
  2838  				// r will now be overwritten by the caller. At some point
  2839  				// later, the newly saved value will be moved back to its
  2840  				// final destination in processDest.
  2841  				return r
  2842  			}
  2843  		}
  2844  	}
  2845  
  2846  	fmt.Printf("m:%d unique:%d final:%d rematerializable:%d\n", m, e.uniqueRegs, e.finalRegs, e.rematerializeableRegs)
  2847  	for _, vid := range e.cachedVals {
  2848  		a := e.cache[vid]
  2849  		for _, c := range a {
  2850  			fmt.Printf("v%d: %s %s\n", vid, c, e.s.f.getHome(c.ID))
  2851  		}
  2852  	}
  2853  	e.s.f.Fatalf("can't find empty register on edge %s->%s", e.p, e.b)
  2854  	return nil
  2855  }
  2856  
  2857  // rematerializeable reports whether the register allocator should recompute
  2858  // a value instead of spilling/restoring it.
  2859  func (v *Value) rematerializeable() bool {
  2860  	if !opcodeTable[v.Op].rematerializeable {
  2861  		return false
  2862  	}
  2863  	for _, a := range v.Args {
  2864  		// Fixed-register allocations (SP, SB, etc.) are always available.
  2865  		// Any other argument of an opcode makes it not rematerializeable.
  2866  		if !opcodeTable[a.Op].fixedReg {
  2867  			return false
  2868  		}
  2869  	}
  2870  	return true
  2871  }
  2872  
  2873  type liveInfo struct {
  2874  	ID   ID       // ID of value
  2875  	dist int32    // # of instructions before next use
  2876  	pos  src.XPos // source position of next use
  2877  }
  2878  
  2879  // computeLive computes a map from block ID to a list of value IDs live at the end
  2880  // of that block. Together with the value ID is a count of how many instructions
  2881  // to the next use of that value. The resulting map is stored in s.live.
  2882  func (s *regAllocState) computeLive() {
  2883  	f := s.f
  2884  	// single block functions do not have variables that are live across
  2885  	// branches
  2886  	if len(f.Blocks) == 1 {
  2887  		return
  2888  	}
  2889  	po := f.postorder()
  2890  	s.live = make([][]liveInfo, f.NumBlocks())
  2891  	s.desired = make([]desiredState, f.NumBlocks())
  2892  	s.loopnest = f.loopnest()
  2893  
  2894  	rematIDs := make([]ID, 0, 64)
  2895  
  2896  	live := f.newSparseMapPos(f.NumValues())
  2897  	defer f.retSparseMapPos(live)
  2898  	t := f.newSparseMapPos(f.NumValues())
  2899  	defer f.retSparseMapPos(t)
  2900  
  2901  	s.loopnest.computeUnavoidableCalls()
  2902  
  2903  	// Liveness analysis.
  2904  	// This is an adapted version of the algorithm described in chapter 2.4.2
  2905  	// of Fabrice Rastello's On Sparse Intermediate Representations.
  2906  	//   https://web.archive.org/web/20240417212122if_/https://inria.hal.science/hal-00761555/file/habilitation.pdf#section.50
  2907  	//
  2908  	// For our implementation, we fall back to a traditional iterative algorithm when we encounter
  2909  	// Irreducible CFGs. They are very uncommon in Go code because they need to be constructed with
  2910  	// gotos and our current loopnest definition does not compute all the information that
  2911  	// we'd need to compute the loop ancestors for that step of the algorithm.
  2912  	//
  2913  	// Additionally, instead of only considering non-loop successors in the initial DFS phase,
  2914  	// we compute the liveout as the union of all successors. This larger liveout set is a subset
  2915  	// of the final liveout for the block and adding this information in the DFS phase means that
  2916  	// we get slightly more accurate distance information.
  2917  	var loopLiveIn map[*loop][]liveInfo
  2918  	var numCalls []int32
  2919  	if len(s.loopnest.loops) > 0 && !s.loopnest.hasIrreducible {
  2920  		loopLiveIn = make(map[*loop][]liveInfo)
  2921  		numCalls = f.Cache.allocInt32Slice(f.NumBlocks())
  2922  		defer f.Cache.freeInt32Slice(numCalls)
  2923  	}
  2924  
  2925  	for {
  2926  		changed := false
  2927  
  2928  		for _, b := range po {
  2929  			// Start with known live values at the end of the block.
  2930  			live.clear()
  2931  			for _, e := range s.live[b.ID] {
  2932  				live.set(e.ID, e.dist, e.pos)
  2933  			}
  2934  			update := false
  2935  			// arguments to phi nodes are live at this blocks out
  2936  			for _, e := range b.Succs {
  2937  				succ := e.b
  2938  				delta := branchDistance(b, succ)
  2939  				for _, v := range succ.Values {
  2940  					if v.Op != OpPhi {
  2941  						break
  2942  					}
  2943  					arg := v.Args[e.i]
  2944  					if s.values[arg.ID].needReg && (!live.contains(arg.ID) || delta < live.get(arg.ID)) {
  2945  						live.set(arg.ID, delta, v.Pos)
  2946  						update = true
  2947  					}
  2948  				}
  2949  			}
  2950  			if update {
  2951  				s.live[b.ID] = updateLive(live, s.live[b.ID])
  2952  			}
  2953  			// Add len(b.Values) to adjust from end-of-block distance
  2954  			// to beginning-of-block distance.
  2955  			c := live.contents()
  2956  			for i := range c {
  2957  				c[i].val += int32(len(b.Values))
  2958  			}
  2959  
  2960  			// Mark control values as live
  2961  			for _, c := range b.ControlValues() {
  2962  				if s.values[c.ID].needReg {
  2963  					live.set(c.ID, int32(len(b.Values)), b.Pos)
  2964  				}
  2965  			}
  2966  
  2967  			for i := len(b.Values) - 1; i >= 0; i-- {
  2968  				v := b.Values[i]
  2969  				live.remove(v.ID)
  2970  				if v.Op == OpPhi {
  2971  					continue
  2972  				}
  2973  				if opcodeTable[v.Op].call {
  2974  					if numCalls != nil {
  2975  						numCalls[b.ID]++
  2976  					}
  2977  					rematIDs = rematIDs[:0]
  2978  					c := live.contents()
  2979  					for i := range c {
  2980  						c[i].val += unlikelyDistance
  2981  						vid := c[i].key
  2982  						if s.values[vid].rematerializeable {
  2983  							rematIDs = append(rematIDs, vid)
  2984  						}
  2985  					}
  2986  					// We don't spill rematerializeable values, and assuming they
  2987  					// are live across a call would only force shuffle to add some
  2988  					// (dead) constant rematerialization. Remove them.
  2989  					for _, r := range rematIDs {
  2990  						live.remove(r)
  2991  					}
  2992  				}
  2993  				for _, a := range v.Args {
  2994  					if s.values[a.ID].needReg {
  2995  						live.set(a.ID, int32(i), v.Pos)
  2996  					}
  2997  				}
  2998  			}
  2999  			// This is a loop header, save our live-in so that
  3000  			// we can use it to fill in the loop bodies later
  3001  			if loopLiveIn != nil {
  3002  				loop := s.loopnest.b2l[b.ID]
  3003  				if loop != nil && loop.header.ID == b.ID {
  3004  					loopLiveIn[loop] = updateLive(live, nil)
  3005  				}
  3006  			}
  3007  			// For each predecessor of b, expand its list of live-at-end values.
  3008  			// invariant: live contains the values live at the start of b
  3009  			for _, e := range b.Preds {
  3010  				p := e.b
  3011  				delta := branchDistance(p, b)
  3012  
  3013  				// Start t off with the previously known live values at the end of p.
  3014  				t.clear()
  3015  				for _, e := range s.live[p.ID] {
  3016  					t.set(e.ID, e.dist, e.pos)
  3017  				}
  3018  				update := false
  3019  
  3020  				// Add new live values from scanning this block.
  3021  				for _, e := range live.contents() {
  3022  					d := e.val + delta
  3023  					if !t.contains(e.key) || d < t.get(e.key) {
  3024  						update = true
  3025  						t.set(e.key, d, e.pos)
  3026  					}
  3027  				}
  3028  
  3029  				if !update {
  3030  					continue
  3031  				}
  3032  				s.live[p.ID] = updateLive(t, s.live[p.ID])
  3033  				changed = true
  3034  			}
  3035  		}
  3036  
  3037  		// Doing a traditional iterative algorithm and have run
  3038  		// out of changes
  3039  		if !changed {
  3040  			break
  3041  		}
  3042  
  3043  		// Doing a pre-pass and will fill in the liveness information
  3044  		// later
  3045  		if loopLiveIn != nil {
  3046  			break
  3047  		}
  3048  		// For loopless code, we have full liveness info after a single
  3049  		// iteration
  3050  		if len(s.loopnest.loops) == 0 {
  3051  			break
  3052  		}
  3053  	}
  3054  	if f.pass.debug > regDebug {
  3055  		s.debugPrintLive("after dfs walk", f, s.live, s.desired)
  3056  	}
  3057  
  3058  	// irreducible CFGs and functions without loops are already
  3059  	// done, compute their desired registers and return
  3060  	if loopLiveIn == nil {
  3061  		s.computeDesired()
  3062  		return
  3063  	}
  3064  
  3065  	// Walk the loopnest from outer to inner, adding
  3066  	// all live-in values from their parent. Instead of
  3067  	// a recursive algorithm, iterate in depth order.
  3068  	// TODO(dmo): can we permute the loopnest? can we avoid this copy?
  3069  	loops := slices.Clone(s.loopnest.loops)
  3070  	slices.SortFunc(loops, func(a, b *loop) int {
  3071  		return cmp.Compare(a.depth, b.depth)
  3072  	})
  3073  
  3074  	loopset := f.newSparseMapPos(f.NumValues())
  3075  	defer f.retSparseMapPos(loopset)
  3076  	for _, loop := range loops {
  3077  		if loop.outer == nil {
  3078  			continue
  3079  		}
  3080  		livein := loopLiveIn[loop]
  3081  		loopset.clear()
  3082  		for _, l := range livein {
  3083  			loopset.set(l.ID, l.dist, l.pos)
  3084  		}
  3085  		update := false
  3086  		for _, l := range loopLiveIn[loop.outer] {
  3087  			if !loopset.contains(l.ID) {
  3088  				loopset.set(l.ID, l.dist, l.pos)
  3089  				update = true
  3090  			}
  3091  		}
  3092  		if update {
  3093  			loopLiveIn[loop] = updateLive(loopset, livein)
  3094  		}
  3095  	}
  3096  	// unknownDistance is a sentinel value for when we know a variable
  3097  	// is live at any given block, but we do not yet know how far until it's next
  3098  	// use. The distance will be computed later.
  3099  	const unknownDistance = -1
  3100  
  3101  	// add live-in values of the loop headers to their children.
  3102  	// This includes the loop headers themselves, since they can have values
  3103  	// that die in the middle of the block and aren't live-out
  3104  	for _, b := range po {
  3105  		loop := s.loopnest.b2l[b.ID]
  3106  		if loop == nil {
  3107  			continue
  3108  		}
  3109  		headerLive := loopLiveIn[loop]
  3110  		loopset.clear()
  3111  		for _, l := range s.live[b.ID] {
  3112  			loopset.set(l.ID, l.dist, l.pos)
  3113  		}
  3114  		update := false
  3115  		for _, l := range headerLive {
  3116  			if !loopset.contains(l.ID) {
  3117  				loopset.set(l.ID, unknownDistance, src.NoXPos)
  3118  				update = true
  3119  			}
  3120  		}
  3121  		if update {
  3122  			s.live[b.ID] = updateLive(loopset, s.live[b.ID])
  3123  		}
  3124  	}
  3125  	if f.pass.debug > regDebug {
  3126  		s.debugPrintLive("after live loop prop", f, s.live, s.desired)
  3127  	}
  3128  	// Filling in liveness from loops leaves some blocks with no distance information
  3129  	// Run over them and fill in the information from their successors.
  3130  	// To stabilize faster, we quit when no block has missing values and we only
  3131  	// look at blocks that still have missing values in subsequent iterations
  3132  	unfinishedBlocks := f.Cache.allocBlockSlice(len(po))
  3133  	defer f.Cache.freeBlockSlice(unfinishedBlocks)
  3134  	copy(unfinishedBlocks, po)
  3135  
  3136  	for len(unfinishedBlocks) > 0 {
  3137  		n := 0
  3138  		for _, b := range unfinishedBlocks {
  3139  			live.clear()
  3140  			unfinishedValues := 0
  3141  			for _, l := range s.live[b.ID] {
  3142  				if l.dist == unknownDistance {
  3143  					unfinishedValues++
  3144  				}
  3145  				live.set(l.ID, l.dist, l.pos)
  3146  			}
  3147  			update := false
  3148  			for _, e := range b.Succs {
  3149  				succ := e.b
  3150  				for _, l := range s.live[succ.ID] {
  3151  					if !live.contains(l.ID) || l.dist == unknownDistance {
  3152  						continue
  3153  					}
  3154  					dist := int32(len(succ.Values)) + l.dist + branchDistance(b, succ)
  3155  					dist += numCalls[succ.ID] * unlikelyDistance
  3156  					val := live.get(l.ID)
  3157  					switch {
  3158  					case val == unknownDistance:
  3159  						unfinishedValues--
  3160  						fallthrough
  3161  					case dist < val:
  3162  						update = true
  3163  						live.set(l.ID, dist, l.pos)
  3164  					}
  3165  				}
  3166  			}
  3167  			if update {
  3168  				s.live[b.ID] = updateLive(live, s.live[b.ID])
  3169  			}
  3170  			if unfinishedValues > 0 {
  3171  				unfinishedBlocks[n] = b
  3172  				n++
  3173  			}
  3174  		}
  3175  		unfinishedBlocks = unfinishedBlocks[:n]
  3176  	}
  3177  
  3178  	// Sort live values in order of their nearest next use.
  3179  	// Useful for promoting values to registers, nearest use first.
  3180  	for _, b := range f.Blocks {
  3181  		slices.SortFunc(s.live[b.ID], func(a, b liveInfo) int {
  3182  			if a.dist != b.dist {
  3183  				return cmp.Compare(a.dist, b.dist)
  3184  			}
  3185  			return cmp.Compare(a.ID, b.ID) // for deterministic sorting
  3186  		})
  3187  	}
  3188  
  3189  	s.computeDesired()
  3190  
  3191  	if f.pass.debug > regDebug {
  3192  		s.debugPrintLive("final", f, s.live, s.desired)
  3193  	}
  3194  }
  3195  
  3196  // computeDesired computes the desired register information at the end of each block.
  3197  // It is essentially a liveness analysis on machine registers instead of SSA values
  3198  // The desired register information is stored in s.desired.
  3199  func (s *regAllocState) computeDesired() {
  3200  
  3201  	// TODO: Can we speed this up using the liveness information we have already
  3202  	// from computeLive?
  3203  	// TODO: Since we don't propagate information through phi nodes, can we do
  3204  	// this as a single dominator tree walk instead of the iterative solution?
  3205  	var desired desiredState
  3206  	f := s.f
  3207  	po := f.postorder()
  3208  	for {
  3209  		changed := false
  3210  		for _, b := range po {
  3211  			desired.copy(&s.desired[b.ID])
  3212  			for i := len(b.Values) - 1; i >= 0; i-- {
  3213  				v := b.Values[i]
  3214  				prefs := desired.remove(v.ID)
  3215  				if v.Op == OpPhi {
  3216  					// TODO: if v is a phi, save desired register for phi inputs.
  3217  					// For now, we just drop it and don't propagate
  3218  					// desired registers back though phi nodes.
  3219  					continue
  3220  				}
  3221  				regspec := s.regspec(v)
  3222  				// Cancel desired registers if they get clobbered.
  3223  				desired.clobber(regspec.clobbers)
  3224  				// Update desired registers if there are any fixed register inputs.
  3225  				for _, j := range regspec.inputs {
  3226  					if countRegs(j.regs) != 1 {
  3227  						continue
  3228  					}
  3229  					desired.clobber(j.regs)
  3230  					desired.add(v.Args[j.idx].ID, pickReg(j.regs))
  3231  				}
  3232  				// Set desired register of input 0 if this is a 2-operand instruction.
  3233  				if opcodeTable[v.Op].resultInArg0 || v.Op == OpAMD64ADDQconst || v.Op == OpAMD64ADDLconst || v.Op == OpSelect0 {
  3234  					// ADDQconst is added here because we want to treat it as resultInArg0 for
  3235  					// the purposes of desired registers, even though it is not an absolute requirement.
  3236  					// This is because we'd rather implement it as ADDQ instead of LEAQ.
  3237  					// Same for ADDLconst
  3238  					// Select0 is added here to propagate the desired register to the tuple-generating instruction.
  3239  					if opcodeTable[v.Op].commutative {
  3240  						desired.addList(v.Args[1].ID, prefs)
  3241  					}
  3242  					desired.addList(v.Args[0].ID, prefs)
  3243  				}
  3244  			}
  3245  			for _, e := range b.Preds {
  3246  				p := e.b
  3247  				changed = s.desired[p.ID].merge(&desired) || changed
  3248  			}
  3249  		}
  3250  		if !changed || (!s.loopnest.hasIrreducible && len(s.loopnest.loops) == 0) {
  3251  			break
  3252  		}
  3253  	}
  3254  }
  3255  
  3256  // updateLive updates a given liveInfo slice with the contents of t
  3257  func updateLive(t *sparseMapPos, live []liveInfo) []liveInfo {
  3258  	live = live[:0]
  3259  	if cap(live) < t.size() {
  3260  		live = make([]liveInfo, 0, t.size())
  3261  	}
  3262  	for _, e := range t.contents() {
  3263  		live = append(live, liveInfo{e.key, e.val, e.pos})
  3264  	}
  3265  	return live
  3266  }
  3267  
  3268  // branchDistance calculates the distance between a block and a
  3269  // successor in pseudo-instructions. This is used to indicate
  3270  // likeliness
  3271  func branchDistance(b *Block, s *Block) int32 {
  3272  	if len(b.Succs) == 2 {
  3273  		if b.Succs[0].b == s && b.Likely == BranchLikely ||
  3274  			b.Succs[1].b == s && b.Likely == BranchUnlikely {
  3275  			return likelyDistance
  3276  		}
  3277  		if b.Succs[0].b == s && b.Likely == BranchUnlikely ||
  3278  			b.Succs[1].b == s && b.Likely == BranchLikely {
  3279  			return unlikelyDistance
  3280  		}
  3281  	}
  3282  	// Note: the branch distance must be at least 1 to distinguish the control
  3283  	// value use from the first user in a successor block.
  3284  	return normalDistance
  3285  }
  3286  
  3287  func (s *regAllocState) debugPrintLive(stage string, f *Func, live [][]liveInfo, desired []desiredState) {
  3288  	fmt.Printf("%s: live values at end of each block: %s\n", stage, f.Name)
  3289  	for _, b := range f.Blocks {
  3290  		s.debugPrintLiveBlock(b, live[b.ID], &desired[b.ID])
  3291  	}
  3292  }
  3293  
  3294  func (s *regAllocState) debugPrintLiveBlock(b *Block, live []liveInfo, desired *desiredState) {
  3295  	fmt.Printf("  %s:", b)
  3296  	slices.SortFunc(live, func(a, b liveInfo) int {
  3297  		return cmp.Compare(a.ID, b.ID)
  3298  	})
  3299  	for _, x := range live {
  3300  		fmt.Printf(" v%d(%d)", x.ID, x.dist)
  3301  		for _, e := range desired.entries {
  3302  			if e.ID != x.ID {
  3303  				continue
  3304  			}
  3305  			fmt.Printf("[")
  3306  			first := true
  3307  			for _, r := range e.regs {
  3308  				if r == noRegister {
  3309  					continue
  3310  				}
  3311  				if !first {
  3312  					fmt.Printf(",")
  3313  				}
  3314  				fmt.Print(&s.registers[r])
  3315  				first = false
  3316  			}
  3317  			fmt.Printf("]")
  3318  		}
  3319  	}
  3320  	if avoid := desired.avoid; avoid != 0 {
  3321  		fmt.Printf(" avoid=%v", s.RegMaskString(avoid))
  3322  	}
  3323  	fmt.Println()
  3324  }
  3325  
  3326  // A desiredState represents desired register assignments.
  3327  type desiredState struct {
  3328  	// Desired assignments will be small, so we just use a list
  3329  	// of valueID+registers entries.
  3330  	entries []desiredStateEntry
  3331  	// Registers that other values want to be in.  This value will
  3332  	// contain at least the union of the regs fields of entries, but
  3333  	// may contain additional entries for values that were once in
  3334  	// this data structure but are no longer.
  3335  	avoid regMask
  3336  }
  3337  type desiredStateEntry struct {
  3338  	// (pre-regalloc) value
  3339  	ID ID
  3340  	// Registers it would like to be in, in priority order.
  3341  	// Unused slots are filled with noRegister.
  3342  	// For opcodes that return tuples, we track desired registers only
  3343  	// for the first element of the tuple (see desiredSecondReg for
  3344  	// tracking the desired register for second part of a tuple).
  3345  	regs [4]register
  3346  }
  3347  
  3348  // get returns a list of desired registers for value vid.
  3349  func (d *desiredState) get(vid ID) [4]register {
  3350  	for _, e := range d.entries {
  3351  		if e.ID == vid {
  3352  			return e.regs
  3353  		}
  3354  	}
  3355  	return [4]register{noRegister, noRegister, noRegister, noRegister}
  3356  }
  3357  
  3358  // add records that we'd like value vid to be in register r.
  3359  func (d *desiredState) add(vid ID, r register) {
  3360  	d.avoid |= regMask(1) << r
  3361  	for i := range d.entries {
  3362  		e := &d.entries[i]
  3363  		if e.ID != vid {
  3364  			continue
  3365  		}
  3366  		if e.regs[0] == r {
  3367  			// Already known and highest priority
  3368  			return
  3369  		}
  3370  		for j := 1; j < len(e.regs); j++ {
  3371  			if e.regs[j] == r {
  3372  				// Move from lower priority to top priority
  3373  				copy(e.regs[1:], e.regs[:j])
  3374  				e.regs[0] = r
  3375  				return
  3376  			}
  3377  		}
  3378  		copy(e.regs[1:], e.regs[:])
  3379  		e.regs[0] = r
  3380  		return
  3381  	}
  3382  	d.entries = append(d.entries, desiredStateEntry{vid, [4]register{r, noRegister, noRegister, noRegister}})
  3383  }
  3384  
  3385  func (d *desiredState) addList(vid ID, regs [4]register) {
  3386  	// regs is in priority order, so iterate in reverse order.
  3387  	for i := len(regs) - 1; i >= 0; i-- {
  3388  		r := regs[i]
  3389  		if r != noRegister {
  3390  			d.add(vid, r)
  3391  		}
  3392  	}
  3393  }
  3394  
  3395  // clobber erases any desired registers in the set m.
  3396  func (d *desiredState) clobber(m regMask) {
  3397  	for i := 0; i < len(d.entries); {
  3398  		e := &d.entries[i]
  3399  		j := 0
  3400  		for _, r := range e.regs {
  3401  			if r != noRegister && m>>r&1 == 0 {
  3402  				e.regs[j] = r
  3403  				j++
  3404  			}
  3405  		}
  3406  		if j == 0 {
  3407  			// No more desired registers for this value.
  3408  			d.entries[i] = d.entries[len(d.entries)-1]
  3409  			d.entries = d.entries[:len(d.entries)-1]
  3410  			continue
  3411  		}
  3412  		for ; j < len(e.regs); j++ {
  3413  			e.regs[j] = noRegister
  3414  		}
  3415  		i++
  3416  	}
  3417  	d.avoid &^= m
  3418  }
  3419  
  3420  // copy copies a desired state from another desiredState x.
  3421  func (d *desiredState) copy(x *desiredState) {
  3422  	d.entries = append(d.entries[:0], x.entries...)
  3423  	d.avoid = x.avoid
  3424  }
  3425  
  3426  // remove removes the desired registers for vid and returns them.
  3427  func (d *desiredState) remove(vid ID) [4]register {
  3428  	for i := range d.entries {
  3429  		if d.entries[i].ID == vid {
  3430  			regs := d.entries[i].regs
  3431  			d.entries[i] = d.entries[len(d.entries)-1]
  3432  			d.entries = d.entries[:len(d.entries)-1]
  3433  			return regs
  3434  		}
  3435  	}
  3436  	return [4]register{noRegister, noRegister, noRegister, noRegister}
  3437  }
  3438  
  3439  // merge merges another desired state x into d. Returns whether the set has
  3440  // changed
  3441  func (d *desiredState) merge(x *desiredState) bool {
  3442  	oldAvoid := d.avoid
  3443  	d.avoid |= x.avoid
  3444  	// There should only be a few desired registers, so
  3445  	// linear insert is ok.
  3446  	for _, e := range x.entries {
  3447  		d.addList(e.ID, e.regs)
  3448  	}
  3449  	return oldAvoid != d.avoid
  3450  }
  3451  
  3452  // computeUnavoidableCalls computes the containsUnavoidableCall fields in the loop nest.
  3453  func (loopnest *loopnest) computeUnavoidableCalls() {
  3454  	f := loopnest.f
  3455  
  3456  	hasCall := f.Cache.allocBoolSlice(f.NumBlocks())
  3457  	defer f.Cache.freeBoolSlice(hasCall)
  3458  	for _, b := range f.Blocks {
  3459  		if b.containsCall() {
  3460  			hasCall[b.ID] = true
  3461  		}
  3462  	}
  3463  	found := f.Cache.allocSparseSet(f.NumBlocks())
  3464  	defer f.Cache.freeSparseSet(found)
  3465  	// Run dfs to find path through the loop that avoids all calls.
  3466  	// Such path either escapes the loop or returns back to the header.
  3467  	// It isn't enough to have exit not dominated by any call, for example:
  3468  	// ... some loop
  3469  	// call1    call2
  3470  	//   \       /
  3471  	//     block
  3472  	// ...
  3473  	// block is not dominated by any single call, but we don't have call-free path to it.
  3474  loopLoop:
  3475  	for _, l := range loopnest.loops {
  3476  		found.clear()
  3477  		tovisit := make([]*Block, 0, 8)
  3478  		tovisit = append(tovisit, l.header)
  3479  		for len(tovisit) > 0 {
  3480  			cur := tovisit[len(tovisit)-1]
  3481  			tovisit = tovisit[:len(tovisit)-1]
  3482  			if hasCall[cur.ID] {
  3483  				continue
  3484  			}
  3485  			for _, s := range cur.Succs {
  3486  				nb := s.Block()
  3487  				if nb == l.header {
  3488  					// Found a call-free path around the loop.
  3489  					continue loopLoop
  3490  				}
  3491  				if found.contains(nb.ID) {
  3492  					// Already found via another path.
  3493  					continue
  3494  				}
  3495  				nl := loopnest.b2l[nb.ID]
  3496  				if nl == nil || (nl.depth <= l.depth && nl != l) {
  3497  					// Left the loop.
  3498  					continue
  3499  				}
  3500  				tovisit = append(tovisit, nb)
  3501  				found.add(nb.ID)
  3502  			}
  3503  		}
  3504  		// No call-free path was found.
  3505  		l.containsUnavoidableCall = true
  3506  	}
  3507  }
  3508  
  3509  func (b *Block) containsCall() bool {
  3510  	if b.Kind == BlockDefer {
  3511  		return true
  3512  	}
  3513  	for _, v := range b.Values {
  3514  		if opcodeTable[v.Op].call {
  3515  			return true
  3516  		}
  3517  	}
  3518  	return false
  3519  }
  3520  

View as plain text