Source file src/cmd/compile/internal/ssa/regalloc.go

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Register allocation.
     6  //
     7  // We use a version of a linear scan register allocator. We treat the
     8  // whole function as a single long basic block and run through
     9  // it using a greedy register allocator. Then all merge edges
    10  // (those targeting a block with len(Preds)>1) are processed to
    11  // shuffle data into the place that the target of the edge expects.
    12  //
    13  // The greedy allocator moves values into registers just before they
    14  // are used, spills registers only when necessary, and spills the
    15  // value whose next use is farthest in the future.
    16  //
    17  // The register allocator requires that a block is not scheduled until
     18  // at least one of its predecessors has been scheduled. The most recent
    19  // such predecessor provides the starting register state for a block.
    20  //
    21  // It also requires that there are no critical edges (critical =
    22  // comes from a block with >1 successor and goes to a block with >1
    23  // predecessor).  This makes it easy to add fixup code on merge edges -
    24  // the source of a merge edge has only one successor, so we can add
    25  // fixup code to the end of that block.
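        //
        // For example (illustrative only): if b1 and b2 both end in a plain jump to a
        // merge block b3, and b3 (following its primary predecessor b1) expects x in
        // AX, the shuffle step appends an x:CX->AX copy to the end of b2:
        //
        //     b1: ... x in AX ...       b2: ... x in CX ...
        //         goto b3                   x' = Copy x : AX
        //                                   goto b3
        //     b3: expects x in AX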
    26  
    27  // Spilling
    28  //
    29  // During the normal course of the allocator, we might throw a still-live
    30  // value out of all registers. When that value is subsequently used, we must
    31  // load it from a slot on the stack. We must also issue an instruction to
    32  // initialize that stack location with a copy of v.
    33  //
    34  // pre-regalloc:
    35  //   (1) v = Op ...
    36  //   (2) x = Op ...
    37  //   (3) ... = Op v ...
    38  //
    39  // post-regalloc:
    40  //   (1) v = Op ...    : AX // computes v, store result in AX
    41  //       s = StoreReg v     // spill v to a stack slot
    42  //   (2) x = Op ...    : AX // some other op uses AX
    43  //       c = LoadReg s : CX // restore v from stack slot
    44  //   (3) ... = Op c ...     // use the restored value
    45  //
    46  // Allocation occurs normally until we reach (3) and we realize we have
    47  // a use of v and it isn't in any register. At that point, we allocate
    48  // a spill (a StoreReg) for v. We can't determine the correct place for
    49  // the spill at this point, so we allocate the spill as blockless initially.
    50  // The restore is then generated to load v back into a register so it can
    51  // be used. Subsequent uses of v will use the restored value c instead.
    52  //
    53  // What remains is the question of where to schedule the spill.
    54  // During allocation, we keep track of the dominator of all restores of v.
    55  // The spill of v must dominate that block. The spill must also be issued at
    56  // a point where v is still in a register.
    57  //
    58  // To find the right place, start at b, the block which dominates all restores.
    59  //  - If b is v.Block, then issue the spill right after v.
    60  //    It is known to be in a register at that point, and dominates any restores.
    61  //  - Otherwise, if v is in a register at the start of b,
    62  //    put the spill of v at the start of b.
    63  //  - Otherwise, set b = immediate dominator of b, and repeat.
    64  //
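        // A sketch of that search (illustrative pseudocode; the helper names are
        // hypothetical, and the actual spill-placement code later in this file
        // differs in details):
        //
        //     b := dominatorOfAllRestores(v)
        //     for {
        //         if b == v.Block          { place spill just after v; break }
        //         if vInRegisterAtStart(b) { place spill at start of b; break }
        //         b = idom(b)
        //     }
        //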
    65  // Phi values are special, as always. We define two kinds of phis, those
    66  // where the merge happens in a register (a "register" phi) and those where
    67  // the merge happens in a stack location (a "stack" phi).
    68  //
    69  // A register phi must have the phi and all of its inputs allocated to the
    70  // same register. Register phis are spilled similarly to regular ops.
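        // For example (illustrative):
        //     b1: y = ... : AX        b2: z = ... : AX
        //         goto b3                 goto b3
        //     b3: x = phi(y, z) : AX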
    71  //
    72  // A stack phi must have the phi and all of its inputs allocated to the same
    73  // stack location. Stack phis start out life already spilled - each phi
    74  // input must be a store (using StoreReg) at the end of the corresponding
    75  // predecessor block.
    76  //     b1: y = ... : AX        b2: z = ... : BX
    77  //         y2 = StoreReg y         z2 = StoreReg z
    78  //         goto b3                 goto b3
    79  //     b3: x = phi(y2, z2)
    80  // The stack allocator knows that StoreReg args of stack-allocated phis
    81  // must be allocated to the same stack slot as the phi that uses them.
    82  // x is now a spilled value and a restore must appear before its first use.
    83  
    84  // TODO
    85  
    86  // Use an affinity graph to mark two values which should use the
    87  // same register. This affinity graph will be used to prefer certain
    88  // registers for allocation. This affinity helps eliminate moves that
    89  // are required for phi implementations and helps generate allocations
    90  // for 2-register architectures.
    91  
    92  // Note: regalloc generates a not-quite-SSA output. If we have:
    93  //
    94  //             b1: x = ... : AX
    95  //                 x2 = StoreReg x
    96  //                 ... AX gets reused for something else ...
    97  //                 if ... goto b3 else b4
    98  //
    99  //   b3: x3 = LoadReg x2 : BX       b4: x4 = LoadReg x2 : CX
   100  //       ... use x3 ...                 ... use x4 ...
   101  //
   102  //             b2: ... use x3 ...
   103  //
   104  // If b3 is the primary predecessor of b2, then we use x3 in b2 and
   105  // add a x4:CX->BX copy at the end of b4.
   106  // But the definition of x3 doesn't dominate b2.  We should really
   107  // insert an extra phi at the start of b2 (x5=phi(x3,x4):BX) to keep
   108  // SSA form. For now, we ignore this problem as remaining in strict
   109  // SSA form isn't needed after regalloc. We'll just leave the use
   110  // of x3 not dominated by the definition of x3, and the CX->BX copy
   111  // will have no use (so don't run deadcode after regalloc!).
   112  // TODO: maybe we should introduce these extra phis?
   113  
   114  package ssa
   115  
   116  import (
   117  	"cmd/compile/internal/base"
   118  	"cmd/compile/internal/ir"
   119  	"cmd/compile/internal/types"
   120  	"cmd/internal/src"
   121  	"cmd/internal/sys"
   122  	"cmp"
   123  	"fmt"
   124  	"internal/buildcfg"
   125  	"math"
   126  	"math/bits"
   127  	"slices"
   128  	"unsafe"
   129  )
   130  
   131  const (
   132  	moveSpills = iota
   133  	logSpills
   134  	regDebug
   135  	stackDebug
   136  )
   137  
   138  // distance is a measure of how far into the future values are used.
   139  // distance is measured in units of instructions.
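        // These weights are added when liveness information is propagated backwards
        // across a block edge, depending on how likely that edge is to be taken.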
   140  const (
   141  	likelyDistance   = 1
   142  	normalDistance   = 10
   143  	unlikelyDistance = 100
   144  )
   145  
   146  // regalloc performs register allocation on f. It sets f.RegAlloc
   147  // to the resulting allocation.
   148  func regalloc(f *Func) {
   149  	var s regAllocState
   150  	s.init(f)
   151  	s.regalloc(f)
   152  	s.close()
   153  }
   154  
   155  type register uint8
   156  
   157  const noRegister register = 255
   158  
   159  // For bulk initializing
   160  var noRegisters [32]register = [32]register{
   161  	noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister,
   162  	noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister,
   163  	noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister,
   164  	noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister,
   165  }
   166  
   167  // A regMask encodes a set of machine registers.
   168  // TODO: regMask -> regSet?
   169  type regMask uint64
   170  
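        // String returns the registers in m as a space-separated list of generic
        // register names; for example (illustratively), a mask with bits 0 and 3
        // set prints as "r0 r3".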
   171  func (m regMask) String() string {
   172  	s := ""
   173  	for r := register(0); m != 0; r++ {
   174  		if m>>r&1 == 0 {
   175  			continue
   176  		}
   177  		m &^= regMask(1) << r
   178  		if s != "" {
   179  			s += " "
   180  		}
   181  		s += fmt.Sprintf("r%d", r)
   182  	}
   183  	return s
   184  }
   185  
   186  func (m regMask) contains(r register) bool {
   187  	return m>>r&1 != 0
   188  }
   189  
   190  func (s *regAllocState) RegMaskString(m regMask) string {
   191  	str := ""
   192  	for r := register(0); m != 0; r++ {
   193  		if m>>r&1 == 0 {
   194  			continue
   195  		}
   196  		m &^= regMask(1) << r
   197  		if str != "" {
   198  			str += " "
   199  		}
   200  		str += s.registers[r].String()
   201  	}
   202  	return str
   203  }
   204  
   205  // countRegs returns the number of set bits in the register mask.
   206  func countRegs(r regMask) int {
   207  	return bits.OnesCount64(uint64(r))
   208  }
   209  
   210  // pickReg picks an arbitrary register from the register mask.
   211  func pickReg(r regMask) register {
   212  	if r == 0 {
   213  		panic("can't pick a register from an empty set")
   214  	}
   215  	// pick the lowest one
   216  	return register(bits.TrailingZeros64(uint64(r)))
   217  }
   218  
   219  type use struct {
   220  	// distance from start of the block to a use of a value
   221  	//   dist == 0                 used by first instruction in block
   222  	//   dist == len(b.Values)-1   used by last instruction in block
   223  	//   dist == len(b.Values)     used by block's control value
   224  	//   dist  > len(b.Values)     used by a subsequent block
   225  	dist int32
   226  	pos  src.XPos // source position of the use
   227  	next *use     // linked list of uses of a value in nondecreasing dist order
   228  }
   229  
   230  // A valState records the register allocation state for a (pre-regalloc) value.
   231  type valState struct {
   232  	regs              regMask // the set of registers holding a Value (usually just one)
   233  	uses              *use    // list of uses in this block
   234  	spill             *Value  // spilled copy of the Value (if any)
   235  	restoreMin        int32   // minimum of all restores' blocks' sdom.entry
   236  	restoreMax        int32   // maximum of all restores' blocks' sdom.exit
    237  	needReg           bool    // cached value of !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && !v.Type.IsTuple()
   238  	rematerializeable bool    // cached value of v.rematerializeable()
   239  }
   240  
   241  type regState struct {
   242  	v *Value // Original (preregalloc) Value stored in this register.
   243  	c *Value // A Value equal to v which is currently in a register.  Might be v or a copy of it.
   244  	// If a register is unused, v==c==nil
   245  }
   246  
   247  type regAllocState struct {
   248  	f *Func
   249  
   250  	sdom        SparseTree
   251  	registers   []Register
   252  	numRegs     register
   253  	SPReg       register
   254  	SBReg       register
   255  	GReg        register
   256  	ZeroIntReg  register
   257  	allocatable regMask
   258  
   259  	// live values at the end of each block.  live[b.ID] is a list of value IDs
   260  	// which are live at the end of b, together with a count of how many instructions
   261  	// forward to the next use.
   262  	live [][]liveInfo
   263  	// desired register assignments at the end of each block.
   264  	// Note that this is a static map computed before allocation occurs. Dynamic
   265  	// register desires (from partially completed allocations) will trump
   266  	// this information.
   267  	desired []desiredState
   268  
   269  	// current state of each (preregalloc) Value
   270  	values []valState
   271  
   272  	// ID of SP, SB values
   273  	sp, sb ID
   274  
   275  	// For each Value, map from its value ID back to the
   276  	// preregalloc Value it was derived from.
   277  	orig []*Value
   278  
   279  	// current state of each register.
   280  	// Includes only registers in allocatable.
   281  	regs []regState
   282  
   283  	// registers that contain values which can't be kicked out
   284  	nospill regMask
   285  
   286  	// mask of registers currently in use
   287  	used regMask
   288  
   289  	// mask of registers used since the start of the current block
   290  	usedSinceBlockStart regMask
   291  
   292  	// mask of registers used in the current instruction
   293  	tmpused regMask
   294  
   295  	// current block we're working on
   296  	curBlock *Block
   297  
   298  	// cache of use records
   299  	freeUseRecords *use
   300  
   301  	// endRegs[blockid] is the register state at the end of each block.
   302  	// encoded as a set of endReg records.
   303  	endRegs [][]endReg
   304  
   305  	// startRegs[blockid] is the register state at the start of merge blocks.
   306  	// saved state does not include the state of phi ops in the block.
   307  	startRegs [][]startReg
   308  
   309  	// startRegsMask is a mask of the registers in startRegs[curBlock.ID].
    310  	// Registers dropped from startRegsMask are later synchronized back to
   311  	// startRegs by dropping from there as well.
   312  	startRegsMask regMask
   313  
   314  	// spillLive[blockid] is the set of live spills at the end of each block
   315  	spillLive [][]ID
   316  
   317  	// a set of copies we generated to move things around, and
   318  	// whether it is used in shuffle. Unused copies will be deleted.
   319  	copies map[*Value]bool
   320  
   321  	loopnest *loopnest
   322  
   323  	// choose a good order in which to visit blocks for allocation purposes.
   324  	visitOrder []*Block
   325  
   326  	// blockOrder[b.ID] corresponds to the index of block b in visitOrder.
   327  	blockOrder []int32
   328  
   329  	// whether to insert instructions that clobber dead registers at call sites
   330  	doClobber bool
   331  
   332  	// For each instruction index in a basic block, the index of the next call
   333  	// at or after that instruction index.
    334  	// If there is no next call, the entry is math.MaxInt32.
   335  	// nextCall for a call instruction points to itself.
   336  	// (Indexes and results are pre-regalloc.)
   337  	nextCall []int32
   338  
   339  	// Index of the instruction we're currently working on.
   340  	// Index is expressed in terms of the pre-regalloc b.Values list.
   341  	curIdx int
   342  }
   343  
   344  type endReg struct {
   345  	r register
   346  	v *Value // pre-regalloc value held in this register (TODO: can we use ID here?)
   347  	c *Value // cached version of the value
   348  }
   349  
   350  type startReg struct {
   351  	r   register
   352  	v   *Value   // pre-regalloc value needed in this register
   353  	c   *Value   // cached version of the value
   354  	pos src.XPos // source position of use of this register
   355  }
   356  
   357  // freeReg frees up register r. Any current user of r is kicked out.
   358  func (s *regAllocState) freeReg(r register) {
   359  	if !s.allocatable.contains(r) && !s.isGReg(r) {
   360  		return
   361  	}
   362  	v := s.regs[r].v
   363  	if v == nil {
   364  		s.f.Fatalf("tried to free an already free register %d\n", r)
   365  	}
   366  
   367  	// Mark r as unused.
   368  	if s.f.pass.debug > regDebug {
   369  		fmt.Printf("freeReg %s (dump %s/%s)\n", &s.registers[r], v, s.regs[r].c)
   370  	}
   371  	s.regs[r] = regState{}
   372  	s.values[v.ID].regs &^= regMask(1) << r
   373  	s.used &^= regMask(1) << r
   374  }
   375  
   376  // freeRegs frees up all registers listed in m.
   377  func (s *regAllocState) freeRegs(m regMask) {
   378  	for m&s.used != 0 {
   379  		s.freeReg(pickReg(m & s.used))
   380  	}
   381  }
   382  
   383  // clobberRegs inserts instructions that clobber registers listed in m.
   384  func (s *regAllocState) clobberRegs(m regMask) {
    385  	m &= s.allocatable & s.f.Config.gpRegMask // only integer registers can contain pointers, so only clobber those
   386  	for m != 0 {
   387  		r := pickReg(m)
   388  		m &^= 1 << r
   389  		x := s.curBlock.NewValue0(src.NoXPos, OpClobberReg, types.TypeVoid)
   390  		s.f.setHome(x, &s.registers[r])
   391  	}
   392  }
   393  
   394  // setOrig records that c's original value is the same as
   395  // v's original value.
   396  func (s *regAllocState) setOrig(c *Value, v *Value) {
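        	// Grow the orig table (using the Func's cached slice allocator) so that
        	// it can be indexed by c.ID.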
   397  	if int(c.ID) >= cap(s.orig) {
   398  		x := s.f.Cache.allocValueSlice(int(c.ID) + 1)
   399  		copy(x, s.orig)
   400  		s.f.Cache.freeValueSlice(s.orig)
   401  		s.orig = x
   402  	}
   403  	for int(c.ID) >= len(s.orig) {
   404  		s.orig = append(s.orig, nil)
   405  	}
   406  	if s.orig[c.ID] != nil {
   407  		s.f.Fatalf("orig value set twice %s %s", c, v)
   408  	}
   409  	s.orig[c.ID] = s.orig[v.ID]
   410  }
   411  
   412  // assignReg assigns register r to hold c, a copy of v.
   413  // r must be unused.
   414  func (s *regAllocState) assignReg(r register, v *Value, c *Value) {
   415  	if s.f.pass.debug > regDebug {
   416  		fmt.Printf("assignReg %s %s/%s\n", &s.registers[r], v, c)
   417  	}
   418  	// Allocate v to r.
   419  	s.values[v.ID].regs |= regMask(1) << r
   420  	s.f.setHome(c, &s.registers[r])
   421  
   422  	// Allocate r to v.
   423  	if !s.allocatable.contains(r) && !s.isGReg(r) {
   424  		return
   425  	}
   426  	if s.regs[r].v != nil {
   427  		s.f.Fatalf("tried to assign register %d to %s/%s but it is already used by %s", r, v, c, s.regs[r].v)
   428  	}
   429  	s.regs[r] = regState{v, c}
   430  	s.used |= regMask(1) << r
   431  }
   432  
   433  // allocReg chooses a register from the set of registers in mask.
   434  // If there is no unused register, a Value will be kicked out of
   435  // a register to make room.
   436  func (s *regAllocState) allocReg(mask regMask, v *Value) register {
   437  	if v.OnWasmStack {
   438  		return noRegister
   439  	}
   440  
   441  	mask &= s.allocatable
   442  	mask &^= s.nospill
   443  	if mask == 0 {
   444  		s.f.Fatalf("no register available for %s", v.LongString())
   445  	}
   446  
   447  	// Pick an unused register if one is available.
   448  	if mask&^s.used != 0 {
   449  		r := pickReg(mask &^ s.used)
   450  		s.usedSinceBlockStart |= regMask(1) << r
   451  		return r
   452  	}
   453  
   454  	// Pick a value to spill. Spill the value with the
   455  	// farthest-in-the-future use.
   456  	// TODO: Prefer registers with already spilled Values?
   457  	// TODO: Modify preference using affinity graph.
   458  	// TODO: if a single value is in multiple registers, spill one of them
   459  	// before spilling a value in just a single register.
   460  
   461  	// Find a register to spill. We spill the register containing the value
   462  	// whose next use is as far in the future as possible.
   463  	// https://en.wikipedia.org/wiki/Page_replacement_algorithm#The_theoretically_optimal_page_replacement_algorithm
   464  	var r register
   465  	maxuse := int32(-1)
   466  	for t := register(0); t < s.numRegs; t++ {
   467  		if mask>>t&1 == 0 {
   468  			continue
   469  		}
   470  		v := s.regs[t].v
   471  		if n := s.values[v.ID].uses.dist; n > maxuse {
   472  			// v's next use is farther in the future than any value
   473  			// we've seen so far. A new best spill candidate.
   474  			r = t
   475  			maxuse = n
   476  		}
   477  	}
   478  	if maxuse == -1 {
   479  		s.f.Fatalf("couldn't find register to spill")
   480  	}
   481  
   482  	if s.f.Config.ctxt.Arch.Arch == sys.ArchWasm {
   483  		// TODO(neelance): In theory this should never happen, because all wasm registers are equal.
   484  		// So if there is still a free register, the allocation should have picked that one in the first place instead of
   485  		// trying to kick some other value out. In practice, this case does happen and it breaks the stack optimization.
   486  		s.freeReg(r)
   487  		return r
   488  	}
   489  
   490  	// Try to move it around before kicking out, if there is a free register.
   491  	// We generate a Copy and record it. It will be deleted if never used.
   492  	v2 := s.regs[r].v
   493  	m := s.compatRegs(v2.Type) &^ s.used &^ s.tmpused &^ (regMask(1) << r)
   494  	if m != 0 && !s.values[v2.ID].rematerializeable && countRegs(s.values[v2.ID].regs) == 1 {
   495  		s.usedSinceBlockStart |= regMask(1) << r
   496  		r2 := pickReg(m)
   497  		c := s.curBlock.NewValue1(v2.Pos, OpCopy, v2.Type, s.regs[r].c)
   498  		s.copies[c] = false
   499  		if s.f.pass.debug > regDebug {
   500  			fmt.Printf("copy %s to %s : %s\n", v2, c, &s.registers[r2])
   501  		}
   502  		s.setOrig(c, v2)
   503  		s.assignReg(r2, v2, c)
   504  	}
   505  
   506  	// If the evicted register isn't used between the start of the block
   507  	// and now then there is no reason to even request it on entry. We can
   508  	// drop from startRegs in that case.
   509  	if s.usedSinceBlockStart&(regMask(1)<<r) == 0 {
    510  		if s.startRegsMask&(regMask(1)<<r) != 0 {
   511  			if s.f.pass.debug > regDebug {
   512  				fmt.Printf("dropped from startRegs: %s\n", &s.registers[r])
   513  			}
   514  			s.startRegsMask &^= regMask(1) << r
   515  		}
   516  	}
   517  
   518  	s.freeReg(r)
   519  	s.usedSinceBlockStart |= regMask(1) << r
   520  	return r
   521  }
   522  
   523  // makeSpill returns a Value which represents the spilled value of v.
   524  // b is the block in which the spill is used.
   525  func (s *regAllocState) makeSpill(v *Value, b *Block) *Value {
   526  	vi := &s.values[v.ID]
   527  	if vi.spill != nil {
   528  		// Final block not known - keep track of subtree where restores reside.
   529  		vi.restoreMin = min(vi.restoreMin, s.sdom[b.ID].entry)
   530  		vi.restoreMax = max(vi.restoreMax, s.sdom[b.ID].exit)
   531  		return vi.spill
   532  	}
   533  	// Make a spill for v. We don't know where we want
   534  	// to put it yet, so we leave it blockless for now.
   535  	spill := s.f.newValueNoBlock(OpStoreReg, v.Type, v.Pos)
   536  	// We also don't know what the spill's arg will be.
   537  	// Leave it argless for now.
   538  	s.setOrig(spill, v)
   539  	vi.spill = spill
   540  	vi.restoreMin = s.sdom[b.ID].entry
   541  	vi.restoreMax = s.sdom[b.ID].exit
   542  	return spill
   543  }
   544  
   545  // allocValToReg allocates v to a register selected from regMask and
   546  // returns the register copy of v. Any previous user is kicked out and spilled
   547  // (if necessary). Load code is added at the current pc. If nospill is set the
   548  // allocated register is marked nospill so the assignment cannot be
   549  // undone until the caller allows it by clearing nospill. Returns a
   550  // *Value which is either v or a copy of v allocated to the chosen register.
   551  func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, pos src.XPos) *Value {
   552  	if s.f.Config.ctxt.Arch.Arch == sys.ArchWasm && v.rematerializeable() {
   553  		c := v.copyIntoWithXPos(s.curBlock, pos)
   554  		c.OnWasmStack = true
   555  		s.setOrig(c, v)
   556  		return c
   557  	}
   558  	if v.OnWasmStack {
   559  		return v
   560  	}
   561  
   562  	vi := &s.values[v.ID]
   563  	pos = pos.WithNotStmt()
   564  	// Check if v is already in a requested register.
   565  	if mask&vi.regs != 0 {
   566  		mask &= vi.regs
   567  		r := pickReg(mask)
   568  		if mask.contains(s.SPReg) {
   569  			// Prefer the stack pointer if it is allowed.
   570  			// (Needed because the op might have an Aux symbol
   571  			// that needs SP as its base.)
   572  			r = s.SPReg
   573  		}
   574  		if !s.allocatable.contains(r) {
   575  			return v // v is in a fixed register
   576  		}
   577  		if s.regs[r].v != v || s.regs[r].c == nil {
   578  			panic("bad register state")
   579  		}
   580  		if nospill {
   581  			s.nospill |= regMask(1) << r
   582  		}
   583  		s.usedSinceBlockStart |= regMask(1) << r
   584  		return s.regs[r].c
   585  	}
   586  
   587  	var r register
   588  	// If nospill is set, the value is used immediately, so it can live on the WebAssembly stack.
   589  	onWasmStack := nospill && s.f.Config.ctxt.Arch.Arch == sys.ArchWasm
   590  	if !onWasmStack {
   591  		// Allocate a register.
   592  		r = s.allocReg(mask, v)
   593  	}
   594  
   595  	// Allocate v to the new register.
   596  	var c *Value
   597  	if vi.regs != 0 {
   598  		// Copy from a register that v is already in.
   599  		r2 := pickReg(vi.regs)
   600  		var current *Value
   601  		if !s.allocatable.contains(r2) {
   602  			current = v // v is in a fixed register
   603  		} else {
   604  			if s.regs[r2].v != v {
   605  				panic("bad register state")
   606  			}
   607  			current = s.regs[r2].c
   608  		}
   609  		s.usedSinceBlockStart |= regMask(1) << r2
   610  		c = s.curBlock.NewValue1(pos, OpCopy, v.Type, current)
   611  	} else if v.rematerializeable() {
   612  		// Rematerialize instead of loading from the spill location.
   613  		c = v.copyIntoWithXPos(s.curBlock, pos)
   614  		// We need to consider its output mask and potentially issue a Copy
   615  		// if there are register mask conflicts.
    616  		// This currently happens only in the SIMD package, between GP and FP
    617  		// registers: Intel's vector extensions can put an integer value into an
    618  		// FP register, where it is seen as a vector (e.g. VPSLL[BWDQ]).
    619  		// Because the GP and FP masks do not overlap, mask & outputMask == 0
    620  		// detects this situation reliably.
   621  		sourceMask := s.regspec(c).outputs[0].regs
   622  		if mask&sourceMask == 0 && !onWasmStack {
   623  			s.setOrig(c, v)
   624  			s.assignReg(s.allocReg(sourceMask, v), v, c)
    625  			// v.Type for the new OpCopy is likely wrong, which may only surface at
    626  			// SSA-to-asm lowering, where the type might be needed to generate the
    627  			// right assembly for OpCopy. For Intel's GP-to-FP move it happens to
    628  			// work, because the MOV instruction has a variant for it. For other
    629  			// architectures or situations it is unclear, and the problem might show
    630  			// up as illegal instructions in the assembler.
    631  			// For now we still pick v.Type, because at least its size is correct
    632  			// for the rematerialization case the amd64 SIMD package exposed.
    633  			// TODO: figure out a way to find the correct type, or make the asm
    634  			// lowering use only reg info for OpCopy.
   635  			c = s.curBlock.NewValue1(pos, OpCopy, v.Type, c)
   636  		}
   637  	} else {
   638  		// Load v from its spill location.
   639  		spill := s.makeSpill(v, s.curBlock)
   640  		if s.f.pass.debug > logSpills {
   641  			s.f.Warnl(vi.spill.Pos, "load spill for %v from %v", v, spill)
   642  		}
   643  		c = s.curBlock.NewValue1(pos, OpLoadReg, v.Type, spill)
   644  	}
   645  
   646  	s.setOrig(c, v)
   647  
   648  	if onWasmStack {
   649  		c.OnWasmStack = true
   650  		return c
   651  	}
   652  
   653  	s.assignReg(r, v, c)
   654  	if c.Op == OpLoadReg && s.isGReg(r) {
   655  		s.f.Fatalf("allocValToReg.OpLoadReg targeting g: " + c.LongString())
   656  	}
   657  	if nospill {
   658  		s.nospill |= regMask(1) << r
   659  	}
   660  	return c
   661  }
   662  
    663  // isLeaf reports whether f performs no calls (other than tail calls).
   664  func isLeaf(f *Func) bool {
   665  	for _, b := range f.Blocks {
   666  		for _, v := range b.Values {
   667  			if v.Op.IsCall() && !v.Op.IsTailCall() {
   668  				// tail call is not counted as it does not save the return PC or need a frame
   669  				return false
   670  			}
   671  		}
   672  	}
   673  	return true
   674  }
   675  
   676  // needRegister reports whether v needs a register.
   677  func (v *Value) needRegister() bool {
   678  	return !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && !v.Type.IsTuple()
   679  }
   680  
   681  func (s *regAllocState) init(f *Func) {
   682  	s.f = f
   683  	s.f.RegAlloc = s.f.Cache.locs[:0]
   684  	s.registers = f.Config.registers
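        	// A register index must fit in a register value (a uint8, with 255
        	// reserved for noRegister) and in the bits of a regMask.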
   685  	if nr := len(s.registers); nr == 0 || nr > int(noRegister) || nr > int(unsafe.Sizeof(regMask(0))*8) {
   686  		s.f.Fatalf("bad number of registers: %d", nr)
   687  	} else {
   688  		s.numRegs = register(nr)
   689  	}
   690  	// Locate SP, SB, and g registers.
   691  	s.SPReg = noRegister
   692  	s.SBReg = noRegister
   693  	s.GReg = noRegister
   694  	s.ZeroIntReg = noRegister
   695  	for r := register(0); r < s.numRegs; r++ {
   696  		switch s.registers[r].String() {
   697  		case "SP":
   698  			s.SPReg = r
   699  		case "SB":
   700  			s.SBReg = r
   701  		case "g":
   702  			s.GReg = r
   703  		case "ZERO": // TODO: arch-specific?
   704  			s.ZeroIntReg = r
   705  		}
   706  	}
   707  	// Make sure we found all required registers.
   708  	switch noRegister {
   709  	case s.SPReg:
   710  		s.f.Fatalf("no SP register found")
   711  	case s.SBReg:
   712  		s.f.Fatalf("no SB register found")
   713  	case s.GReg:
   714  		if f.Config.hasGReg {
   715  			s.f.Fatalf("no g register found")
   716  		}
   717  	}
   718  
   719  	// Figure out which registers we're allowed to use.
   720  	s.allocatable = s.f.Config.gpRegMask | s.f.Config.fpRegMask | s.f.Config.specialRegMask
   721  	s.allocatable &^= 1 << s.SPReg
   722  	s.allocatable &^= 1 << s.SBReg
   723  	if s.f.Config.hasGReg {
   724  		s.allocatable &^= 1 << s.GReg
   725  	}
   726  	if s.ZeroIntReg != noRegister {
   727  		s.allocatable &^= 1 << s.ZeroIntReg
   728  	}
   729  	if buildcfg.FramePointerEnabled && s.f.Config.FPReg >= 0 {
   730  		s.allocatable &^= 1 << uint(s.f.Config.FPReg)
   731  	}
   732  	if s.f.Config.LinkReg != -1 {
   733  		if isLeaf(f) {
   734  			// Leaf functions don't save/restore the link register.
   735  			s.allocatable &^= 1 << uint(s.f.Config.LinkReg)
   736  		}
   737  	}
   738  	if s.f.Config.ctxt.Flag_dynlink {
   739  		switch s.f.Config.arch {
   740  		case "386":
   741  			// nothing to do.
   742  			// Note that for Flag_shared (position independent code)
   743  			// we do need to be careful, but that carefulness is hidden
   744  			// in the rewrite rules so we always have a free register
   745  			// available for global load/stores. See _gen/386.rules (search for Flag_shared).
   746  		case "amd64":
   747  			s.allocatable &^= 1 << 15 // R15
   748  		case "arm":
   749  			s.allocatable &^= 1 << 9 // R9
   750  		case "arm64":
   751  			// nothing to do
   752  		case "loong64": // R2 (aka TP) already reserved.
   753  			// nothing to do
   754  		case "ppc64le": // R2 already reserved.
   755  			// nothing to do
   756  		case "riscv64": // X3 (aka GP) and X4 (aka TP) already reserved.
   757  			// nothing to do
   758  		case "s390x":
   759  			s.allocatable &^= 1 << 11 // R11
   760  		default:
   761  			s.f.fe.Fatalf(src.NoXPos, "arch %s not implemented", s.f.Config.arch)
   762  		}
   763  	}
   764  
   765  	// Linear scan register allocation can be influenced by the order in which blocks appear.
   766  	// Decouple the register allocation order from the generated block order.
   767  	// This also creates an opportunity for experiments to find a better order.
   768  	s.visitOrder = layoutRegallocOrder(f)
   769  
   770  	// Compute block order. This array allows us to distinguish forward edges
   771  	// from backward edges and compute how far they go.
   772  	s.blockOrder = make([]int32, f.NumBlocks())
   773  	for i, b := range s.visitOrder {
   774  		s.blockOrder[b.ID] = int32(i)
   775  	}
   776  
   777  	s.regs = make([]regState, s.numRegs)
   778  	nv := f.NumValues()
   779  	if cap(s.f.Cache.regallocValues) >= nv {
   780  		s.f.Cache.regallocValues = s.f.Cache.regallocValues[:nv]
   781  	} else {
   782  		s.f.Cache.regallocValues = make([]valState, nv)
   783  	}
   784  	s.values = s.f.Cache.regallocValues
   785  	s.orig = s.f.Cache.allocValueSlice(nv)
   786  	s.copies = make(map[*Value]bool)
   787  	for _, b := range s.visitOrder {
   788  		for _, v := range b.Values {
   789  			if v.needRegister() {
   790  				s.values[v.ID].needReg = true
   791  				s.values[v.ID].rematerializeable = v.rematerializeable()
   792  				s.orig[v.ID] = v
   793  			}
   794  			// Note: needReg is false for values returning Tuple types.
   795  			// Instead, we mark the corresponding Selects as needReg.
   796  		}
   797  	}
   798  	s.computeLive()
   799  
   800  	s.endRegs = make([][]endReg, f.NumBlocks())
   801  	s.startRegs = make([][]startReg, f.NumBlocks())
   802  	s.spillLive = make([][]ID, f.NumBlocks())
   803  	s.sdom = f.Sdom()
   804  
   805  	// wasm: Mark instructions that can be optimized to have their values only on the WebAssembly stack.
   806  	if f.Config.ctxt.Arch.Arch == sys.ArchWasm {
   807  		canLiveOnStack := f.newSparseSet(f.NumValues())
   808  		defer f.retSparseSet(canLiveOnStack)
   809  		for _, b := range f.Blocks {
   810  			// New block. Clear candidate set.
   811  			canLiveOnStack.clear()
   812  			for _, c := range b.ControlValues() {
   813  				if c.Uses == 1 && !opcodeTable[c.Op].generic {
   814  					canLiveOnStack.add(c.ID)
   815  				}
   816  			}
   817  			// Walking backwards.
   818  			for i := len(b.Values) - 1; i >= 0; i-- {
   819  				v := b.Values[i]
   820  				if canLiveOnStack.contains(v.ID) {
   821  					v.OnWasmStack = true
   822  				} else {
   823  					// Value can not live on stack. Values are not allowed to be reordered, so clear candidate set.
   824  					canLiveOnStack.clear()
   825  				}
   826  				for _, arg := range v.Args {
   827  					// Value can live on the stack if:
   828  					// - it is only used once
   829  					// - it is used in the same basic block
   830  					// - it is not a "mem" value
   831  					// - it is a WebAssembly op
   832  					if arg.Uses == 1 && arg.Block == v.Block && !arg.Type.IsMemory() && !opcodeTable[arg.Op].generic {
   833  						canLiveOnStack.add(arg.ID)
   834  					}
   835  				}
   836  			}
   837  		}
   838  	}
   839  
   840  	// The clobberdeadreg experiment inserts code to clobber dead registers
   841  	// at call sites.
   842  	// Ignore huge functions to avoid doing too much work.
   843  	if base.Flag.ClobberDeadReg && len(s.f.Blocks) <= 10000 {
   844  		// TODO: honor GOCLOBBERDEADHASH, or maybe GOSSAHASH.
   845  		s.doClobber = true
   846  	}
   847  }
   848  
   849  func (s *regAllocState) close() {
   850  	s.f.Cache.freeValueSlice(s.orig)
   851  }
   852  
   853  // Adds a use record for id at distance dist from the start of the block.
   854  // All calls to addUse must happen with nonincreasing dist.
   855  func (s *regAllocState) addUse(id ID, dist int32, pos src.XPos) {
   856  	r := s.freeUseRecords
   857  	if r != nil {
   858  		s.freeUseRecords = r.next
   859  	} else {
   860  		r = &use{}
   861  	}
   862  	r.dist = dist
   863  	r.pos = pos
   864  	r.next = s.values[id].uses
   865  	s.values[id].uses = r
   866  	if r.next != nil && dist > r.next.dist {
   867  		s.f.Fatalf("uses added in wrong order")
   868  	}
   869  }
   870  
   871  // advanceUses advances the uses of v's args from the state before v to the state after v.
   872  // Any values which have no more uses are deallocated from registers.
   873  func (s *regAllocState) advanceUses(v *Value) {
   874  	for _, a := range v.Args {
   875  		if !s.values[a.ID].needReg {
   876  			continue
   877  		}
   878  		ai := &s.values[a.ID]
   879  		r := ai.uses
   880  		ai.uses = r.next
   881  		if r.next == nil || (!opcodeTable[a.Op].fixedReg && r.next.dist > s.nextCall[s.curIdx]) {
   882  			// Value is dead (or is not used again until after a call), free all registers that hold it.
   883  			s.freeRegs(ai.regs)
   884  		}
   885  		r.next = s.freeUseRecords
   886  		s.freeUseRecords = r
   887  	}
   888  	s.dropIfUnused(v)
   889  }
   890  
   891  // Drop v from registers if it isn't used again, or its only uses are after
   892  // a call instruction.
   893  func (s *regAllocState) dropIfUnused(v *Value) {
   894  	if !s.values[v.ID].needReg {
   895  		return
   896  	}
   897  	vi := &s.values[v.ID]
   898  	r := vi.uses
   899  	if r == nil || (!opcodeTable[v.Op].fixedReg && r.dist > s.nextCall[s.curIdx]) {
   900  		s.freeRegs(vi.regs)
   901  	}
   902  }
   903  
   904  // liveAfterCurrentInstruction reports whether v is live after
   905  // the current instruction is completed.  v must be used by the
   906  // current instruction.
   907  func (s *regAllocState) liveAfterCurrentInstruction(v *Value) bool {
   908  	u := s.values[v.ID].uses
   909  	if u == nil {
   910  		panic(fmt.Errorf("u is nil, v = %s, s.values[v.ID] = %v", v.LongString(), s.values[v.ID]))
   911  	}
   912  	d := u.dist
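        	// Skip past all uses at distance d; those are the uses by the current
        	// instruction itself.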
   913  	for u != nil && u.dist == d {
   914  		u = u.next
   915  	}
   916  	return u != nil && u.dist > d
   917  }
   918  
   919  // Sets the state of the registers to that encoded in regs.
   920  func (s *regAllocState) setState(regs []endReg) {
   921  	s.freeRegs(s.used)
   922  	for _, x := range regs {
   923  		s.assignReg(x.r, x.v, x.c)
   924  	}
   925  }
   926  
   927  // compatRegs returns the set of registers which can store a type t.
   928  func (s *regAllocState) compatRegs(t *types.Type) regMask {
   929  	var m regMask
   930  	if t.IsTuple() || t.IsFlags() {
   931  		return 0
   932  	}
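        	// 128-bit integer values are stored in floating-point/vector registers,
        	// so they use the FP register masks below.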
   933  	if t.IsFloat() || t == types.TypeInt128 {
   934  		if t.Kind() == types.TFLOAT32 && s.f.Config.fp32RegMask != 0 {
   935  			m = s.f.Config.fp32RegMask
   936  		} else if t.Kind() == types.TFLOAT64 && s.f.Config.fp64RegMask != 0 {
   937  			m = s.f.Config.fp64RegMask
   938  		} else {
   939  			m = s.f.Config.fpRegMask
   940  		}
   941  	} else {
   942  		m = s.f.Config.gpRegMask
   943  	}
   944  	return m & s.allocatable
   945  }
   946  
   947  // regspec returns the regInfo for operation op.
   948  func (s *regAllocState) regspec(v *Value) regInfo {
   949  	op := v.Op
   950  	if op == OpConvert {
   951  		// OpConvert is a generic op, so it doesn't have a
   952  		// register set in the static table. It can use any
   953  		// allocatable integer register.
   954  		m := s.allocatable & s.f.Config.gpRegMask
   955  		return regInfo{inputs: []inputInfo{{regs: m}}, outputs: []outputInfo{{regs: m}}}
   956  	}
   957  	if op == OpArgIntReg {
   958  		reg := v.Block.Func.Config.intParamRegs[v.AuxInt8()]
   959  		return regInfo{outputs: []outputInfo{{regs: 1 << uint(reg)}}}
   960  	}
   961  	if op == OpArgFloatReg {
   962  		reg := v.Block.Func.Config.floatParamRegs[v.AuxInt8()]
   963  		return regInfo{outputs: []outputInfo{{regs: 1 << uint(reg)}}}
   964  	}
   965  	if op.IsCall() {
   966  		if ac, ok := v.Aux.(*AuxCall); ok && ac.reg != nil {
   967  			return *ac.Reg(&opcodeTable[op].reg, s.f.Config)
   968  		}
   969  	}
   970  	if op == OpMakeResult && s.f.OwnAux.reg != nil {
   971  		return *s.f.OwnAux.ResultReg(s.f.Config)
   972  	}
   973  	return opcodeTable[op].reg
   974  }
   975  
   976  func (s *regAllocState) isGReg(r register) bool {
   977  	return s.f.Config.hasGReg && s.GReg == r
   978  }
   979  
   980  // Dummy value used to represent the value being held in a temporary register.
   981  var tmpVal Value
   982  
   983  func (s *regAllocState) regalloc(f *Func) {
   984  	regValLiveSet := f.newSparseSet(f.NumValues()) // set of values that may be live in register
   985  	defer f.retSparseSet(regValLiveSet)
   986  	var oldSched []*Value
   987  	var phis []*Value
   988  	var phiRegs []register
   989  	var args []*Value
   990  
   991  	// Data structure used for computing desired registers.
   992  	var desired desiredState
   993  	desiredSecondReg := map[ID][4]register{} // desired register allocation for 2nd part of a tuple
   994  
   995  	// Desired registers for inputs & outputs for each instruction in the block.
   996  	type dentry struct {
   997  		out [4]register    // desired output registers
   998  		in  [3][4]register // desired input registers (for inputs 0,1, and 2)
   999  	}
  1000  	var dinfo []dentry
  1001  
  1002  	if f.Entry != f.Blocks[0] {
  1003  		f.Fatalf("entry block must be first")
  1004  	}
  1005  
  1006  	for _, b := range s.visitOrder {
  1007  		if s.f.pass.debug > regDebug {
  1008  			fmt.Printf("Begin processing block %v\n", b)
  1009  		}
  1010  		s.curBlock = b
  1011  		s.startRegsMask = 0
  1012  		s.usedSinceBlockStart = 0
  1013  		clear(desiredSecondReg)
  1014  
  1015  		// Initialize regValLiveSet and uses fields for this block.
  1016  		// Walk backwards through the block doing liveness analysis.
  1017  		regValLiveSet.clear()
  1018  		if s.live != nil {
  1019  			for _, e := range s.live[b.ID] {
  1020  				s.addUse(e.ID, int32(len(b.Values))+e.dist, e.pos) // pseudo-uses from beyond end of block
  1021  				regValLiveSet.add(e.ID)
  1022  			}
  1023  		}
  1024  		for _, v := range b.ControlValues() {
  1025  			if s.values[v.ID].needReg {
  1026  				s.addUse(v.ID, int32(len(b.Values)), b.Pos) // pseudo-use by control values
  1027  				regValLiveSet.add(v.ID)
  1028  			}
  1029  		}
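        		// Make sure the reusable nextCall scratch slice has an entry for every
        		// value in this block.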
  1030  		if len(s.nextCall) < len(b.Values) {
  1031  			s.nextCall = append(s.nextCall, make([]int32, len(b.Values)-len(s.nextCall))...)
  1032  		}
  1033  		var nextCall int32 = math.MaxInt32
  1034  		for i := len(b.Values) - 1; i >= 0; i-- {
  1035  			v := b.Values[i]
  1036  			regValLiveSet.remove(v.ID)
  1037  			if v.Op == OpPhi {
  1038  				// Remove v from the live set, but don't add
  1039  				// any inputs. This is the state the len(b.Preds)>1
  1040  				// case below desires; it wants to process phis specially.
  1041  				s.nextCall[i] = nextCall
  1042  				continue
  1043  			}
  1044  			if opcodeTable[v.Op].call {
  1045  				// Function call clobbers all the registers but SP and SB.
  1046  				regValLiveSet.clear()
  1047  				if s.sp != 0 && s.values[s.sp].uses != nil {
  1048  					regValLiveSet.add(s.sp)
  1049  				}
  1050  				if s.sb != 0 && s.values[s.sb].uses != nil {
  1051  					regValLiveSet.add(s.sb)
  1052  				}
  1053  				nextCall = int32(i)
  1054  			}
  1055  			for _, a := range v.Args {
  1056  				if !s.values[a.ID].needReg {
  1057  					continue
  1058  				}
  1059  				s.addUse(a.ID, int32(i), v.Pos)
  1060  				regValLiveSet.add(a.ID)
  1061  			}
  1062  			s.nextCall[i] = nextCall
  1063  		}
  1064  		if s.f.pass.debug > regDebug {
  1065  			fmt.Printf("use distances for %s\n", b)
  1066  			for i := range s.values {
  1067  				vi := &s.values[i]
  1068  				u := vi.uses
  1069  				if u == nil {
  1070  					continue
  1071  				}
  1072  				fmt.Printf("  v%d:", i)
  1073  				for u != nil {
  1074  					fmt.Printf(" %d", u.dist)
  1075  					u = u.next
  1076  				}
  1077  				fmt.Println()
  1078  			}
  1079  		}
  1080  
  1081  		// Make a copy of the block schedule so we can generate a new one in place.
  1082  		// We make a separate copy for phis and regular values.
  1083  		nphi := 0
  1084  		for _, v := range b.Values {
  1085  			if v.Op != OpPhi {
  1086  				break
  1087  			}
  1088  			nphi++
  1089  		}
  1090  		phis = append(phis[:0], b.Values[:nphi]...)
  1091  		oldSched = append(oldSched[:0], b.Values[nphi:]...)
  1092  		b.Values = b.Values[:0]
  1093  
  1094  		// Initialize start state of block.
  1095  		if b == f.Entry {
  1096  			// Regalloc state is empty to start.
  1097  			if nphi > 0 {
  1098  				f.Fatalf("phis in entry block")
  1099  			}
  1100  		} else if len(b.Preds) == 1 {
  1101  			// Start regalloc state with the end state of the previous block.
  1102  			s.setState(s.endRegs[b.Preds[0].b.ID])
  1103  			if nphi > 0 {
  1104  				f.Fatalf("phis in single-predecessor block")
  1105  			}
  1106  			// Drop any values which are no longer live.
  1107  			// This may happen because at the end of p, a value may be
  1108  			// live but only used by some other successor of p.
  1109  			for r := register(0); r < s.numRegs; r++ {
  1110  				v := s.regs[r].v
  1111  				if v != nil && !regValLiveSet.contains(v.ID) {
  1112  					s.freeReg(r)
  1113  				}
  1114  			}
  1115  		} else {
  1116  			// This is the complicated case. We have more than one predecessor,
  1117  			// which means we may have Phi ops.
  1118  
  1119  			// Start with the final register state of the predecessor with least spill values.
  1120  			// This is based on the following points:
  1121  			// 1. Fewer live spills indicate lower register pressure on that path,
  1122  			//    so this block's values are more likely to be allocated to registers.
  1123  			// 2. Avoid predecessors that contain function calls, because a
  1124  			//    predecessor containing a call usually generates many spills and
  1125  			//    loses the previous allocation state.
  1126  			// TODO: Improve this part. At least the size of endRegs of the predecessor also has
  1127  			// an impact on the code size and compiler speed. But it is not easy to find a simple
  1128  			// and efficient method that combines multiple factors.
  1129  			idx := -1
  1130  			for i, p := range b.Preds {
  1131  				// If the predecessor has not been visited yet, skip it because its end state
  1132  				// (endRegs and spillLive) has not been computed yet.
  1133  				pb := p.b
  1134  				if s.blockOrder[pb.ID] >= s.blockOrder[b.ID] {
  1135  					continue
  1136  				}
  1137  				if idx == -1 {
  1138  					idx = i
  1139  					continue
  1140  				}
  1141  				pSel := b.Preds[idx].b
  1142  				if len(s.spillLive[pb.ID]) < len(s.spillLive[pSel.ID]) {
  1143  					idx = i
  1144  				} else if len(s.spillLive[pb.ID]) == len(s.spillLive[pSel.ID]) {
  1145  					// Use a bit of likeliness information. After the critical pass, pb and pSel
  1146  					// must be plain blocks, so check edge pb->pb.Preds instead of edge pb->b.
  1147  					// TODO: improve the prediction of the likely predecessor. The following
  1148  					// method is only suitable for the simplest cases. For complex cases,
  1149  					// the prediction may be inaccurate, but this does not affect the
  1150  					// correctness of the program.
  1151  					// According to the layout algorithm, the predecessor with the
  1152  					// smaller blockOrder is the true branch, and the test results show
  1153  					// that it is better to choose the predecessor with a smaller
  1154  					// blockOrder than to make no choice at all.
  1155  					if pb.likelyBranch() && !pSel.likelyBranch() || s.blockOrder[pb.ID] < s.blockOrder[pSel.ID] {
  1156  						idx = i
  1157  					}
  1158  				}
  1159  			}
  1160  			if idx < 0 {
  1161  				f.Fatalf("bad visitOrder, no predecessor of %s has been visited before it", b)
  1162  			}
  1163  			p := b.Preds[idx].b
  1164  			s.setState(s.endRegs[p.ID])
  1165  
  1166  			if s.f.pass.debug > regDebug {
  1167  				fmt.Printf("starting merge block %s with end state of %s:\n", b, p)
  1168  				for _, x := range s.endRegs[p.ID] {
  1169  					fmt.Printf("  %s: orig:%s cache:%s\n", &s.registers[x.r], x.v, x.c)
  1170  				}
  1171  			}
  1172  
  1173  			// Decide on registers for phi ops. Use the registers determined
  1174  			// by the primary predecessor if we can.
  1175  			// TODO: pick best of (already processed) predecessors?
  1176  			// Majority vote? Deepest nesting level?
  1177  			phiRegs = phiRegs[:0]
  1178  			var phiUsed regMask
  1179  
  1180  			for _, v := range phis {
  1181  				if !s.values[v.ID].needReg {
  1182  					phiRegs = append(phiRegs, noRegister)
  1183  					continue
  1184  				}
  1185  				a := v.Args[idx]
  1186  				// Some instructions target not-allocatable registers.
  1187  				// They're not suitable for further (phi-function) allocation.
  1188  				m := s.values[a.ID].regs &^ phiUsed & s.allocatable
  1189  				if m != 0 {
  1190  					r := pickReg(m)
  1191  					phiUsed |= regMask(1) << r
  1192  					phiRegs = append(phiRegs, r)
  1193  				} else {
  1194  					phiRegs = append(phiRegs, noRegister)
  1195  				}
  1196  			}
  1197  
  1198  			// Second pass - deallocate all in-register phi inputs.
  1199  			for i, v := range phis {
  1200  				if !s.values[v.ID].needReg {
  1201  					continue
  1202  				}
  1203  				a := v.Args[idx]
  1204  				r := phiRegs[i]
  1205  				if r == noRegister {
  1206  					continue
  1207  				}
  1208  				if regValLiveSet.contains(a.ID) {
  1209  					// Input value is still live (it is used by something other than Phi).
  1210  					// Try to move it around before kicking out, if there is a free register.
  1211  					// We generate a Copy in the predecessor block and record it. It will be
  1212  					// deleted later if never used.
  1213  					//
  1214  					// Pick a free register. At this point some registers used in the predecessor
  1215  					// block may have been deallocated. Those are the ones used for Phis. Exclude
  1216  					// them (and they are not going to be helpful anyway).
  1217  					m := s.compatRegs(a.Type) &^ s.used &^ phiUsed
  1218  					if m != 0 && !s.values[a.ID].rematerializeable && countRegs(s.values[a.ID].regs) == 1 {
  1219  						r2 := pickReg(m)
  1220  						c := p.NewValue1(a.Pos, OpCopy, a.Type, s.regs[r].c)
  1221  						s.copies[c] = false
  1222  						if s.f.pass.debug > regDebug {
  1223  							fmt.Printf("copy %s to %s : %s\n", a, c, &s.registers[r2])
  1224  						}
  1225  						s.setOrig(c, a)
  1226  						s.assignReg(r2, a, c)
  1227  						s.endRegs[p.ID] = append(s.endRegs[p.ID], endReg{r2, a, c})
  1228  					}
  1229  				}
  1230  				s.freeReg(r)
  1231  			}
  1232  
  1233  			// Copy phi ops into new schedule.
  1234  			b.Values = append(b.Values, phis...)
  1235  
  1236  			// Third pass - pick registers for phis whose input
  1237  			// was not in a register in the primary predecessor.
  1238  			for i, v := range phis {
  1239  				if !s.values[v.ID].needReg {
  1240  					continue
  1241  				}
  1242  				if phiRegs[i] != noRegister {
  1243  					continue
  1244  				}
  1245  				m := s.compatRegs(v.Type) &^ phiUsed &^ s.used
  1246  				// If one of the other inputs of v is in a register, and the register is available,
  1247  				// select this register, which can save some unnecessary copies.
  1248  				for i, pe := range b.Preds {
  1249  					if i == idx {
  1250  						continue
  1251  					}
  1252  					ri := noRegister
  1253  					for _, er := range s.endRegs[pe.b.ID] {
  1254  						if er.v == s.orig[v.Args[i].ID] {
  1255  							ri = er.r
  1256  							break
  1257  						}
  1258  					}
  1259  					if ri != noRegister && m>>ri&1 != 0 {
  1260  						m = regMask(1) << ri
  1261  						break
  1262  					}
  1263  				}
  1264  				if m != 0 {
  1265  					r := pickReg(m)
  1266  					phiRegs[i] = r
  1267  					phiUsed |= regMask(1) << r
  1268  				}
  1269  			}
  1270  
  1271  			// Set registers for phis. Add phi spill code.
  1272  			for i, v := range phis {
  1273  				if !s.values[v.ID].needReg {
  1274  					continue
  1275  				}
  1276  				r := phiRegs[i]
  1277  				if r == noRegister {
  1278  					// stack-based phi
  1279  					// Spills will be inserted in all the predecessors below.
  1280  					s.values[v.ID].spill = v // v starts life spilled
  1281  					continue
  1282  				}
  1283  				// register-based phi
  1284  				s.assignReg(r, v, v)
  1285  			}
  1286  
  1287  			// Deallocate any values which are no longer live. Phis are excluded.
  1288  			for r := register(0); r < s.numRegs; r++ {
  1289  				if phiUsed>>r&1 != 0 {
  1290  					continue
  1291  				}
  1292  				v := s.regs[r].v
  1293  				if v != nil && !regValLiveSet.contains(v.ID) {
  1294  					s.freeReg(r)
  1295  				}
  1296  			}
  1297  
  1298  			// Save the starting state for use by merge edges.
  1299  			// We append to a stack allocated variable that we'll
  1300  			// later copy into s.startRegs in one fell swoop, to save
  1301  			// on allocations.
  1302  			regList := make([]startReg, 0, 32)
  1303  			for r := register(0); r < s.numRegs; r++ {
  1304  				v := s.regs[r].v
  1305  				if v == nil {
  1306  					continue
  1307  				}
  1308  				if phiUsed>>r&1 != 0 {
  1309  					// Skip registers that phis used, we'll handle those
  1310  					// specially during merge edge processing.
  1311  					continue
  1312  				}
  1313  				regList = append(regList, startReg{r, v, s.regs[r].c, s.values[v.ID].uses.pos})
  1314  				s.startRegsMask |= regMask(1) << r
  1315  			}
  1316  			s.startRegs[b.ID] = make([]startReg, len(regList))
  1317  			copy(s.startRegs[b.ID], regList)
  1318  
  1319  			if s.f.pass.debug > regDebug {
  1320  				fmt.Printf("after phis\n")
  1321  				for _, x := range s.startRegs[b.ID] {
  1322  					fmt.Printf("  %s: v%d\n", &s.registers[x.r], x.v.ID)
  1323  				}
  1324  			}
  1325  		}
  1326  
  1327  		// Drop phis from registers if they immediately go dead.
  1328  		for i, v := range phis {
  1329  			s.curIdx = i
  1330  			s.dropIfUnused(v)
  1331  		}
  1332  
  1333  		// Allocate space to record the desired registers for each value.
  1334  		if l := len(oldSched); cap(dinfo) < l {
  1335  			dinfo = make([]dentry, l)
  1336  		} else {
  1337  			dinfo = dinfo[:l]
  1338  			clear(dinfo)
  1339  		}
  1340  
  1341  		// Load static desired register info at the end of the block.
  1342  		if s.desired != nil {
  1343  			desired.copy(&s.desired[b.ID])
  1344  		}
  1345  
  1346  		// Check actual assigned registers at the start of the next block(s).
  1347  		// Dynamically assigned registers will trump the static
  1348  		// desired registers computed during liveness analysis.
  1349  		// Note that we do this phase after startRegs is set above, so that
  1350  		// we get the right behavior for a block which branches to itself.
  1351  		for _, e := range b.Succs {
  1352  			succ := e.b
  1353  			// TODO: prioritize likely successor?
  1354  			for _, x := range s.startRegs[succ.ID] {
  1355  				desired.add(x.v.ID, x.r)
  1356  			}
  1357  			// Process phi ops in succ.
  1358  			pidx := e.i
  1359  			for _, v := range succ.Values {
  1360  				if v.Op != OpPhi {
  1361  					break
  1362  				}
  1363  				if !s.values[v.ID].needReg {
  1364  					continue
  1365  				}
  1366  				rp, ok := s.f.getHome(v.ID).(*Register)
  1367  				if !ok {
  1368  					// If v is not assigned a register, pick a register assigned to one of v's inputs.
  1369  					// Hopefully v will get assigned that register later.
  1370  					// If the inputs have allocated register information, add it to desired,
  1371  					// which may reduce spill or copy operations when the register is available.
  1372  					for _, a := range v.Args {
  1373  						rp, ok = s.f.getHome(a.ID).(*Register)
  1374  						if ok {
  1375  							break
  1376  						}
  1377  					}
  1378  					if !ok {
  1379  						continue
  1380  					}
  1381  				}
  1382  				desired.add(v.Args[pidx].ID, register(rp.num))
  1383  			}
  1384  		}
  1385  		// Walk values backwards computing desired register info.
  1386  		// See computeDesired for more comments.
  1387  		for i := len(oldSched) - 1; i >= 0; i-- {
  1388  			v := oldSched[i]
  1389  			prefs := desired.remove(v.ID)
  1390  			regspec := s.regspec(v)
  1391  			desired.clobber(regspec.clobbers)
  1392  			for _, j := range regspec.inputs {
  1393  				if countRegs(j.regs) != 1 {
  1394  					continue
  1395  				}
  1396  				desired.clobber(j.regs)
  1397  				desired.add(v.Args[j.idx].ID, pickReg(j.regs))
  1398  			}
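        			// For ops that must produce their result in the same register as an
        			// input (plus a few ops treated similarly), propagate the desired
        			// output registers back to that input.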
  1399  			if opcodeTable[v.Op].resultInArg0 || v.Op == OpAMD64ADDQconst || v.Op == OpAMD64ADDLconst || v.Op == OpSelect0 {
  1400  				if opcodeTable[v.Op].commutative {
  1401  					desired.addList(v.Args[1].ID, prefs)
  1402  				}
  1403  				desired.addList(v.Args[0].ID, prefs)
  1404  			}
  1405  			// Save desired registers for this value.
  1406  			dinfo[i].out = prefs
  1407  			for j, a := range v.Args {
  1408  				if j >= len(dinfo[i].in) {
  1409  					break
  1410  				}
  1411  				dinfo[i].in[j] = desired.get(a.ID)
  1412  			}
  1413  			if v.Op == OpSelect1 && prefs[0] != noRegister {
  1414  				// Save desired registers of select1 for
  1415  				// use by the tuple generating instruction.
  1416  				desiredSecondReg[v.Args[0].ID] = prefs
  1417  			}
  1418  		}
  1419  
  1420  		// Process all the non-phi values.
  1421  		for idx, v := range oldSched {
  1422  			s.curIdx = nphi + idx
  1423  			tmpReg := noRegister
  1424  			if s.f.pass.debug > regDebug {
  1425  				fmt.Printf("  processing %s\n", v.LongString())
  1426  			}
  1427  			regspec := s.regspec(v)
  1428  			if v.Op == OpPhi {
  1429  				f.Fatalf("phi %s not at start of block", v)
  1430  			}
  1431  			if opcodeTable[v.Op].fixedReg {
  1432  				switch v.Op {
  1433  				case OpSP:
  1434  					s.assignReg(s.SPReg, v, v)
  1435  					s.sp = v.ID
  1436  				case OpSB:
  1437  					s.assignReg(s.SBReg, v, v)
  1438  					s.sb = v.ID
  1439  				case OpARM64ZERO, OpLOONG64ZERO, OpMIPS64ZERO:
  1440  					s.assignReg(s.ZeroIntReg, v, v)
  1441  				default:
  1442  					f.Fatalf("unknown fixed-register op %s", v)
  1443  				}
  1444  				b.Values = append(b.Values, v)
  1445  				s.advanceUses(v)
  1446  				continue
  1447  			}
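        			// Selector ops don't allocate a fresh register; they take the register
        			// already assigned to the corresponding element of their argument's
        			// tuple or results location.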
  1448  			if v.Op == OpSelect0 || v.Op == OpSelect1 || v.Op == OpSelectN {
  1449  				if s.values[v.ID].needReg {
  1450  					if v.Op == OpSelectN {
  1451  						s.assignReg(register(s.f.getHome(v.Args[0].ID).(LocResults)[int(v.AuxInt)].(*Register).num), v, v)
  1452  					} else {
  1453  						var i = 0
  1454  						if v.Op == OpSelect1 {
  1455  							i = 1
  1456  						}
  1457  						s.assignReg(register(s.f.getHome(v.Args[0].ID).(LocPair)[i].(*Register).num), v, v)
  1458  					}
  1459  				}
  1460  				b.Values = append(b.Values, v)
  1461  				s.advanceUses(v)
  1462  				continue
  1463  			}
  1464  			if v.Op == OpGetG && s.f.Config.hasGReg {
  1465  				// use hardware g register
  1466  				if s.regs[s.GReg].v != nil {
  1467  					s.freeReg(s.GReg) // kick out the old value
  1468  				}
  1469  				s.assignReg(s.GReg, v, v)
  1470  				b.Values = append(b.Values, v)
  1471  				s.advanceUses(v)
  1472  				continue
  1473  			}
  1474  			if v.Op == OpArg {
  1475  				// Args are "pre-spilled" values. We don't allocate
  1476  				// any register here. We just set the spill pointer to point
  1477  				// at v itself; any later user will issue a restore to use it.
  1478  				s.values[v.ID].spill = v
  1479  				b.Values = append(b.Values, v)
  1480  				s.advanceUses(v)
  1481  				continue
  1482  			}
  1483  			if v.Op == OpKeepAlive {
  1484  				// Make sure the argument to v is still live here.
  1485  				s.advanceUses(v)
  1486  				a := v.Args[0]
  1487  				vi := &s.values[a.ID]
  1488  				if vi.regs == 0 && !vi.rematerializeable {
  1489  					// Use the spill location.
  1490  					// This forces later liveness analysis to make the
  1491  					// value live at this point.
  1492  					v.SetArg(0, s.makeSpill(a, b))
  1493  				} else if _, ok := a.Aux.(*ir.Name); ok && vi.rematerializeable {
  1494  					// Rematerializeable value with a *ir.Name. This is the address of
  1495  					// a stack object (e.g. an LEAQ). Keep the object live.
  1496  					// Change it to VarLive, which is what plive expects for locals.
  1497  					v.Op = OpVarLive
  1498  					v.SetArgs1(v.Args[1])
  1499  					v.Aux = a.Aux
  1500  				} else {
  1501  					// In-register and rematerializeable values are already live.
  1502  					// These are typically rematerializeable constants like nil,
  1503  					// or values of a variable that were modified since the last call.
  1504  					v.Op = OpCopy
  1505  					v.SetArgs1(v.Args[1])
  1506  				}
  1507  				b.Values = append(b.Values, v)
  1508  				continue
  1509  			}
  1510  			if len(regspec.inputs) == 0 && len(regspec.outputs) == 0 {
  1511  				// No register allocation required (or none specified yet)
  1512  				if s.doClobber && v.Op.IsCall() {
  1513  					s.clobberRegs(regspec.clobbers)
  1514  				}
  1515  				s.freeRegs(regspec.clobbers)
  1516  				b.Values = append(b.Values, v)
  1517  				s.advanceUses(v)
  1518  				continue
  1519  			}
  1520  
  1521  			if s.values[v.ID].rematerializeable {
  1522  				// Value is rematerializeable, don't issue it here.
  1523  				// It will get issued just before each use (see
  1524  				// allocValueToReg).
  1525  				for _, a := range v.Args {
  1526  					a.Uses--
  1527  				}
  1528  				s.advanceUses(v)
  1529  				continue
  1530  			}
  1531  
  1532  			if s.f.pass.debug > regDebug {
  1533  				fmt.Printf("value %s\n", v.LongString())
  1534  				fmt.Printf("  out:")
  1535  				for _, r := range dinfo[idx].out {
  1536  					if r != noRegister {
  1537  						fmt.Printf(" %s", &s.registers[r])
  1538  					}
  1539  				}
  1540  				fmt.Println()
  1541  				for i := 0; i < len(v.Args) && i < 3; i++ {
  1542  					fmt.Printf("  in%d:", i)
  1543  					for _, r := range dinfo[idx].in[i] {
  1544  						if r != noRegister {
  1545  							fmt.Printf(" %s", &s.registers[r])
  1546  						}
  1547  					}
  1548  					fmt.Println()
  1549  				}
  1550  			}
  1551  
  1552  			// Move arguments to registers.
  1553  			// First, if an arg must be in a specific register and it is already
  1554  			// in place, keep it.
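        			// args is reused across values; reset it to one nil entry per argument,
        			// to be filled in as each input is placed in a register.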
  1555  			args = append(args[:0], make([]*Value, len(v.Args))...)
  1556  			for i, a := range v.Args {
  1557  				if !s.values[a.ID].needReg {
  1558  					args[i] = a
  1559  				}
  1560  			}
  1561  			for _, i := range regspec.inputs {
  1562  				mask := i.regs
  1563  				if countRegs(mask) == 1 && mask&s.values[v.Args[i.idx].ID].regs != 0 {
  1564  					args[i.idx] = s.allocValToReg(v.Args[i.idx], mask, true, v.Pos)
  1565  				}
  1566  			}
  1567  			// Then, if an arg must be in a specific register and that
  1568  			// register is free, allocate that one. Otherwise when processing
  1569  			// another input we may kick a value into the free register, which
  1570  			// then will be kicked out again.
  1571  			// This is a common case for passing-in-register arguments for
  1572  			// function calls.
  1573  			for {
  1574  				freed := false
  1575  				for _, i := range regspec.inputs {
  1576  					if args[i.idx] != nil {
  1577  						continue // already allocated
  1578  					}
  1579  					mask := i.regs
  1580  					if countRegs(mask) == 1 && mask&^s.used != 0 {
  1581  						args[i.idx] = s.allocValToReg(v.Args[i.idx], mask, true, v.Pos)
  1582  						// If the input is in other registers that will be clobbered by v,
  1583  						// or the input is dead, free the registers. This may make room
  1584  						// for other inputs.
  1585  						oldregs := s.values[v.Args[i.idx].ID].regs
  1586  						if oldregs&^regspec.clobbers == 0 || !s.liveAfterCurrentInstruction(v.Args[i.idx]) {
  1587  							s.freeRegs(oldregs &^ mask &^ s.nospill)
  1588  							freed = true
  1589  						}
  1590  					}
  1591  				}
  1592  				if !freed {
  1593  					break
  1594  				}
  1595  			}
  1596  			// Last, allocate remaining ones, in an ordering defined
  1597  			// by the register specification (most constrained first).
  1598  			for _, i := range regspec.inputs {
  1599  				if args[i.idx] != nil {
  1600  					continue // already allocated
  1601  				}
  1602  				mask := i.regs
  1603  				if mask&s.values[v.Args[i.idx].ID].regs == 0 {
  1604  					// Need a new register for the input.
  1605  					mask &= s.allocatable
  1606  					mask &^= s.nospill
  1607  					// Use the desired register if available.
  1608  					if i.idx < 3 {
  1609  						for _, r := range dinfo[idx].in[i.idx] {
  1610  							if r != noRegister && (mask&^s.used)>>r&1 != 0 {
  1611  								// Desired register is allowed and unused.
  1612  								mask = regMask(1) << r
  1613  								break
  1614  							}
  1615  						}
  1616  					}
  1617  					// Avoid registers we're saving for other values.
  1618  					if mask&^desired.avoid != 0 {
  1619  						mask &^= desired.avoid
  1620  					}
  1621  				}
  1622  				if mask&s.values[v.Args[i.idx].ID].regs&(1<<s.SPReg) != 0 {
  1623  					// Prefer SP register. This ensures that local variables
  1624  					// use SP as their base register (instead of a copy of the
  1625  					// stack pointer living in another register). See issue 74836.
  1626  					mask = 1 << s.SPReg
  1627  				}
  1628  				args[i.idx] = s.allocValToReg(v.Args[i.idx], mask, true, v.Pos)
  1629  			}
  1630  
  1631  			// If the output clobbers the input register, make sure we have
  1632  			// at least two copies of the input value so we don't
  1633  			// have to reload the value from the spill location.
  1634  			if opcodeTable[v.Op].resultInArg0 {
  1635  				var m regMask
  1636  				if !s.liveAfterCurrentInstruction(v.Args[0]) {
  1637  					// arg0 is dead.  We can clobber its register.
  1638  					goto ok
  1639  				}
  1640  				if opcodeTable[v.Op].commutative && !s.liveAfterCurrentInstruction(v.Args[1]) {
  1641  					args[0], args[1] = args[1], args[0]
  1642  					goto ok
  1643  				}
  1644  				if s.values[v.Args[0].ID].rematerializeable {
  1645  					// We can rematerialize the input, don't worry about clobbering it.
  1646  					goto ok
  1647  				}
  1648  				if opcodeTable[v.Op].commutative && s.values[v.Args[1].ID].rematerializeable {
  1649  					args[0], args[1] = args[1], args[0]
  1650  					goto ok
  1651  				}
  1652  				if countRegs(s.values[v.Args[0].ID].regs) >= 2 {
  1653  					// we have at least 2 copies of arg0.  We can afford to clobber one.
  1654  					goto ok
  1655  				}
  1656  				if opcodeTable[v.Op].commutative && countRegs(s.values[v.Args[1].ID].regs) >= 2 {
  1657  					args[0], args[1] = args[1], args[0]
  1658  					goto ok
  1659  				}
  1660  
  1661  				// We can't overwrite arg0 (or arg1, if commutative).  So we
  1662  				// need to make a copy of an input so we have a register we can modify.
  1663  
  1664  				// Possible new registers to copy into.
  1665  				m = s.compatRegs(v.Args[0].Type) &^ s.used
  1666  				if m == 0 {
  1667  					// No free registers.  In this case we'll just clobber
  1668  					// an input and future uses of that input must use a restore.
  1669  					// TODO(khr): We should really do this like allocReg does it,
  1670  					// spilling the value with the most distant next use.
  1671  					goto ok
  1672  				}
  1673  
  1674  				// Try to move an input to the desired output, if allowed.
  1675  				for _, r := range dinfo[idx].out {
  1676  					if r != noRegister && (m&regspec.outputs[0].regs)>>r&1 != 0 {
  1677  						m = regMask(1) << r
  1678  						args[0] = s.allocValToReg(v.Args[0], m, true, v.Pos)
  1679  						// Note: we update args[0] so the instruction will
  1680  						// use the register copy we just made.
  1681  						goto ok
  1682  					}
  1683  				}
  1684  				// Try to copy input to its desired location & use its old
  1685  				// location as the result register.
  1686  				for _, r := range dinfo[idx].in[0] {
  1687  					if r != noRegister && m>>r&1 != 0 {
  1688  						m = regMask(1) << r
  1689  						c := s.allocValToReg(v.Args[0], m, true, v.Pos)
  1690  						s.copies[c] = false
  1691  						// Note: no update to args[0] so the instruction will
  1692  						// use the original copy.
  1693  						goto ok
  1694  					}
  1695  				}
  1696  				if opcodeTable[v.Op].commutative {
  1697  					for _, r := range dinfo[idx].in[1] {
  1698  						if r != noRegister && m>>r&1 != 0 {
  1699  							m = regMask(1) << r
  1700  							c := s.allocValToReg(v.Args[1], m, true, v.Pos)
  1701  							s.copies[c] = false
  1702  							args[0], args[1] = args[1], args[0]
  1703  							goto ok
  1704  						}
  1705  					}
  1706  				}
  1707  
  1708  				// Avoid future fixed uses if we can.
  1709  				if m&^desired.avoid != 0 {
  1710  					m &^= desired.avoid
  1711  				}
  1712  				// Save input 0 to a new register so we can clobber it.
  1713  				c := s.allocValToReg(v.Args[0], m, true, v.Pos)
  1714  				s.copies[c] = false
  1715  
  1716  				// Normally we use the register of the old copy of input 0 as the target.
  1717  				// However, if input 0 is already in its desired register then we use
  1718  				// the register of the new copy instead.
  1719  				if regspec.outputs[0].regs>>s.f.getHome(c.ID).(*Register).num&1 != 0 {
  1720  					if rp, ok := s.f.getHome(args[0].ID).(*Register); ok {
  1721  						r := register(rp.num)
  1722  						for _, r2 := range dinfo[idx].in[0] {
  1723  							if r == r2 {
  1724  								args[0] = c
  1725  								break
  1726  							}
  1727  						}
  1728  					}
  1729  				}
  1730  			}
  1731  		ok:
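        			// For ops that clobber an argument register (clobbersArg0/clobbersArg1),
        			// make sure a still-live argument keeps a copy in another register
        			// before the clobber destroys its only register copy.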
  1732  			for i := 0; i < 2; i++ {
  1733  				if !(i == 0 && regspec.clobbersArg0 || i == 1 && regspec.clobbersArg1) {
  1734  					continue
  1735  				}
  1736  				if !s.liveAfterCurrentInstruction(v.Args[i]) {
  1737  					// arg is dead.  We can clobber its register.
  1738  					continue
  1739  				}
  1740  				if s.values[v.Args[i].ID].rematerializeable {
  1741  					// We can rematerialize the input, don't worry about clobbering it.
  1742  					continue
  1743  				}
  1744  				if countRegs(s.values[v.Args[i].ID].regs) >= 2 {
  1745  					// We have at least 2 copies of arg.  We can afford to clobber one.
  1746  					continue
  1747  				}
  1748  				// Possible new registers to copy into.
  1749  				m := s.compatRegs(v.Args[i].Type) &^ s.used
  1750  				if m == 0 {
  1751  					// No free registers.  In this case we'll just clobber the
  1752  					// input and future uses of that input must use a restore.
  1753  					// TODO(khr): We should really do this like allocReg does it,
  1754  					// spilling the value with the most distant next use.
  1755  					continue
  1756  				}
  1757  				// Copy input to a different register that won't be clobbered.
  1758  				c := s.allocValToReg(v.Args[i], m, true, v.Pos)
  1759  				s.copies[c] = false
  1760  			}
  1761  
  1762  			// Pick a temporary register if needed.
  1763  			// It should be distinct from all the input registers, so we
  1764  			// allocate it after all the input registers, but before
  1765  			// the input registers are freed via advanceUses below.
  1766  			// (Not all instructions need that distinct part, but it is conservative.)
  1767  			// We also ensure it is not any of the single-choice output registers.
  1768  			if opcodeTable[v.Op].needIntTemp {
  1769  				m := s.allocatable & s.f.Config.gpRegMask
  1770  				for _, out := range regspec.outputs {
  1771  					if countRegs(out.regs) == 1 {
  1772  						m &^= out.regs
  1773  					}
  1774  				}
  1775  				if m&^desired.avoid&^s.nospill != 0 {
  1776  					m &^= desired.avoid
  1777  				}
  1778  				tmpReg = s.allocReg(m, &tmpVal)
  1779  				s.nospill |= regMask(1) << tmpReg
  1780  				s.tmpused |= regMask(1) << tmpReg
  1781  			}
  1782  
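        			// The registers of clobbered arguments won't hold their values after
        			// this instruction, so release them now.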
  1783  			if regspec.clobbersArg0 {
  1784  				s.freeReg(register(s.f.getHome(args[0].ID).(*Register).num))
  1785  			}
  1786  			if regspec.clobbersArg1 {
  1787  				s.freeReg(register(s.f.getHome(args[1].ID).(*Register).num))
  1788  			}
  1789  
  1790  			// Now that all args are in regs, we're ready to issue the value itself.
  1791  			// Before we pick a register for the output value, allow input registers
  1792  			// to be deallocated. We do this here so that the output can use the
  1793  			// same register as a dying input.
  1794  			if !opcodeTable[v.Op].resultNotInArgs {
  1795  				s.tmpused = s.nospill
  1796  				s.nospill = 0
  1797  				s.advanceUses(v) // frees any registers holding args that are no longer live
  1798  			}
  1799  
  1800  			// Dump any registers which will be clobbered
  1801  			if s.doClobber && v.Op.IsCall() {
  1802  				// clobber registers that are marked as clobber in regmask, but
  1803  				// don't clobber inputs.
  1804  				s.clobberRegs(regspec.clobbers &^ s.tmpused &^ s.nospill)
  1805  			}
  1806  			s.freeRegs(regspec.clobbers)
  1807  			s.tmpused |= regspec.clobbers
  1808  
  1809  			// Pick registers for outputs.
  1810  			{
  1811  				outRegs := noRegisters // TODO if this is costly, hoist and clear incrementally below.
  1812  				maxOutIdx := -1
  1813  				var used regMask
  1814  				if tmpReg != noRegister {
  1815  					// Ensure output registers are distinct from the temporary register.
  1816  					// (Not all instructions need that distinct part, but it is conservative.)
  1817  					used |= regMask(1) << tmpReg
  1818  				}
  1819  				for _, out := range regspec.outputs {
  1820  					if out.regs == 0 {
  1821  						continue
  1822  					}
  1823  					mask := out.regs & s.allocatable &^ used
  1824  					if mask == 0 {
  1825  						s.f.Fatalf("can't find any output register %s", v.LongString())
  1826  					}
  1827  					if opcodeTable[v.Op].resultInArg0 && out.idx == 0 {
  1828  						if !opcodeTable[v.Op].commutative {
  1829  							// Output must use the same register as input 0.
  1830  							r := register(s.f.getHome(args[0].ID).(*Register).num)
  1831  							if mask>>r&1 == 0 {
  1832  								s.f.Fatalf("resultInArg0 value's input %v cannot be an output of %s", s.f.getHome(args[0].ID).(*Register), v.LongString())
  1833  							}
  1834  							mask = regMask(1) << r
  1835  						} else {
  1836  							// Output must use the same register as input 0 or 1.
  1837  							r0 := register(s.f.getHome(args[0].ID).(*Register).num)
  1838  							r1 := register(s.f.getHome(args[1].ID).(*Register).num)
  1839  							// Check r0 and r1 for desired output register.
  1840  							found := false
  1841  							for _, r := range dinfo[idx].out {
  1842  								if (r == r0 || r == r1) && (mask&^s.used)>>r&1 != 0 {
  1843  									mask = regMask(1) << r
  1844  									found = true
  1845  									if r == r1 {
  1846  										args[0], args[1] = args[1], args[0]
  1847  									}
  1848  									break
  1849  								}
  1850  							}
  1851  							if !found {
  1852  								// Neither are desired, pick r0.
  1853  								mask = regMask(1) << r0
  1854  							}
  1855  						}
  1856  					}
  1857  					if out.idx == 0 { // desired registers only apply to the first element of a tuple result
  1858  						for _, r := range dinfo[idx].out {
  1859  							if r != noRegister && (mask&^s.used)>>r&1 != 0 {
  1860  								// Desired register is allowed and unused.
  1861  								mask = regMask(1) << r
  1862  								break
  1863  							}
  1864  						}
  1865  					}
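        					// For the second element of a tuple result, honor the register
        					// desired by an OpSelect1 user, if one was recorded above.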
  1866  					if out.idx == 1 {
  1867  						if prefs, ok := desiredSecondReg[v.ID]; ok {
  1868  							for _, r := range prefs {
  1869  								if r != noRegister && (mask&^s.used)>>r&1 != 0 {
  1870  									// Desired register is allowed and unused.
  1871  									mask = regMask(1) << r
  1872  									break
  1873  								}
  1874  							}
  1875  						}
  1876  					}
  1877  					// Avoid registers we're saving for other values.
  1878  					if mask&^desired.avoid&^s.nospill&^s.used != 0 {
  1879  						mask &^= desired.avoid
  1880  					}
  1881  					r := s.allocReg(mask, v)
  1882  					if out.idx > maxOutIdx {
  1883  						maxOutIdx = out.idx
  1884  					}
  1885  					outRegs[out.idx] = r
  1886  					used |= regMask(1) << r
  1887  					s.tmpused |= regMask(1) << r
  1888  				}
  1889  				// Record register choices
  1890  				if v.Type.IsTuple() {
  1891  					var outLocs LocPair
  1892  					if r := outRegs[0]; r != noRegister {
  1893  						outLocs[0] = &s.registers[r]
  1894  					}
  1895  					if r := outRegs[1]; r != noRegister {
  1896  						outLocs[1] = &s.registers[r]
  1897  					}
  1898  					s.f.setHome(v, outLocs)
  1899  					// Note that subsequent SelectX instructions will do the assignReg calls.
  1900  				} else if v.Type.IsResults() {
  1901  					// preallocate outLocs to the right size, which is maxOutIdx+1
  1902  					outLocs := make(LocResults, maxOutIdx+1, maxOutIdx+1)
  1903  					for i := 0; i <= maxOutIdx; i++ {
  1904  						if r := outRegs[i]; r != noRegister {
  1905  							outLocs[i] = &s.registers[r]
  1906  						}
  1907  					}
  1908  					s.f.setHome(v, outLocs)
  1909  				} else {
  1910  					if r := outRegs[0]; r != noRegister {
  1911  						s.assignReg(r, v, v)
  1912  					}
  1913  				}
  1914  				if tmpReg != noRegister {
  1915  					// Remember the temp register allocation, if any.
  1916  					if s.f.tempRegs == nil {
  1917  						s.f.tempRegs = map[ID]*Register{}
  1918  					}
  1919  					s.f.tempRegs[v.ID] = &s.registers[tmpReg]
  1920  				}
  1921  			}
  1922  
  1923  			// deallocate dead args, if we have not done so
  1924  			if opcodeTable[v.Op].resultNotInArgs {
  1925  				s.nospill = 0
  1926  				s.advanceUses(v) // frees any registers holding args that are no longer live
  1927  			}
  1928  			s.tmpused = 0
  1929  
  1930  			// Issue the Value itself.
  1931  			for i, a := range args {
  1932  				v.SetArg(i, a) // use register version of arguments
  1933  			}
  1934  			b.Values = append(b.Values, v)
  1935  			s.dropIfUnused(v)
  1936  		}
  1937  
  1938  		// Copy the control values - we need this so we can reduce the
  1939  		// uses property of these values later.
  1940  		controls := append(make([]*Value, 0, 2), b.ControlValues()...)
  1941  
  1942  		// Load control values into registers.
  1943  		for i, v := range b.ControlValues() {
  1944  			if !s.values[v.ID].needReg {
  1945  				continue
  1946  			}
  1947  			if s.f.pass.debug > regDebug {
  1948  				fmt.Printf("  processing control %s\n", v.LongString())
  1949  			}
  1950  			// We assume that a control input can be passed in any
  1951  			// type-compatible register. If this turns out not to be true,
  1952  			// we'll need to introduce a regspec for a block's control value.
  1953  			b.ReplaceControl(i, s.allocValToReg(v, s.compatRegs(v.Type), false, b.Pos))
  1954  		}
  1955  
  1956  		// Reduce the uses of the control values once registers have been loaded.
  1957  		// This loop is equivalent to the advanceUses method.
  1958  		for _, v := range controls {
  1959  			vi := &s.values[v.ID]
  1960  			if !vi.needReg {
  1961  				continue
  1962  			}
  1963  			// Remove this use from the uses list.
  1964  			u := vi.uses
  1965  			vi.uses = u.next
  1966  			if u.next == nil {
  1967  				s.freeRegs(vi.regs) // value is dead
  1968  			}
  1969  			u.next = s.freeUseRecords
  1970  			s.freeUseRecords = u
  1971  		}
  1972  
  1973  		// If we are approaching a merge point and we are the primary
  1974  		// predecessor of it, find live values that we use soon after
  1975  		// the merge point and promote them to registers now.
  1976  		if len(b.Succs) == 1 {
  1977  			if s.f.Config.hasGReg && s.regs[s.GReg].v != nil {
  1978  				s.freeReg(s.GReg) // Spill value in G register before any merge.
  1979  			}
  1980  			if s.blockOrder[b.ID] > s.blockOrder[b.Succs[0].b.ID] {
  1981  				// No point if we've already regalloc'd the destination.
  1982  				goto badloop
  1983  			}
  1984  			// For this to be worthwhile, the loop must have no calls in it.
  1985  			top := b.Succs[0].b
  1986  			loop := s.loopnest.b2l[top.ID]
  1987  			if loop == nil || loop.header != top || loop.containsUnavoidableCall {
  1988  				goto badloop
  1989  			}
  1990  
  1991  			// TODO: sort by distance, pick the closest ones?
  1992  			for _, live := range s.live[b.ID] {
  1993  				if live.dist >= unlikelyDistance {
  1994  					// Don't preload anything live after the loop.
  1995  					continue
  1996  				}
  1997  				vid := live.ID
  1998  				vi := &s.values[vid]
  1999  				if vi.regs != 0 {
  2000  					continue
  2001  				}
  2002  				if vi.rematerializeable {
  2003  					continue
  2004  				}
  2005  				v := s.orig[vid]
  2006  				m := s.compatRegs(v.Type) &^ s.used
  2007  				// Use the desired register if available.
  2008  			outerloop:
  2009  				for _, e := range desired.entries {
  2010  					if e.ID != v.ID {
  2011  						continue
  2012  					}
  2013  					for _, r := range e.regs {
  2014  						if r != noRegister && m>>r&1 != 0 {
  2015  							m = regMask(1) << r
  2016  							break outerloop
  2017  						}
  2018  					}
  2019  				}
  2020  				if m&^desired.avoid != 0 {
  2021  					m &^= desired.avoid
  2022  				}
  2023  				if m != 0 {
  2024  					s.allocValToReg(v, m, false, b.Pos)
  2025  				}
  2026  			}
  2027  		}
  2028  	badloop:
  2029  		;
  2030  
  2031  		// Save end-of-block register state.
  2032  		// First count how many; this cuts allocations in half.
  2033  		k := 0
  2034  		for r := register(0); r < s.numRegs; r++ {
  2035  			v := s.regs[r].v
  2036  			if v == nil {
  2037  				continue
  2038  			}
  2039  			k++
  2040  		}
  2041  		regList := make([]endReg, 0, k)
  2042  		for r := register(0); r < s.numRegs; r++ {
  2043  			v := s.regs[r].v
  2044  			if v == nil {
  2045  				continue
  2046  			}
  2047  			regList = append(regList, endReg{r, v, s.regs[r].c})
  2048  		}
  2049  		s.endRegs[b.ID] = regList
  2050  
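        		// Sanity check: every value still held in a register must be live
        		// at the end of the block.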
  2051  		if checkEnabled {
  2052  			regValLiveSet.clear()
  2053  			if s.live != nil {
  2054  				for _, x := range s.live[b.ID] {
  2055  					regValLiveSet.add(x.ID)
  2056  				}
  2057  			}
  2058  			for r := register(0); r < s.numRegs; r++ {
  2059  				v := s.regs[r].v
  2060  				if v == nil {
  2061  					continue
  2062  				}
  2063  				if !regValLiveSet.contains(v.ID) {
  2064  					s.f.Fatalf("val %s is in reg but not live at end of %s", v, b)
  2065  				}
  2066  			}
  2067  		}
  2068  
  2069  		// If a value is live at the end of the block and
  2070  		// isn't in a register, generate a use for the spill location.
  2071  		// We need to remember this information so that
  2072  		// the liveness analysis in stackalloc is correct.
  2073  		if s.live != nil {
  2074  			for _, e := range s.live[b.ID] {
  2075  				vi := &s.values[e.ID]
  2076  				if vi.regs != 0 {
  2077  					// in a register, we'll use that source for the merge.
  2078  					continue
  2079  				}
  2080  				if vi.rematerializeable {
  2081  					// we'll rematerialize during the merge.
  2082  					continue
  2083  				}
  2084  				if s.f.pass.debug > regDebug {
  2085  					fmt.Printf("live-at-end spill for %s at %s\n", s.orig[e.ID], b)
  2086  				}
  2087  				spill := s.makeSpill(s.orig[e.ID], b)
  2088  				s.spillLive[b.ID] = append(s.spillLive[b.ID], spill.ID)
  2089  			}
  2090  
  2091  			// Clear any final uses.
  2092  			// All that is left should be the pseudo-uses added for values which
  2093  			// are live at the end of b.
  2094  			for _, e := range s.live[b.ID] {
  2095  				u := s.values[e.ID].uses
  2096  				if u == nil {
  2097  					f.Fatalf("live at end, no uses v%d", e.ID)
  2098  				}
  2099  				if u.next != nil {
  2100  					f.Fatalf("live at end, too many uses v%d", e.ID)
  2101  				}
  2102  				s.values[e.ID].uses = nil
  2103  				u.next = s.freeUseRecords
  2104  				s.freeUseRecords = u
  2105  			}
  2106  		}
  2107  
  2108  		// allocReg may have dropped registers from startRegsMask that
  2109  		// aren't actually needed in startRegs. Synchronize back to
  2110  		// startRegs.
  2111  		//
  2112  		// This must be done before placing spills, which will look at
  2113  		// startRegs to decide if a block is a valid block for a spill.
  2114  		if c := countRegs(s.startRegsMask); c != len(s.startRegs[b.ID]) {
  2115  			regs := make([]startReg, 0, c)
  2116  			for _, sr := range s.startRegs[b.ID] {
  2117  				if s.startRegsMask&(regMask(1)<<sr.r) == 0 {
  2118  					continue
  2119  				}
  2120  				regs = append(regs, sr)
  2121  			}
  2122  			s.startRegs[b.ID] = regs
  2123  		}
  2124  	}
  2125  
  2126  	// Decide where the spills we generated will go.
  2127  	s.placeSpills()
  2128  
  2129  	// Anything that didn't get a register gets a stack location here.
  2130  	// (StoreReg, stack-based phis, inputs, ...)
  2131  	stacklive := stackalloc(s.f, s.spillLive)
  2132  
  2133  	// Fix up all merge edges.
  2134  	s.shuffle(stacklive)
  2135  
  2136  	// Erase any copies we never used.
  2137  	// Also, an unused copy might be the only use of another copy,
  2138  	// so continue erasing until we reach a fixed point.
  2139  	for {
  2140  		progress := false
  2141  		for c, used := range s.copies {
  2142  			if !used && c.Uses == 0 {
  2143  				if s.f.pass.debug > regDebug {
  2144  					fmt.Printf("delete copied value %s\n", c.LongString())
  2145  				}
  2146  				c.resetArgs()
  2147  				f.freeValue(c)
  2148  				delete(s.copies, c)
  2149  				progress = true
  2150  			}
  2151  		}
  2152  		if !progress {
  2153  			break
  2154  		}
  2155  	}
  2156  
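        	// Remove the copies deleted above (now OpInvalid) from each block's value list.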
  2157  	for _, b := range s.visitOrder {
  2158  		i := 0
  2159  		for _, v := range b.Values {
  2160  			if v.Op == OpInvalid {
  2161  				continue
  2162  			}
  2163  			b.Values[i] = v
  2164  			i++
  2165  		}
  2166  		b.Values = b.Values[:i]
  2167  	}
  2168  }
  2169  
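        // placeSpills decides where each spill (StoreReg) generated during allocation
        // should go and inserts it into its block's value list. See the spilling
        // discussion at the top of this file.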
  2170  func (s *regAllocState) placeSpills() {
  2171  	mustBeFirst := func(op Op) bool {
  2172  		return op.isLoweredGetClosurePtr() || op == OpPhi || op == OpArgIntReg || op == OpArgFloatReg
  2173  	}
  2174  
  2175  	// Start maps block IDs to the list of spills
  2176  	// that go at the start of the block (but after any phis).
  2177  	start := map[ID][]*Value{}
  2178  	// After maps value IDs to the list of spills
  2179  	// that go immediately after that value ID.
  2180  	after := map[ID][]*Value{}
  2181  
  2182  	for i := range s.values {
  2183  		vi := s.values[i]
  2184  		spill := vi.spill
  2185  		if spill == nil {
  2186  			continue
  2187  		}
  2188  		if spill.Block != nil {
  2189  			// Some spills are already fully set up,
  2190  			// like OpArgs and stack-based phis.
  2191  			continue
  2192  		}
  2193  		v := s.orig[i]
  2194  
  2195  		// Walk down the dominator tree looking for a good place to
  2196  		// put the spill of v.  At the start "best" is the best place
  2197  		// we have found so far.
  2198  		// TODO: find a way to make this O(1) without arbitrary cutoffs.
  2199  		if v == nil {
  2200  			panic(fmt.Errorf("nil v, s.orig[%d], vi = %v, spill = %s", i, vi, spill.LongString()))
  2201  		}
  2202  		best := v.Block
  2203  		bestArg := v
  2204  		var bestDepth int16
  2205  		if s.loopnest != nil && s.loopnest.b2l[best.ID] != nil {
  2206  			bestDepth = s.loopnest.b2l[best.ID].depth
  2207  		}
  2208  		b := best
  2209  		const maxSpillSearch = 100
  2210  		for i := 0; i < maxSpillSearch; i++ {
  2211  			// Find the child of b in the dominator tree which
  2212  			// dominates all restores.
  2213  			p := b
  2214  			b = nil
  2215  			for c := s.sdom.Child(p); c != nil && i < maxSpillSearch; c, i = s.sdom.Sibling(c), i+1 {
  2216  				if s.sdom[c.ID].entry <= vi.restoreMin && s.sdom[c.ID].exit >= vi.restoreMax {
  2217  					// c also dominates all restores.  Walk down into c.
  2218  					b = c
  2219  					break
  2220  				}
  2221  			}
  2222  			if b == nil {
  2223  				// Ran out of blocks which dominate all restores.
  2224  				break
  2225  			}
  2226  
  2227  			var depth int16
  2228  			if s.loopnest != nil && s.loopnest.b2l[b.ID] != nil {
  2229  				depth = s.loopnest.b2l[b.ID].depth
  2230  			}
  2231  			if depth > bestDepth {
  2232  				// Don't push the spill into a deeper loop.
  2233  				continue
  2234  			}
  2235  
  2236  			// If v is in a register at the start of b, we can
  2237  			// place the spill here (after the phis).
  2238  			if len(b.Preds) == 1 {
  2239  				for _, e := range s.endRegs[b.Preds[0].b.ID] {
  2240  					if e.v == v {
  2241  						// Found a better spot for the spill.
  2242  						best = b
  2243  						bestArg = e.c
  2244  						bestDepth = depth
  2245  						break
  2246  					}
  2247  				}
  2248  			} else {
  2249  				for _, e := range s.startRegs[b.ID] {
  2250  					if e.v == v {
  2251  						// Found a better spot for the spill.
  2252  						best = b
  2253  						bestArg = e.c
  2254  						bestDepth = depth
  2255  						break
  2256  					}
  2257  				}
  2258  			}
  2259  		}
  2260  
  2261  		// Put the spill in the best block we found.
  2262  		spill.Block = best
  2263  		spill.AddArg(bestArg)
  2264  		if best == v.Block && !mustBeFirst(v.Op) {
  2265  			// Place immediately after v.
  2266  			after[v.ID] = append(after[v.ID], spill)
  2267  		} else {
  2268  			// Place at the start of best block.
  2269  			start[best.ID] = append(start[best.ID], spill)
  2270  		}
  2271  	}
  2272  
  2273  	// Insert spill instructions into the block schedules.
  2274  	var oldSched []*Value
  2275  	for _, b := range s.visitOrder {
  2276  		nfirst := 0
  2277  		for _, v := range b.Values {
  2278  			if !mustBeFirst(v.Op) {
  2279  				break
  2280  			}
  2281  			nfirst++
  2282  		}
  2283  		oldSched = append(oldSched[:0], b.Values[nfirst:]...)
  2284  		b.Values = b.Values[:nfirst]
  2285  		b.Values = append(b.Values, start[b.ID]...)
  2286  		for _, v := range oldSched {
  2287  			b.Values = append(b.Values, v)
  2288  			b.Values = append(b.Values, after[v.ID]...)
  2289  		}
  2290  	}
  2291  }
  2292  
  2293  // shuffle fixes up all the merge edges (those going into blocks of indegree > 1).
  2294  func (s *regAllocState) shuffle(stacklive [][]ID) {
  2295  	var e edgeState
  2296  	e.s = s
  2297  	e.cache = map[ID][]*Value{}
  2298  	e.contents = map[Location]contentRecord{}
  2299  	if s.f.pass.debug > regDebug {
  2300  		fmt.Printf("shuffle %s\n", s.f.Name)
  2301  		fmt.Println(s.f.String())
  2302  	}
  2303  
  2304  	for _, b := range s.visitOrder {
  2305  		if len(b.Preds) <= 1 {
  2306  			continue
  2307  		}
  2308  		e.b = b
  2309  		for i, edge := range b.Preds {
  2310  			p := edge.b
  2311  			e.p = p
  2312  			e.setup(i, s.endRegs[p.ID], s.startRegs[b.ID], stacklive[p.ID])
  2313  			e.process()
  2314  		}
  2315  	}
  2316  
  2317  	if s.f.pass.debug > regDebug {
  2318  		fmt.Printf("post shuffle %s\n", s.f.Name)
  2319  		fmt.Println(s.f.String())
  2320  	}
  2321  }
  2322  
  2323  type edgeState struct {
  2324  	s    *regAllocState
  2325  	p, b *Block // edge goes from p->b.
  2326  
  2327  	// for each pre-regalloc value, a list of equivalent cached values
  2328  	cache      map[ID][]*Value
  2329  	cachedVals []ID // (superset of) keys of the above map, for deterministic iteration
  2330  
  2331  	// map from location to the value it contains
  2332  	contents map[Location]contentRecord
  2333  
  2334  	// desired destination locations
  2335  	destinations []dstRecord
  2336  	extra        []dstRecord
  2337  
  2338  	usedRegs              regMask // registers currently holding something
  2339  	uniqueRegs            regMask // registers holding the only copy of a value
  2340  	finalRegs             regMask // registers holding final target
  2341  	rematerializeableRegs regMask // registers that hold rematerializeable values
  2342  }
  2343  
  2344  type contentRecord struct {
  2345  	vid   ID       // pre-regalloc value
  2346  	c     *Value   // cached value
  2347  	final bool     // this is a satisfied destination
  2348  	pos   src.XPos // source position of use of the value
  2349  }
  2350  
  2351  type dstRecord struct {
  2352  	loc    Location // register or stack slot
  2353  	vid    ID       // pre-regalloc value it should contain
  2354  	splice **Value  // place to store reference to the generating instruction
  2355  	pos    src.XPos // source position of use of this location
  2356  }
  2357  
  2358  // setup initializes the edge state for shuffling.
  2359  func (e *edgeState) setup(idx int, srcReg []endReg, dstReg []startReg, stacklive []ID) {
  2360  	if e.s.f.pass.debug > regDebug {
  2361  		fmt.Printf("edge %s->%s\n", e.p, e.b)
  2362  	}
  2363  
  2364  	// Clear state.
  2365  	clear(e.cache)
  2366  	e.cachedVals = e.cachedVals[:0]
  2367  	clear(e.contents)
  2368  	e.usedRegs = 0
  2369  	e.uniqueRegs = 0
  2370  	e.finalRegs = 0
  2371  	e.rematerializeableRegs = 0
  2372  
  2373  	// Live registers can be sources.
  2374  	for _, x := range srcReg {
  2375  		e.set(&e.s.registers[x.r], x.v.ID, x.c, false, src.NoXPos) // we don't care about the position of the source
  2376  	}
  2377  	// So can all of the spill locations.
  2378  	for _, spillID := range stacklive {
  2379  		v := e.s.orig[spillID]
  2380  		spill := e.s.values[v.ID].spill
  2381  		if !e.s.sdom.IsAncestorEq(spill.Block, e.p) {
  2382  			// Spills were placed that only dominate the uses found
  2383  			// during the first regalloc pass. The edge fixup code
  2384  			// can't use a spill location if the spill doesn't dominate
  2385  			// the edge.
  2386  			// We are guaranteed that if the spill doesn't dominate this edge,
  2387  			// then the value is available in a register (because we called
  2388  			// makeSpill for every value not in a register at the start
  2389  			// of an edge).
  2390  			continue
  2391  		}
  2392  		e.set(e.s.f.getHome(spillID), v.ID, spill, false, src.NoXPos) // we don't care about the position of the source
  2393  	}
  2394  
  2395  	// Figure out all the destinations we need.
  2396  	dsts := e.destinations[:0]
  2397  	for _, x := range dstReg {
  2398  		dsts = append(dsts, dstRecord{&e.s.registers[x.r], x.v.ID, nil, x.pos})
  2399  	}
  2400  	// Phis need their args to end up in a specific location.
  2401  	for _, v := range e.b.Values {
  2402  		if v.Op != OpPhi {
  2403  			break
  2404  		}
  2405  		loc := e.s.f.getHome(v.ID)
  2406  		if loc == nil {
  2407  			continue
  2408  		}
  2409  		dsts = append(dsts, dstRecord{loc, v.Args[idx].ID, &v.Args[idx], v.Pos})
  2410  	}
  2411  	e.destinations = dsts
  2412  
  2413  	if e.s.f.pass.debug > regDebug {
  2414  		for _, vid := range e.cachedVals {
  2415  			a := e.cache[vid]
  2416  			for _, c := range a {
  2417  				fmt.Printf("src %s: v%d cache=%s\n", e.s.f.getHome(c.ID), vid, c)
  2418  			}
  2419  		}
  2420  		for _, d := range e.destinations {
  2421  			fmt.Printf("dst %s: v%d\n", d.loc, d.vid)
  2422  		}
  2423  	}
  2424  }
  2425  
  2426  // process generates code to move all the values to the right destination locations.
  2427  func (e *edgeState) process() {
  2428  	dsts := e.destinations
  2429  
  2430  	// Process the destinations until they are all satisfied.
  2431  	for len(dsts) > 0 {
  2432  		i := 0
  2433  		for _, d := range dsts {
  2434  			if !e.processDest(d.loc, d.vid, d.splice, d.pos) {
  2435  				// Failed - save for next iteration.
  2436  				dsts[i] = d
  2437  				i++
  2438  			}
  2439  		}
  2440  		if i < len(dsts) {
  2441  			// Made some progress. Go around again.
  2442  			dsts = dsts[:i]
  2443  
  2444  			// Append any extra destinations we generated.
  2445  			dsts = append(dsts, e.extra...)
  2446  			e.extra = e.extra[:0]
  2447  			continue
  2448  		}
  2449  
  2450  		// We made no progress. That means that any
  2451  		// remaining unsatisfied moves are in simple cycles.
  2452  		// For example, A -> B -> C -> D -> A.
  2453  		//   A ----> B
  2454  		//   ^       |
  2455  		//   |       |
  2456  		//   |       v
  2457  		//   D <---- C
  2458  
  2459  		// To break the cycle, we pick an unused register, say R,
  2460  		// and put a copy of B there.
  2461  		//   A ----> B
  2462  		//   ^       |
  2463  		//   |       |
  2464  		//   |       v
  2465  		//   D <---- C <---- R=copyofB
  2466  		// When we resume the outer loop, the A->B move can now proceed,
  2467  		// and eventually the whole cycle completes.
  2468  
  2469  		// Copy any cycle location to a temp register. This duplicates
  2470  		// one of the cycle entries, allowing the just duplicated value
  2471  		// to be overwritten and the cycle to proceed.
  2472  		d := dsts[0]
  2473  		loc := d.loc
  2474  		vid := e.contents[loc].vid
  2475  		c := e.contents[loc].c
  2476  		r := e.findRegFor(c.Type)
  2477  		if e.s.f.pass.debug > regDebug {
  2478  			fmt.Printf("breaking cycle with v%d in %s:%s\n", vid, loc, c)
  2479  		}
  2480  		e.erase(r)
  2481  		pos := d.pos.WithNotStmt()
  2482  		if _, isReg := loc.(*Register); isReg {
  2483  			c = e.p.NewValue1(pos, OpCopy, c.Type, c)
  2484  		} else {
  2485  			c = e.p.NewValue1(pos, OpLoadReg, c.Type, c)
  2486  		}
  2487  		e.set(r, vid, c, false, pos)
  2488  		if c.Op == OpLoadReg && e.s.isGReg(register(r.(*Register).num)) {
  2489  			e.s.f.Fatalf("process.OpLoadReg targeting g: " + c.LongString())
  2490  		}
  2491  	}
  2492  }
  2493  
  2494  // processDest generates code to put value vid into location loc. Returns true
  2495  // if progress was made.
  2496  func (e *edgeState) processDest(loc Location, vid ID, splice **Value, pos src.XPos) bool {
  2497  	pos = pos.WithNotStmt()
  2498  	occupant := e.contents[loc]
  2499  	if occupant.vid == vid {
  2500  		// Value is already in the correct place.
  2501  		e.contents[loc] = contentRecord{vid, occupant.c, true, pos}
  2502  		if splice != nil {
  2503  			(*splice).Uses--
  2504  			*splice = occupant.c
  2505  			occupant.c.Uses++
  2506  		}
  2507  		// Note: if splice==nil then c will appear dead. This is
  2508  		// non-SSA formed code, so be careful after this pass not to run
  2509  		// deadcode elimination.
  2510  		if _, ok := e.s.copies[occupant.c]; ok {
  2511  			// The copy at occupant.c was used to avoid spill.
  2512  			e.s.copies[occupant.c] = true
  2513  		}
  2514  		return true
  2515  	}
  2516  
  2517  	// Check if we're allowed to clobber the destination location.
  2518  	if len(e.cache[occupant.vid]) == 1 && !e.s.values[occupant.vid].rematerializeable && !opcodeTable[e.s.orig[occupant.vid].Op].fixedReg {
  2519  		// We can't overwrite the last copy
  2520  		// of a value that needs to survive.
  2521  		return false
  2522  	}
  2523  
  2524  	// Copy from a source of v, register preferred.
  2525  	v := e.s.orig[vid]
  2526  	var c *Value
  2527  	var src Location
  2528  	if e.s.f.pass.debug > regDebug {
  2529  		fmt.Printf("moving v%d to %s\n", vid, loc)
  2530  		fmt.Printf("sources of v%d:", vid)
  2531  	}
  2532  	if opcodeTable[v.Op].fixedReg {
  2533  		c = v
  2534  		src = e.s.f.getHome(v.ID)
  2535  	} else {
  2536  		for _, w := range e.cache[vid] {
  2537  			h := e.s.f.getHome(w.ID)
  2538  			if e.s.f.pass.debug > regDebug {
  2539  				fmt.Printf(" %s:%s", h, w)
  2540  			}
  2541  			_, isreg := h.(*Register)
  2542  			if src == nil || isreg {
  2543  				c = w
  2544  				src = h
  2545  			}
  2546  		}
  2547  	}
  2548  	if e.s.f.pass.debug > regDebug {
  2549  		if src != nil {
  2550  			fmt.Printf(" [use %s]\n", src)
  2551  		} else {
  2552  			fmt.Printf(" [no source]\n")
  2553  		}
  2554  	}
  2555  	_, dstReg := loc.(*Register)
  2556  
  2557  	// Pre-clobber destination. This avoids the
  2558  	// following situation:
  2559  	//   - v is currently held in R0 and stacktmp0.
  2560  	//   - We want to copy stacktmp1 to stacktmp0.
  2561  	//   - We choose R0 as the temporary register.
  2562  	// During the copy, both R0 and stacktmp0 are
  2563  	// clobbered, losing both copies of v. Oops!
  2564  	// Erasing the destination early means R0 will not
  2565  	// be chosen as the temp register, as it will then
  2566  	// be the last copy of v.
  2567  	e.erase(loc)
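        	// x will be the operation that leaves a copy of v in loc: a register copy,
        	// a load, a store, or a rematerialization.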
  2568  	var x *Value
  2569  	if c == nil || e.s.values[vid].rematerializeable {
  2570  		if !e.s.values[vid].rematerializeable {
  2571  			e.s.f.Fatalf("can't find source for %s->%s: %s\n", e.p, e.b, v.LongString())
  2572  		}
  2573  		if dstReg {
  2574  			// We want to rematerialize v into a register that is incompatible with v's op's register mask.
  2575  			// Instead of setting the wrong register for the rematerialized v, we should find the right register
  2576  			// for it and emit an additional copy to move to the desired register.
  2577  			// For #70451.
  2578  			if e.s.regspec(v).outputs[0].regs&regMask(1<<register(loc.(*Register).num)) == 0 {
  2579  				_, srcReg := src.(*Register)
  2580  				if srcReg {
  2581  					// It exists in a valid register already, so just copy it to the desired register
  2582  					// If src is a Register, c must have already been set.
  2583  					x = e.p.NewValue1(pos, OpCopy, c.Type, c)
  2584  				} else {
  2585  					// We need a tmp register
  2586  					x = v.copyInto(e.p)
  2587  					r := e.findRegFor(x.Type)
  2588  					e.erase(r)
  2589  					// Rematerialize to the tmp register
  2590  					e.set(r, vid, x, false, pos)
  2591  					// Copy from tmp to the desired register
  2592  					x = e.p.NewValue1(pos, OpCopy, x.Type, x)
  2593  				}
  2594  			} else {
  2595  				x = v.copyInto(e.p)
  2596  			}
  2597  		} else {
  2598  			// Rematerialize into stack slot. Need a free
  2599  			// register to accomplish this.
  2600  			r := e.findRegFor(v.Type)
  2601  			e.erase(r)
  2602  			x = v.copyIntoWithXPos(e.p, pos)
  2603  			e.set(r, vid, x, false, pos)
  2604  			// Make sure we spill with the size of the slot, not the
  2605  			// size of x (which might be wider due to our dropping
  2606  			// of narrowing conversions).
  2607  			x = e.p.NewValue1(pos, OpStoreReg, loc.(LocalSlot).Type, x)
  2608  		}
  2609  	} else {
  2610  		// Emit move from src to dst.
  2611  		_, srcReg := src.(*Register)
  2612  		if srcReg {
  2613  			if dstReg {
  2614  				x = e.p.NewValue1(pos, OpCopy, c.Type, c)
  2615  			} else {
  2616  				x = e.p.NewValue1(pos, OpStoreReg, loc.(LocalSlot).Type, c)
  2617  			}
  2618  		} else {
  2619  			if dstReg {
  2620  				x = e.p.NewValue1(pos, OpLoadReg, c.Type, c)
  2621  			} else {
  2622  				// mem->mem. Use temp register.
  2623  				r := e.findRegFor(c.Type)
  2624  				e.erase(r)
  2625  				t := e.p.NewValue1(pos, OpLoadReg, c.Type, c)
  2626  				e.set(r, vid, t, false, pos)
  2627  				x = e.p.NewValue1(pos, OpStoreReg, loc.(LocalSlot).Type, t)
  2628  			}
  2629  		}
  2630  	}
  2631  	e.set(loc, vid, x, true, pos)
  2632  	if x.Op == OpLoadReg && e.s.isGReg(register(loc.(*Register).num)) {
  2633  		e.s.f.Fatalf("processDest.OpLoadReg targeting g: " + x.LongString())
  2634  	}
  2635  	if splice != nil {
  2636  		(*splice).Uses--
  2637  		*splice = x
  2638  		x.Uses++
  2639  	}
  2640  	return true
  2641  }
  2642  
  2643  // set changes the contents of location loc to hold the given value and its cached representative.
  2644  func (e *edgeState) set(loc Location, vid ID, c *Value, final bool, pos src.XPos) {
  2645  	e.s.f.setHome(c, loc)
  2646  	e.contents[loc] = contentRecord{vid, c, final, pos}
  2647  	a := e.cache[vid]
  2648  	if len(a) == 0 {
  2649  		e.cachedVals = append(e.cachedVals, vid)
  2650  	}
  2651  	a = append(a, c)
  2652  	e.cache[vid] = a
  2653  	if r, ok := loc.(*Register); ok {
  2654  		if e.usedRegs&(regMask(1)<<uint(r.num)) != 0 {
  2655  			e.s.f.Fatalf("%v is already set (v%d/%v)", r, vid, c)
  2656  		}
  2657  		e.usedRegs |= regMask(1) << uint(r.num)
  2658  		if final {
  2659  			e.finalRegs |= regMask(1) << uint(r.num)
  2660  		}
  2661  		if len(a) == 1 {
  2662  			e.uniqueRegs |= regMask(1) << uint(r.num)
  2663  		}
  2664  		if len(a) == 2 {
  2665  			if t, ok := e.s.f.getHome(a[0].ID).(*Register); ok {
  2666  				e.uniqueRegs &^= regMask(1) << uint(t.num)
  2667  			}
  2668  		}
  2669  		if e.s.values[vid].rematerializeable {
  2670  			e.rematerializeableRegs |= regMask(1) << uint(r.num)
  2671  		}
  2672  	}
  2673  	if e.s.f.pass.debug > regDebug {
  2674  		fmt.Printf("%s\n", c.LongString())
  2675  		fmt.Printf("v%d now available in %s:%s\n", vid, loc, c)
  2676  	}
  2677  }
  2678  
  2679  // erase removes any user of loc.
  2680  // erase removes whatever value currently occupies loc, if any.
  2681  	cr := e.contents[loc]
  2682  	if cr.c == nil {
  2683  		return
  2684  	}
  2685  	vid := cr.vid
  2686  
  2687  	if cr.final {
  2688  		// Add a destination to move this value back into place.
  2689  		// Make sure it gets added to the tail of the destination queue
  2690  		// so we make progress on other moves first.
  2691  		e.extra = append(e.extra, dstRecord{loc, cr.vid, nil, cr.pos})
  2692  	}
  2693  
  2694  	// Remove c from the list of cached values.
  2695  	a := e.cache[vid]
  2696  	for i, c := range a {
  2697  		if e.s.f.getHome(c.ID) == loc {
  2698  			if e.s.f.pass.debug > regDebug {
  2699  				fmt.Printf("v%d no longer available in %s:%s\n", vid, loc, c)
  2700  			}
  2701  			a[i], a = a[len(a)-1], a[:len(a)-1]
  2702  			break
  2703  		}
  2704  	}
  2705  	e.cache[vid] = a
  2706  
  2707  	// Update register masks.
  2708  	if r, ok := loc.(*Register); ok {
  2709  		e.usedRegs &^= regMask(1) << uint(r.num)
  2710  		if cr.final {
  2711  			e.finalRegs &^= regMask(1) << uint(r.num)
  2712  		}
  2713  		e.rematerializeableRegs &^= regMask(1) << uint(r.num)
  2714  	}
  2715  	if len(a) == 1 {
  2716  		if r, ok := e.s.f.getHome(a[0].ID).(*Register); ok {
  2717  			e.uniqueRegs |= regMask(1) << uint(r.num)
  2718  		}
  2719  	}
  2720  }
  2721  
  2722  // findRegFor finds a register we can use to make a temp copy of type typ.
  2723  func (e *edgeState) findRegFor(typ *types.Type) Location {
  2724  	// Which registers are possibilities.
  2725  	m := e.s.compatRegs(typ)
  2726  
  2727  	// Pick a register. In priority order:
  2728  	// 1) an unused register
  2729  	// 2) a non-unique register not holding a final value
  2730  	// 3) a non-unique register
  2731  	// 4) a register holding a rematerializeable value
  2732  	x := m &^ e.usedRegs
  2733  	if x != 0 {
  2734  		return &e.s.registers[pickReg(x)]
  2735  	}
  2736  	x = m &^ e.uniqueRegs &^ e.finalRegs
  2737  	if x != 0 {
  2738  		return &e.s.registers[pickReg(x)]
  2739  	}
  2740  	x = m &^ e.uniqueRegs
  2741  	if x != 0 {
  2742  		return &e.s.registers[pickReg(x)]
  2743  	}
  2744  	x = m & e.rematerializeableRegs
  2745  	if x != 0 {
  2746  		return &e.s.registers[pickReg(x)]
  2747  	}
  2748  
  2749  	// No register is available.
  2750  	// Pick a register to spill.
  2751  	for _, vid := range e.cachedVals {
  2752  		a := e.cache[vid]
  2753  		for _, c := range a {
  2754  			if r, ok := e.s.f.getHome(c.ID).(*Register); ok && m>>uint(r.num)&1 != 0 {
  2755  				if !c.rematerializeable() {
  2756  					x := e.p.NewValue1(c.Pos, OpStoreReg, c.Type, c)
  2757  					// Allocate a temp location to spill a register to.
  2758  					t := LocalSlot{N: e.s.f.NewLocal(c.Pos, c.Type), Type: c.Type}
  2759  					// TODO: reuse these slots. They'll need to be erased first.
  2760  					e.set(t, vid, x, false, c.Pos)
  2761  					if e.s.f.pass.debug > regDebug {
  2762  						fmt.Printf("  SPILL %s->%s %s\n", r, t, x.LongString())
  2763  					}
  2764  				}
  2765  				// r will now be overwritten by the caller. At some point
  2766  				// later, the newly saved value will be moved back to its
  2767  				// final destination in processDest.
  2768  				return r
  2769  			}
  2770  		}
  2771  	}
  2772  
  2773  	fmt.Printf("m:%d unique:%d final:%d rematerializeable:%d\n", m, e.uniqueRegs, e.finalRegs, e.rematerializeableRegs)
  2774  	for _, vid := range e.cachedVals {
  2775  		a := e.cache[vid]
  2776  		for _, c := range a {
  2777  			fmt.Printf("v%d: %s %s\n", vid, c, e.s.f.getHome(c.ID))
  2778  		}
  2779  	}
  2780  	e.s.f.Fatalf("can't find empty register on edge %s->%s", e.p, e.b)
  2781  	return nil
  2782  }
  2783  
  2784  // rematerializeable reports whether the register allocator should recompute
  2785  // a value instead of spilling/restoring it.
  2786  func (v *Value) rematerializeable() bool {
  2787  	if !opcodeTable[v.Op].rematerializeable {
  2788  		return false
  2789  	}
  2790  	for _, a := range v.Args {
  2791  		// Fixed-register allocations (SP, SB, etc.) are always available.
  2792  		// Any other argument of an opcode makes it not rematerializeable.
  2793  		if !opcodeTable[a.Op].fixedReg {
  2794  			return false
  2795  		}
  2796  	}
  2797  	return true
  2798  }
  2799  
  2800  type liveInfo struct {
  2801  	ID   ID       // ID of value
  2802  	dist int32    // # of instructions before next use
  2803  	pos  src.XPos // source position of next use
  2804  }
  2805  
  2806  // computeLive computes a map from block ID to a list of value IDs live at the end
  2807  // of that block. Each entry also records the number of instructions until that
  2808  // value's next use. The resulting map is stored in s.live.
  2809  func (s *regAllocState) computeLive() {
  2810  	f := s.f
  2811  	// Single-block functions do not have variables that are live across
  2812  	// branches.
  2813  	if len(f.Blocks) == 1 {
  2814  		return
  2815  	}
  2816  	po := f.postorder()
  2817  	s.live = make([][]liveInfo, f.NumBlocks())
  2818  	s.desired = make([]desiredState, f.NumBlocks())
  2819  	s.loopnest = f.loopnest()
  2820  
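        	// rematIDs is scratch space for collecting rematerializeable values
        	// that should be dropped from the live set across calls.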
  2821  	rematIDs := make([]ID, 0, 64)
  2822  
  2823  	live := f.newSparseMapPos(f.NumValues())
  2824  	defer f.retSparseMapPos(live)
  2825  	t := f.newSparseMapPos(f.NumValues())
  2826  	defer f.retSparseMapPos(t)
  2827  
  2828  	s.loopnest.computeUnavoidableCalls()
  2829  
  2830  	// Liveness analysis.
  2831  	// This is an adapted version of the algorithm described in chapter 2.4.2
  2832  	// of Fabrice Rastello's On Sparse Intermediate Representations.
  2833  	//   https://web.archive.org/web/20240417212122if_/https://inria.hal.science/hal-00761555/file/habilitation.pdf#section.50
  2834  	//
  2835  	// For our implementation, we fall back to a traditional iterative algorithm when we encounter
  2836  	// irreducible CFGs. They are very uncommon in Go code because they must be constructed with
  2837  	// gotos, and our current loopnest definition does not compute all the information we'd need
  2838  	// to determine the loop ancestors for that step of the algorithm.
  2839  	//
  2840  	// Additionally, instead of only considering non-loop successors in the initial DFS phase,
  2841  	// we compute the liveout as the union of all successors. This larger liveout set is still a
  2842  	// subset of the final liveout for the block, and adding this information in the DFS phase
  2843  	// means that we get slightly more accurate distance information.
  2844  	var loopLiveIn map[*loop][]liveInfo
  2845  	var numCalls []int32
  2846  	if len(s.loopnest.loops) > 0 && !s.loopnest.hasIrreducible {
  2847  		loopLiveIn = make(map[*loop][]liveInfo)
  2848  		numCalls = f.Cache.allocInt32Slice(f.NumBlocks())
  2849  		defer f.Cache.freeInt32Slice(numCalls)
  2850  	}
  2851  
  2852  	for {
  2853  		changed := false
  2854  
  2855  		for _, b := range po {
  2856  			// Start with known live values at the end of the block.
  2857  			live.clear()
  2858  			for _, e := range s.live[b.ID] {
  2859  				live.set(e.ID, e.dist, e.pos)
  2860  			}
  2861  			update := false
  2862  			// Arguments to phi nodes are live at the end of this block.
  2863  			for _, e := range b.Succs {
  2864  				succ := e.b
  2865  				delta := branchDistance(b, succ)
  2866  				for _, v := range succ.Values {
  2867  					if v.Op != OpPhi {
  2868  						break
  2869  					}
  2870  					arg := v.Args[e.i]
  2871  					if s.values[arg.ID].needReg && (!live.contains(arg.ID) || delta < live.get(arg.ID)) {
  2872  						live.set(arg.ID, delta, v.Pos)
  2873  						update = true
  2874  					}
  2875  				}
  2876  			}
  2877  			if update {
  2878  				s.live[b.ID] = updateLive(live, s.live[b.ID])
  2879  			}
  2880  			// Add len(b.Values) to adjust from end-of-block distance
  2881  			// to beginning-of-block distance.
  2882  			c := live.contents()
  2883  			for i := range c {
  2884  				c[i].val += int32(len(b.Values))
  2885  			}
  2886  
  2887  			// Mark control values as live
  2888  			for _, c := range b.ControlValues() {
  2889  				if s.values[c.ID].needReg {
  2890  					live.set(c.ID, int32(len(b.Values)), b.Pos)
  2891  				}
  2892  			}
  2893  
  2894  			for i := len(b.Values) - 1; i >= 0; i-- {
  2895  				v := b.Values[i]
  2896  				live.remove(v.ID)
  2897  				if v.Op == OpPhi {
  2898  					continue
  2899  				}
  2900  				if opcodeTable[v.Op].call {
  2901  					if numCalls != nil {
  2902  						numCalls[b.ID]++
  2903  					}
  2904  					rematIDs = rematIDs[:0]
  2905  					c := live.contents()
  2906  					for i := range c {
  2907  						c[i].val += unlikelyDistance
  2908  						vid := c[i].key
  2909  						if s.values[vid].rematerializeable {
  2910  							rematIDs = append(rematIDs, vid)
  2911  						}
  2912  					}
  2913  					// We don't spill rematerializeable values, and assuming they
  2914  					// are live across a call would only force shuffle to add some
  2915  					// (dead) constant rematerialization. Remove them.
  2916  					for _, r := range rematIDs {
  2917  						live.remove(r)
  2918  					}
  2919  				}
  2920  				for _, a := range v.Args {
  2921  					if s.values[a.ID].needReg {
  2922  						live.set(a.ID, int32(i), v.Pos)
  2923  					}
  2924  				}
  2925  			}
  2926  			// If this is a loop header, save its live-in set so that
  2927  			// we can use it to fill in the loop bodies later.
  2928  			if loopLiveIn != nil {
  2929  				loop := s.loopnest.b2l[b.ID]
  2930  				if loop != nil && loop.header.ID == b.ID {
  2931  					loopLiveIn[loop] = updateLive(live, nil)
  2932  				}
  2933  			}
  2934  			// For each predecessor of b, expand its list of live-at-end values.
  2935  			// invariant: live contains the values live at the start of b
  2936  			for _, e := range b.Preds {
  2937  				p := e.b
  2938  				delta := branchDistance(p, b)
  2939  
  2940  				// Start t off with the previously known live values at the end of p.
  2941  				t.clear()
  2942  				for _, e := range s.live[p.ID] {
  2943  					t.set(e.ID, e.dist, e.pos)
  2944  				}
  2945  				update := false
  2946  
  2947  				// Add new live values from scanning this block.
  2948  				for _, e := range live.contents() {
  2949  					d := e.val + delta
  2950  					if !t.contains(e.key) || d < t.get(e.key) {
  2951  						update = true
  2952  						t.set(e.key, d, e.pos)
  2953  					}
  2954  				}
  2955  
  2956  				if !update {
  2957  					continue
  2958  				}
  2959  				s.live[p.ID] = updateLive(t, s.live[p.ID])
  2960  				changed = true
  2961  			}
  2962  		}
  2963  
  2964  		// If we're doing the traditional iterative algorithm and have
  2965  		// run out of changes, we're done.
  2966  		if !changed {
  2967  			break
  2968  		}
  2969  
  2970  		// If we're doing a pre-pass, the remaining liveness information
  2971  		// will be filled in later.
  2972  		if loopLiveIn != nil {
  2973  			break
  2974  		}
  2975  		// For loopless code, we have full liveness info after a single
  2976  		// iteration
  2977  		if len(s.loopnest.loops) == 0 {
  2978  			break
  2979  		}
  2980  	}
  2981  	if f.pass.debug > regDebug {
  2982  		s.debugPrintLive("after dfs walk", f, s.live, s.desired)
  2983  	}
  2984  
  2985  	// Irreducible CFGs and functions without loops are already
  2986  	// done; compute their desired registers and return.
  2987  	if loopLiveIn == nil {
  2988  		s.computeDesired()
  2989  		return
  2990  	}
  2991  
  2992  	// Walk the loopnest from outer to inner, adding
  2993  	// all live-in values from their parent. Instead of
  2994  	// a recursive algorithm, iterate in depth order.
  2995  	// TODO(dmo): can we permute the loopnest? can we avoid this copy?
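        	//
        	// For example (illustrative only): if a value is live into the header of an
        	// outer loop L1, it is live throughout L1, and therefore also throughout any
        	// loop L2 nested inside L1. Sorting by depth ensures a parent's live-in set
        	// is complete before its children merge it in.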
  2996  	loops := slices.Clone(s.loopnest.loops)
  2997  	slices.SortFunc(loops, func(a, b *loop) int {
  2998  		return cmp.Compare(a.depth, b.depth)
  2999  	})
  3000  
  3001  	loopset := f.newSparseMapPos(f.NumValues())
  3002  	defer f.retSparseMapPos(loopset)
  3003  	for _, loop := range loops {
  3004  		if loop.outer == nil {
  3005  			continue
  3006  		}
  3007  		livein := loopLiveIn[loop]
  3008  		loopset.clear()
  3009  		for _, l := range livein {
  3010  			loopset.set(l.ID, l.dist, l.pos)
  3011  		}
  3012  		update := false
  3013  		for _, l := range loopLiveIn[loop.outer] {
  3014  			if !loopset.contains(l.ID) {
  3015  				loopset.set(l.ID, l.dist, l.pos)
  3016  				update = true
  3017  			}
  3018  		}
  3019  		if update {
  3020  			loopLiveIn[loop] = updateLive(loopset, livein)
  3021  		}
  3022  	}
  3023  	// unknownDistance is a sentinel value for when we know a variable
  3024  	// is live at a given block, but we do not yet know how far away its
  3025  	// next use is. The distance will be computed later.
  3026  	const unknownDistance = -1
  3027  
  3028  	// Add the live-in values of each loop header to all blocks in its loop.
  3029  	// This includes the loop header itself, since it can have values
  3030  	// that die in the middle of the block and aren't live-out.
  3031  	for _, b := range po {
  3032  		loop := s.loopnest.b2l[b.ID]
  3033  		if loop == nil {
  3034  			continue
  3035  		}
  3036  		headerLive := loopLiveIn[loop]
  3037  		loopset.clear()
  3038  		for _, l := range s.live[b.ID] {
  3039  			loopset.set(l.ID, l.dist, l.pos)
  3040  		}
  3041  		update := false
  3042  		for _, l := range headerLive {
  3043  			if !loopset.contains(l.ID) {
  3044  				loopset.set(l.ID, unknownDistance, src.NoXPos)
  3045  				update = true
  3046  			}
  3047  		}
  3048  		if update {
  3049  			s.live[b.ID] = updateLive(loopset, s.live[b.ID])
  3050  		}
  3051  	}
  3052  	if f.pass.debug > regDebug {
  3053  		s.debugPrintLive("after live loop prop", f, s.live, s.desired)
  3054  	}
  3055  	// Filling in liveness from loops leaves some blocks with no distance information.
  3056  	// Run over them and fill in the information from their successors.
  3057  	// To stabilize faster, we quit when no block has missing values, and in
  3058  	// subsequent iterations we only look at blocks that still have missing values.
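        	//
        	// For a value recorded in a successor succ, the distance at the end of b is
        	// computed below as
        	//   len(succ.Values) + dist-at-end-of-succ + branchDistance(b, succ) + numCalls[succ]*unlikelyDistance
        	// For example (numbers illustrative only): with 10 values in succ, a next use
        	// 3 instructions past the end of succ, a normal branch, and one call in succ,
        	// the distance at the end of b would be 10 + 3 + normalDistance + unlikelyDistance.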
  3059  	unfinishedBlocks := f.Cache.allocBlockSlice(len(po))
  3060  	defer f.Cache.freeBlockSlice(unfinishedBlocks)
  3061  	copy(unfinishedBlocks, po)
  3062  
  3063  	for len(unfinishedBlocks) > 0 {
  3064  		n := 0
  3065  		for _, b := range unfinishedBlocks {
  3066  			live.clear()
  3067  			unfinishedValues := 0
  3068  			for _, l := range s.live[b.ID] {
  3069  				if l.dist == unknownDistance {
  3070  					unfinishedValues++
  3071  				}
  3072  				live.set(l.ID, l.dist, l.pos)
  3073  			}
  3074  			update := false
  3075  			for _, e := range b.Succs {
  3076  				succ := e.b
  3077  				for _, l := range s.live[succ.ID] {
  3078  					if !live.contains(l.ID) || l.dist == unknownDistance {
  3079  						continue
  3080  					}
  3081  					dist := int32(len(succ.Values)) + l.dist + branchDistance(b, succ)
  3082  					dist += numCalls[succ.ID] * unlikelyDistance
  3083  					val := live.get(l.ID)
  3084  					switch {
  3085  					case val == unknownDistance:
  3086  						unfinishedValues--
  3087  						fallthrough
  3088  					case dist < val:
  3089  						update = true
  3090  						live.set(l.ID, dist, l.pos)
  3091  					}
  3092  				}
  3093  			}
  3094  			if update {
  3095  				s.live[b.ID] = updateLive(live, s.live[b.ID])
  3096  			}
  3097  			if unfinishedValues > 0 {
  3098  				unfinishedBlocks[n] = b
  3099  				n++
  3100  			}
  3101  		}
  3102  		unfinishedBlocks = unfinishedBlocks[:n]
  3103  	}
  3104  
  3105  	s.computeDesired()
  3106  
  3107  	if f.pass.debug > regDebug {
  3108  		s.debugPrintLive("final", f, s.live, s.desired)
  3109  	}
  3110  }
  3111  
  3112  // computeDesired computes the desired register information at the end of each block.
  3113  // It is essentially a liveness analysis on machine registers instead of SSA values.
  3114  // The desired register information is stored in s.desired.
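        //
        // For example (illustrative only): if a value x computed by a two-operand
        // instruction such as "x = ADDQ y z" is desired in AX, then AX is also
        // recorded as desired for y (and for z as well, since ADDQ is commutative),
        // so the inputs tend to be computed in the register the result wants.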
  3115  func (s *regAllocState) computeDesired() {
  3116  
  3117  	// TODO: Can we speed this up using the liveness information we have already
  3118  	// from computeLive?
  3119  	// TODO: Since we don't propagate information through phi nodes, can we do
  3120  	// this as a single dominator tree walk instead of the iterative solution?
  3121  	var desired desiredState
  3122  	f := s.f
  3123  	po := f.postorder()
  3124  	for {
  3125  		changed := false
  3126  		for _, b := range po {
  3127  			desired.copy(&s.desired[b.ID])
  3128  			for i := len(b.Values) - 1; i >= 0; i-- {
  3129  				v := b.Values[i]
  3130  				prefs := desired.remove(v.ID)
  3131  				if v.Op == OpPhi {
  3132  					// TODO: if v is a phi, save desired register for phi inputs.
  3133  					// For now, we just drop it and don't propagate
  3134  					// desired registers back through phi nodes.
  3135  					continue
  3136  				}
  3137  				regspec := s.regspec(v)
  3138  				// Cancel desired registers if they get clobbered.
  3139  				desired.clobber(regspec.clobbers)
  3140  				// Update desired registers if there are any fixed register inputs.
  3141  				for _, j := range regspec.inputs {
  3142  					if countRegs(j.regs) != 1 {
  3143  						continue
  3144  					}
  3145  					desired.clobber(j.regs)
  3146  					desired.add(v.Args[j.idx].ID, pickReg(j.regs))
  3147  				}
  3148  				// Set desired register of input 0 if this is a 2-operand instruction.
  3149  				if opcodeTable[v.Op].resultInArg0 || v.Op == OpAMD64ADDQconst || v.Op == OpAMD64ADDLconst || v.Op == OpSelect0 {
  3150  					// ADDQconst is added here because we want to treat it as resultInArg0 for
  3151  					// the purposes of desired registers, even though it is not an absolute requirement.
  3152  					// This is because we'd rather implement it as ADDQ instead of LEAQ.
  3153  					// Same for ADDLconst
  3154  					// Select0 is added here to propagate the desired register to the tuple-generating instruction.
  3155  					if opcodeTable[v.Op].commutative {
  3156  						desired.addList(v.Args[1].ID, prefs)
  3157  					}
  3158  					desired.addList(v.Args[0].ID, prefs)
  3159  				}
  3160  			}
  3161  			for _, e := range b.Preds {
  3162  				p := e.b
  3163  				changed = s.desired[p.ID].merge(&desired) || changed
  3164  			}
  3165  		}
  3166  		if !changed || (!s.loopnest.hasIrreducible && len(s.loopnest.loops) == 0) {
  3167  			break
  3168  		}
  3169  	}
  3170  }
  3171  
  3172  // updateLive replaces the contents of live with the contents of t and returns the (possibly reallocated) slice.
  3173  func updateLive(t *sparseMapPos, live []liveInfo) []liveInfo {
  3174  	live = live[:0]
  3175  	if cap(live) < t.size() {
  3176  		live = make([]liveInfo, 0, t.size())
  3177  	}
  3178  	for _, e := range t.contents() {
  3179  		live = append(live, liveInfo{e.key, e.val, e.pos})
  3180  	}
  3181  	return live
  3182  }
  3183  
  3184  // branchDistance calculates the distance between a block and a
  3185  // successor in pseudo-instructions. It is used to encode how
  3186  // likely the branch is to be taken.
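        // For a two-way conditional, the likely successor is treated as close
        // (likelyDistance) and the unlikely one as far away (unlikelyDistance);
        // all other edges get normalDistance. These pseudo-distances bias the
        // spill heuristic toward spilling values whose next use is on an
        // unlikely path.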
  3187  func branchDistance(b *Block, s *Block) int32 {
  3188  	if len(b.Succs) == 2 {
  3189  		if b.Succs[0].b == s && b.Likely == BranchLikely ||
  3190  			b.Succs[1].b == s && b.Likely == BranchUnlikely {
  3191  			return likelyDistance
  3192  		}
  3193  		if b.Succs[0].b == s && b.Likely == BranchUnlikely ||
  3194  			b.Succs[1].b == s && b.Likely == BranchLikely {
  3195  			return unlikelyDistance
  3196  		}
  3197  	}
  3198  	// Note: the branch distance must be at least 1 to distinguish the control
  3199  	// value use from the first user in a successor block.
  3200  	return normalDistance
  3201  }
  3202  
  3203  func (s *regAllocState) debugPrintLive(stage string, f *Func, live [][]liveInfo, desired []desiredState) {
  3204  	fmt.Printf("%s: live values at end of each block: %s\n", stage, f.Name)
  3205  	for _, b := range f.Blocks {
  3206  		s.debugPrintLiveBlock(b, live[b.ID], &desired[b.ID])
  3207  	}
  3208  }
  3209  
  3210  func (s *regAllocState) debugPrintLiveBlock(b *Block, live []liveInfo, desired *desiredState) {
  3211  	fmt.Printf("  %s:", b)
  3212  	slices.SortFunc(live, func(a, b liveInfo) int {
  3213  		return cmp.Compare(a.ID, b.ID)
  3214  	})
  3215  	for _, x := range live {
  3216  		fmt.Printf(" v%d(%d)", x.ID, x.dist)
  3217  		for _, e := range desired.entries {
  3218  			if e.ID != x.ID {
  3219  				continue
  3220  			}
  3221  			fmt.Printf("[")
  3222  			first := true
  3223  			for _, r := range e.regs {
  3224  				if r == noRegister {
  3225  					continue
  3226  				}
  3227  				if !first {
  3228  					fmt.Printf(",")
  3229  				}
  3230  				fmt.Print(&s.registers[r])
  3231  				first = false
  3232  			}
  3233  			fmt.Printf("]")
  3234  		}
  3235  	}
  3236  	if avoid := desired.avoid; avoid != 0 {
  3237  		fmt.Printf(" avoid=%v", s.RegMaskString(avoid))
  3238  	}
  3239  	fmt.Println()
  3240  }
  3241  
  3242  // A desiredState represents desired register assignments.
  3243  type desiredState struct {
  3244  	// Desired assignments will be small, so we just use a list
  3245  	// of valueID+registers entries.
  3246  	entries []desiredStateEntry
  3247  	// Registers that other values want to be in.  This value will
  3248  	// contain at least the union of the regs fields of entries, but
  3249  	// may contain additional entries for values that were once in
  3250  	// this data structure but are no longer.
  3251  	avoid regMask
  3252  }
  3253  type desiredStateEntry struct {
  3254  	// (pre-regalloc) value
  3255  	ID ID
  3256  	// Registers it would like to be in, in priority order.
  3257  	// Unused slots are filled with noRegister.
  3258  	// For opcodes that return tuples, we track desired registers only
  3259  	// for the first element of the tuple (see desiredSecondReg for
  3260  	// tracking the desired register for second part of a tuple).
  3261  	regs [4]register
  3262  }
  3263  
  3264  // get returns a list of desired registers for value vid.
  3265  func (d *desiredState) get(vid ID) [4]register {
  3266  	for _, e := range d.entries {
  3267  		if e.ID == vid {
  3268  			return e.regs
  3269  		}
  3270  	}
  3271  	return [4]register{noRegister, noRegister, noRegister, noRegister}
  3272  }
  3273  
  3274  // add records that we'd like value vid to be in register r.
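        // For example (register names illustrative): if the entry for vid currently
        // holds [DX, CX, noRegister, noRegister], adding AX yields
        // [AX, DX, CX, noRegister], while adding CX instead moves it to the front,
        // yielding [CX, DX, noRegister, noRegister].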
  3275  func (d *desiredState) add(vid ID, r register) {
  3276  	d.avoid |= regMask(1) << r
  3277  	for i := range d.entries {
  3278  		e := &d.entries[i]
  3279  		if e.ID != vid {
  3280  			continue
  3281  		}
  3282  		if e.regs[0] == r {
  3283  			// Already known and highest priority
  3284  			return
  3285  		}
  3286  		for j := 1; j < len(e.regs); j++ {
  3287  			if e.regs[j] == r {
  3288  				// Move from lower priority to top priority
  3289  				copy(e.regs[1:], e.regs[:j])
  3290  				e.regs[0] = r
  3291  				return
  3292  			}
  3293  		}
  3294  		copy(e.regs[1:], e.regs[:])
  3295  		e.regs[0] = r
  3296  		return
  3297  	}
  3298  	d.entries = append(d.entries, desiredStateEntry{vid, [4]register{r, noRegister, noRegister, noRegister}})
  3299  }
  3300  
  3301  func (d *desiredState) addList(vid ID, regs [4]register) {
  3302  	// regs is in priority order, so iterate in reverse order.
  3303  	for i := len(regs) - 1; i >= 0; i-- {
  3304  		r := regs[i]
  3305  		if r != noRegister {
  3306  			d.add(vid, r)
  3307  		}
  3308  	}
  3309  }
  3310  
  3311  // clobber erases any desired registers in the set m.
  3312  func (d *desiredState) clobber(m regMask) {
  3313  	for i := 0; i < len(d.entries); {
  3314  		e := &d.entries[i]
  3315  		j := 0
  3316  		for _, r := range e.regs {
  3317  			if r != noRegister && m>>r&1 == 0 {
  3318  				e.regs[j] = r
  3319  				j++
  3320  			}
  3321  		}
  3322  		if j == 0 {
  3323  			// No more desired registers for this value.
  3324  			d.entries[i] = d.entries[len(d.entries)-1]
  3325  			d.entries = d.entries[:len(d.entries)-1]
  3326  			continue
  3327  		}
  3328  		for ; j < len(e.regs); j++ {
  3329  			e.regs[j] = noRegister
  3330  		}
  3331  		i++
  3332  	}
  3333  	d.avoid &^= m
  3334  }
  3335  
  3336  // copy copies a desired state from another desiredState x.
  3337  func (d *desiredState) copy(x *desiredState) {
  3338  	d.entries = append(d.entries[:0], x.entries...)
  3339  	d.avoid = x.avoid
  3340  }
  3341  
  3342  // remove removes the desired registers for vid and returns them.
  3343  func (d *desiredState) remove(vid ID) [4]register {
  3344  	for i := range d.entries {
  3345  		if d.entries[i].ID == vid {
  3346  			regs := d.entries[i].regs
  3347  			d.entries[i] = d.entries[len(d.entries)-1]
  3348  			d.entries = d.entries[:len(d.entries)-1]
  3349  			return regs
  3350  		}
  3351  	}
  3352  	return [4]register{noRegister, noRegister, noRegister, noRegister}
  3353  }
  3354  
  3355  // merge merges another desired state x into d. It reports whether
  3356  // the desired state changed.
  3357  func (d *desiredState) merge(x *desiredState) bool {
  3358  	oldAvoid := d.avoid
  3359  	d.avoid |= x.avoid
  3360  	// There should only be a few desired registers, so
  3361  	// linear insert is ok.
  3362  	for _, e := range x.entries {
  3363  		d.addList(e.ID, e.regs)
  3364  	}
  3365  	return oldAvoid != d.avoid
  3366  }
  3367  
  3368  // computeUnavoidableCalls computes the containsUnavoidableCall fields in the loop nest.
  3369  func (loopnest *loopnest) computeUnavoidableCalls() {
  3370  	f := loopnest.f
  3371  
  3372  	hasCall := f.Cache.allocBoolSlice(f.NumBlocks())
  3373  	defer f.Cache.freeBoolSlice(hasCall)
  3374  	for _, b := range f.Blocks {
  3375  		if b.containsCall() {
  3376  			hasCall[b.ID] = true
  3377  		}
  3378  	}
  3379  	found := f.Cache.allocSparseSet(f.NumBlocks())
  3380  	defer f.Cache.freeSparseSet(found)
  3381  	// Run a DFS to find a path through the loop that avoids all calls.
  3382  	// Such a path either escapes the loop or returns back to the header.
  3383  	// It isn't enough for an exit to not be dominated by any call; for example:
  3384  	// ... some loop
  3385  	// call1    call2
  3386  	//   \       /
  3387  	//     block
  3388  	// ...
  3389  	// block is not dominated by any single call, but there is no call-free path to it.
  3390  loopLoop:
  3391  	for _, l := range loopnest.loops {
  3392  		found.clear()
  3393  		tovisit := make([]*Block, 0, 8)
  3394  		tovisit = append(tovisit, l.header)
  3395  		for len(tovisit) > 0 {
  3396  			cur := tovisit[len(tovisit)-1]
  3397  			tovisit = tovisit[:len(tovisit)-1]
  3398  			if hasCall[cur.ID] {
  3399  				continue
  3400  			}
  3401  			for _, s := range cur.Succs {
  3402  				nb := s.Block()
  3403  				if nb == l.header {
  3404  					// Found a call-free path around the loop.
  3405  					continue loopLoop
  3406  				}
  3407  				if found.contains(nb.ID) {
  3408  					// Already found via another path.
  3409  					continue
  3410  				}
  3411  				nl := loopnest.b2l[nb.ID]
  3412  				if nl == nil || (nl.depth <= l.depth && nl != l) {
  3413  					// Left the loop.
  3414  					continue
  3415  				}
  3416  				tovisit = append(tovisit, nb)
  3417  				found.add(nb.ID)
  3418  			}
  3419  		}
  3420  		// No call-free path was found.
  3421  		l.containsUnavoidableCall = true
  3422  	}
  3423  }
  3424  
  3425  func (b *Block) containsCall() bool {
  3426  	if b.Kind == BlockDefer {
  3427  		return true
  3428  	}
  3429  	for _, v := range b.Values {
  3430  		if opcodeTable[v.Op].call {
  3431  			return true
  3432  		}
  3433  	}
  3434  	return false
  3435  }
  3436  
