Source file src/cmd/compile/internal/ssa/regalloc.go

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Register allocation.
     6  //
     7  // We use a version of a linear scan register allocator. We treat the
     8  // whole function as a single long basic block and run through
     9  // it using a greedy register allocator. Then all merge edges
    10  // (those targeting a block with len(Preds)>1) are processed to
    11  // shuffle data into the place that the target of the edge expects.
    12  //
    13  // The greedy allocator moves values into registers just before they
    14  // are used, spills registers only when necessary, and spills the
    15  // value whose next use is farthest in the future.
    16  //
    17  // The register allocator requires that a block is not scheduled until
     18  // at least one of its predecessors has been scheduled. The most recent
    19  // such predecessor provides the starting register state for a block.
    20  //
    21  // It also requires that there are no critical edges (critical =
    22  // comes from a block with >1 successor and goes to a block with >1
    23  // predecessor).  This makes it easy to add fixup code on merge edges -
    24  // the source of a merge edge has only one successor, so we can add
    25  // fixup code to the end of that block.
    26  
    27  // Spilling
    28  //
    29  // During the normal course of the allocator, we might throw a still-live
    30  // value out of all registers. When that value is subsequently used, we must
    31  // load it from a slot on the stack. We must also issue an instruction to
    32  // initialize that stack location with a copy of v.
    33  //
    34  // pre-regalloc:
    35  //   (1) v = Op ...
    36  //   (2) x = Op ...
    37  //   (3) ... = Op v ...
    38  //
    39  // post-regalloc:
    40  //   (1) v = Op ...    : AX // computes v, store result in AX
    41  //       s = StoreReg v     // spill v to a stack slot
    42  //   (2) x = Op ...    : AX // some other op uses AX
    43  //       c = LoadReg s : CX // restore v from stack slot
    44  //   (3) ... = Op c ...     // use the restored value
    45  //
    46  // Allocation occurs normally until we reach (3) and we realize we have
    47  // a use of v and it isn't in any register. At that point, we allocate
    48  // a spill (a StoreReg) for v. We can't determine the correct place for
    49  // the spill at this point, so we allocate the spill as blockless initially.
    50  // The restore is then generated to load v back into a register so it can
    51  // be used. Subsequent uses of v will use the restored value c instead.
    52  //
    53  // What remains is the question of where to schedule the spill.
    54  // During allocation, we keep track of the dominator of all restores of v.
    55  // The spill of v must dominate that block. The spill must also be issued at
    56  // a point where v is still in a register.
    57  //
    58  // To find the right place, start at b, the block which dominates all restores.
    59  //  - If b is v.Block, then issue the spill right after v.
    60  //    It is known to be in a register at that point, and dominates any restores.
    61  //  - Otherwise, if v is in a register at the start of b,
    62  //    put the spill of v at the start of b.
    63  //  - Otherwise, set b = immediate dominator of b, and repeat.
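         //
         // A sketch of that walk in pseudocode (illustrative only; the actual
         // spill-placement logic lives later in this pass):
         //
         //	b := dominator of all restores of v
         //	for {
         //		if b == v.Block              { place spill just after v; break }
         //		if v in a register at start of b { place spill at start of b; break }
         //		b = idom(b)
         //	}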
    64  //
    65  // Phi values are special, as always. We define two kinds of phis, those
    66  // where the merge happens in a register (a "register" phi) and those where
    67  // the merge happens in a stack location (a "stack" phi).
    68  //
    69  // A register phi must have the phi and all of its inputs allocated to the
    70  // same register. Register phis are spilled similarly to regular ops.
    71  //
    72  // A stack phi must have the phi and all of its inputs allocated to the same
    73  // stack location. Stack phis start out life already spilled - each phi
    74  // input must be a store (using StoreReg) at the end of the corresponding
    75  // predecessor block.
    76  //     b1: y = ... : AX        b2: z = ... : BX
    77  //         y2 = StoreReg y         z2 = StoreReg z
    78  //         goto b3                 goto b3
    79  //     b3: x = phi(y2, z2)
    80  // The stack allocator knows that StoreReg args of stack-allocated phis
    81  // must be allocated to the same stack slot as the phi that uses them.
    82  // x is now a spilled value and a restore must appear before its first use.
    83  
    84  // TODO
    85  
    86  // Use an affinity graph to mark two values which should use the
    87  // same register. This affinity graph will be used to prefer certain
    88  // registers for allocation. This affinity helps eliminate moves that
    89  // are required for phi implementations and helps generate allocations
    90  // for 2-register architectures.
    91  
    92  // Note: regalloc generates a not-quite-SSA output. If we have:
    93  //
    94  //             b1: x = ... : AX
    95  //                 x2 = StoreReg x
    96  //                 ... AX gets reused for something else ...
    97  //                 if ... goto b3 else b4
    98  //
    99  //   b3: x3 = LoadReg x2 : BX       b4: x4 = LoadReg x2 : CX
   100  //       ... use x3 ...                 ... use x4 ...
   101  //
   102  //             b2: ... use x3 ...
   103  //
   104  // If b3 is the primary predecessor of b2, then we use x3 in b2 and
   105  // add a x4:CX->BX copy at the end of b4.
   106  // But the definition of x3 doesn't dominate b2.  We should really
   107  // insert an extra phi at the start of b2 (x5=phi(x3,x4):BX) to keep
   108  // SSA form. For now, we ignore this problem as remaining in strict
   109  // SSA form isn't needed after regalloc. We'll just leave the use
   110  // of x3 not dominated by the definition of x3, and the CX->BX copy
   111  // will have no use (so don't run deadcode after regalloc!).
   112  // TODO: maybe we should introduce these extra phis?
   113  
   114  package ssa
   115  
   116  import (
   117  	"cmd/compile/internal/base"
   118  	"cmd/compile/internal/ir"
   119  	"cmd/compile/internal/types"
   120  	"cmd/internal/src"
   121  	"cmd/internal/sys"
   122  	"cmp"
   123  	"fmt"
   124  	"internal/buildcfg"
   125  	"math"
   126  	"math/bits"
   127  	"slices"
   128  	"unsafe"
   129  )
   130  
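         // Debug output thresholds. The pass's debug level is compared against
         // these to enable progressively more verbose diagnostics.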
   131  const (
   132  	moveSpills = iota
   133  	logSpills
   134  	regDebug
   135  	stackDebug
   136  )
   137  
   138  // distance is a measure of how far into the future values are used.
   139  // distance is measured in units of instructions.
   140  const (
   141  	likelyDistance   = 1
   142  	normalDistance   = 10
   143  	unlikelyDistance = 100
   144  )
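         // These weights are added to use distances when liveness is propagated
         // backwards across block edges, so (roughly speaking) a use that is
         // reachable only through an unlikely branch looks ~100 instructions away
         // and becomes a preferred eviction candidate in allocReg.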
   145  
   146  // regalloc performs register allocation on f. It sets f.RegAlloc
   147  // to the resulting allocation.
   148  func regalloc(f *Func) {
   149  	var s regAllocState
   150  	s.init(f)
   151  	s.regalloc(f)
   152  	s.close()
   153  }
   154  
   155  type register uint8
   156  
   157  const noRegister register = 255
   158  
   159  // For bulk initializing
   160  var noRegisters [32]register = [32]register{
   161  	noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister,
   162  	noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister,
   163  	noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister,
   164  	noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister, noRegister,
   165  }
   166  
   167  // A regMask encodes a set of machine registers.
   168  // TODO: regMask -> regSet?
   169  type regMask uint64
   170  
   171  func (m regMask) String() string {
   172  	s := ""
   173  	for r := register(0); m != 0; r++ {
   174  		if m>>r&1 == 0 {
   175  			continue
   176  		}
   177  		m &^= regMask(1) << r
   178  		if s != "" {
   179  			s += " "
   180  		}
   181  		s += fmt.Sprintf("r%d", r)
   182  	}
   183  	return s
   184  }
   185  
   186  func (m regMask) contains(r register) bool {
   187  	return m>>r&1 != 0
   188  }
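         // Illustrative example (not used by the allocator): a mask with bits 0
         // and 3 set describes the register set {r0, r3}.
         //
         //	m := regMask(1)<<0 | regMask(1)<<3
         //	_ = m.String()    // "r0 r3"
         //	_ = m.contains(3) // true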
   189  
   190  func (s *regAllocState) RegMaskString(m regMask) string {
   191  	str := ""
   192  	for r := register(0); m != 0; r++ {
   193  		if m>>r&1 == 0 {
   194  			continue
   195  		}
   196  		m &^= regMask(1) << r
   197  		if str != "" {
   198  			str += " "
   199  		}
   200  		str += s.registers[r].String()
   201  	}
   202  	return str
   203  }
   204  
   205  // countRegs returns the number of set bits in the register mask.
   206  func countRegs(r regMask) int {
   207  	return bits.OnesCount64(uint64(r))
   208  }
   209  
   210  // pickReg picks an arbitrary register from the register mask.
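         // For example, pickReg(0b1100) returns register 2, the lowest set bit.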
   211  func pickReg(r regMask) register {
   212  	if r == 0 {
   213  		panic("can't pick a register from an empty set")
   214  	}
   215  	// pick the lowest one
   216  	return register(bits.TrailingZeros64(uint64(r)))
   217  }
   218  
   219  type use struct {
   220  	// distance from start of the block to a use of a value
   221  	//   dist == 0                 used by first instruction in block
   222  	//   dist == len(b.Values)-1   used by last instruction in block
   223  	//   dist == len(b.Values)     used by block's control value
   224  	//   dist  > len(b.Values)     used by a subsequent block
   225  	dist int32
   226  	pos  src.XPos // source position of the use
   227  	next *use     // linked list of uses of a value in nondecreasing dist order
   228  }
   229  
   230  // A valState records the register allocation state for a (pre-regalloc) value.
   231  type valState struct {
   232  	regs              regMask // the set of registers holding a Value (usually just one)
   233  	uses              *use    // list of uses in this block
   234  	spill             *Value  // spilled copy of the Value (if any)
   235  	restoreMin        int32   // minimum of all restores' blocks' sdom.entry
   236  	restoreMax        int32   // maximum of all restores' blocks' sdom.exit
    237  	needReg           bool    // cached value of v.needRegister()
   238  	rematerializeable bool    // cached value of v.rematerializeable()
   239  }
   240  
   241  type regState struct {
   242  	v *Value // Original (preregalloc) Value stored in this register.
   243  	c *Value // A Value equal to v which is currently in a register.  Might be v or a copy of it.
   244  	// If a register is unused, v==c==nil
   245  }
   246  
   247  type regAllocState struct {
   248  	f *Func
   249  
   250  	sdom        SparseTree
   251  	registers   []Register
   252  	numRegs     register
   253  	SPReg       register
   254  	SBReg       register
   255  	GReg        register
   256  	ZeroIntReg  register
   257  	allocatable regMask
   258  
   259  	// live values at the end of each block.  live[b.ID] is a list of value IDs
   260  	// which are live at the end of b, together with a count of how many instructions
   261  	// forward to the next use.
   262  	live [][]liveInfo
   263  	// desired register assignments at the end of each block.
   264  	// Note that this is a static map computed before allocation occurs. Dynamic
   265  	// register desires (from partially completed allocations) will trump
   266  	// this information.
   267  	desired []desiredState
   268  
   269  	// current state of each (preregalloc) Value
   270  	values []valState
   271  
   272  	// ID of SP, SB values
   273  	sp, sb ID
   274  
   275  	// For each Value, map from its value ID back to the
   276  	// preregalloc Value it was derived from.
   277  	orig []*Value
   278  
   279  	// current state of each register.
   280  	// Includes only registers in allocatable.
   281  	regs []regState
   282  
   283  	// registers that contain values which can't be kicked out
   284  	nospill regMask
   285  
   286  	// mask of registers currently in use
   287  	used regMask
   288  
   289  	// mask of registers used since the start of the current block
   290  	usedSinceBlockStart regMask
   291  
   292  	// mask of registers used in the current instruction
   293  	tmpused regMask
   294  
   295  	// current block we're working on
   296  	curBlock *Block
   297  
   298  	// cache of use records
   299  	freeUseRecords *use
   300  
   301  	// endRegs[blockid] is the register state at the end of each block.
   302  	// encoded as a set of endReg records.
   303  	endRegs [][]endReg
   304  
   305  	// startRegs[blockid] is the register state at the start of merge blocks.
   306  	// saved state does not include the state of phi ops in the block.
   307  	startRegs [][]startReg
   308  
   309  	// startRegsMask is a mask of the registers in startRegs[curBlock.ID].
    310  	// Registers dropped from startRegsMask are later synchronized back to
   311  	// startRegs by dropping from there as well.
   312  	startRegsMask regMask
   313  
   314  	// spillLive[blockid] is the set of live spills at the end of each block
   315  	spillLive [][]ID
   316  
   317  	// a set of copies we generated to move things around, and
   318  	// whether it is used in shuffle. Unused copies will be deleted.
   319  	copies map[*Value]bool
   320  
   321  	loopnest *loopnest
   322  
   323  	// choose a good order in which to visit blocks for allocation purposes.
   324  	visitOrder []*Block
   325  
   326  	// blockOrder[b.ID] corresponds to the index of block b in visitOrder.
   327  	blockOrder []int32
   328  
   329  	// whether to insert instructions that clobber dead registers at call sites
   330  	doClobber bool
   331  
   332  	// For each instruction index in a basic block, the index of the next call
   333  	// at or after that instruction index.
    334  	// If there is no next call, the entry is math.MaxInt32.
   335  	// nextCall for a call instruction points to itself.
   336  	// (Indexes and results are pre-regalloc.)
   337  	nextCall []int32
   338  
   339  	// Index of the instruction we're currently working on.
   340  	// Index is expressed in terms of the pre-regalloc b.Values list.
   341  	curIdx int
   342  }
   343  
   344  type endReg struct {
   345  	r register
   346  	v *Value // pre-regalloc value held in this register (TODO: can we use ID here?)
   347  	c *Value // cached version of the value
   348  }
   349  
   350  type startReg struct {
   351  	r   register
   352  	v   *Value   // pre-regalloc value needed in this register
   353  	c   *Value   // cached version of the value
   354  	pos src.XPos // source position of use of this register
   355  }
   356  
   357  // freeReg frees up register r. Any current user of r is kicked out.
   358  func (s *regAllocState) freeReg(r register) {
   359  	if !s.allocatable.contains(r) && !s.isGReg(r) {
   360  		return
   361  	}
   362  	v := s.regs[r].v
   363  	if v == nil {
   364  		s.f.Fatalf("tried to free an already free register %d\n", r)
   365  	}
   366  
   367  	// Mark r as unused.
   368  	if s.f.pass.debug > regDebug {
   369  		fmt.Printf("freeReg %s (dump %s/%s)\n", &s.registers[r], v, s.regs[r].c)
   370  	}
   371  	s.regs[r] = regState{}
   372  	s.values[v.ID].regs &^= regMask(1) << r
   373  	s.used &^= regMask(1) << r
   374  }
   375  
   376  // freeRegs frees up all registers listed in m.
   377  func (s *regAllocState) freeRegs(m regMask) {
   378  	for m&s.used != 0 {
   379  		s.freeReg(pickReg(m & s.used))
   380  	}
   381  }
   382  
   383  // clobberRegs inserts instructions that clobber registers listed in m.
   384  func (s *regAllocState) clobberRegs(m regMask) {
   385  	m &= s.allocatable & s.f.Config.gpRegMask // only integer register can contain pointers, only clobber them
   386  	for m != 0 {
   387  		r := pickReg(m)
   388  		m &^= 1 << r
   389  		x := s.curBlock.NewValue0(src.NoXPos, OpClobberReg, types.TypeVoid)
   390  		s.f.setHome(x, &s.registers[r])
   391  	}
   392  }
   393  
   394  // setOrig records that c's original value is the same as
   395  // v's original value.
   396  func (s *regAllocState) setOrig(c *Value, v *Value) {
   397  	if int(c.ID) >= cap(s.orig) {
   398  		x := s.f.Cache.allocValueSlice(int(c.ID) + 1)
   399  		copy(x, s.orig)
   400  		s.f.Cache.freeValueSlice(s.orig)
   401  		s.orig = x
   402  	}
   403  	for int(c.ID) >= len(s.orig) {
   404  		s.orig = append(s.orig, nil)
   405  	}
   406  	if s.orig[c.ID] != nil {
   407  		s.f.Fatalf("orig value set twice %s %s", c, v)
   408  	}
   409  	s.orig[c.ID] = s.orig[v.ID]
   410  }
   411  
   412  // assignReg assigns register r to hold c, a copy of v.
   413  // r must be unused.
   414  func (s *regAllocState) assignReg(r register, v *Value, c *Value) {
   415  	if s.f.pass.debug > regDebug {
   416  		fmt.Printf("assignReg %s %s/%s\n", &s.registers[r], v, c)
   417  	}
   418  	// Allocate v to r.
   419  	s.values[v.ID].regs |= regMask(1) << r
   420  	s.f.setHome(c, &s.registers[r])
   421  
   422  	// Allocate r to v.
   423  	if !s.allocatable.contains(r) && !s.isGReg(r) {
   424  		return
   425  	}
   426  	if s.regs[r].v != nil {
   427  		s.f.Fatalf("tried to assign register %d to %s/%s but it is already used by %s", r, v, c, s.regs[r].v)
   428  	}
   429  	s.regs[r] = regState{v, c}
   430  	s.used |= regMask(1) << r
   431  }
   432  
   433  // allocReg chooses a register from the set of registers in mask.
   434  // If there is no unused register, a Value will be kicked out of
   435  // a register to make room.
   436  func (s *regAllocState) allocReg(mask regMask, v *Value) register {
   437  	if v.OnWasmStack {
   438  		return noRegister
   439  	}
   440  
   441  	mask &= s.allocatable
   442  	mask &^= s.nospill
   443  	if mask == 0 {
   444  		s.f.Fatalf("no register available for %s", v.LongString())
   445  	}
   446  
   447  	// Pick an unused register if one is available.
   448  	if mask&^s.used != 0 {
   449  		r := pickReg(mask &^ s.used)
   450  		s.usedSinceBlockStart |= regMask(1) << r
   451  		return r
   452  	}
   453  
   454  	// Pick a value to spill. Spill the value with the
   455  	// farthest-in-the-future use.
   456  	// TODO: Prefer registers with already spilled Values?
   457  	// TODO: Modify preference using affinity graph.
   458  	// TODO: if a single value is in multiple registers, spill one of them
   459  	// before spilling a value in just a single register.
   460  
   461  	// Find a register to spill. We spill the register containing the value
   462  	// whose next use is as far in the future as possible.
   463  	// https://en.wikipedia.org/wiki/Page_replacement_algorithm#The_theoretically_optimal_page_replacement_algorithm
   464  	var r register
   465  	maxuse := int32(-1)
   466  	for t := register(0); t < s.numRegs; t++ {
   467  		if mask>>t&1 == 0 {
   468  			continue
   469  		}
   470  		v := s.regs[t].v
   471  		if n := s.values[v.ID].uses.dist; n > maxuse {
   472  			// v's next use is farther in the future than any value
   473  			// we've seen so far. A new best spill candidate.
   474  			r = t
   475  			maxuse = n
   476  		}
   477  	}
   478  	if maxuse == -1 {
   479  		s.f.Fatalf("couldn't find register to spill")
   480  	}
   481  
   482  	if s.f.Config.ctxt.Arch.Arch == sys.ArchWasm {
   483  		// TODO(neelance): In theory this should never happen, because all wasm registers are equal.
   484  		// So if there is still a free register, the allocation should have picked that one in the first place instead of
   485  		// trying to kick some other value out. In practice, this case does happen and it breaks the stack optimization.
   486  		s.freeReg(r)
   487  		return r
   488  	}
   489  
   490  	// Try to move it around before kicking out, if there is a free register.
   491  	// We generate a Copy and record it. It will be deleted if never used.
   492  	v2 := s.regs[r].v
   493  	m := s.compatRegs(v2.Type) &^ s.used &^ s.tmpused &^ (regMask(1) << r)
   494  	if m != 0 && !s.values[v2.ID].rematerializeable && countRegs(s.values[v2.ID].regs) == 1 {
   495  		s.usedSinceBlockStart |= regMask(1) << r
   496  		r2 := pickReg(m)
   497  		c := s.curBlock.NewValue1(v2.Pos, OpCopy, v2.Type, s.regs[r].c)
   498  		s.copies[c] = false
   499  		if s.f.pass.debug > regDebug {
   500  			fmt.Printf("copy %s to %s : %s\n", v2, c, &s.registers[r2])
   501  		}
   502  		s.setOrig(c, v2)
   503  		s.assignReg(r2, v2, c)
   504  	}
   505  
   506  	// If the evicted register isn't used between the start of the block
   507  	// and now then there is no reason to even request it on entry. We can
   508  	// drop from startRegs in that case.
   509  	if s.usedSinceBlockStart&(regMask(1)<<r) == 0 {
    510  		if s.startRegsMask&(regMask(1)<<r) != 0 {
   511  			if s.f.pass.debug > regDebug {
   512  				fmt.Printf("dropped from startRegs: %s\n", &s.registers[r])
   513  			}
   514  			s.startRegsMask &^= regMask(1) << r
   515  		}
   516  	}
   517  
   518  	s.freeReg(r)
   519  	s.usedSinceBlockStart |= regMask(1) << r
   520  	return r
   521  }
   522  
   523  // makeSpill returns a Value which represents the spilled value of v.
   524  // b is the block in which the spill is used.
   525  func (s *regAllocState) makeSpill(v *Value, b *Block) *Value {
   526  	vi := &s.values[v.ID]
   527  	if vi.spill != nil {
   528  		// Final block not known - keep track of subtree where restores reside.
   529  		vi.restoreMin = min(vi.restoreMin, s.sdom[b.ID].entry)
   530  		vi.restoreMax = max(vi.restoreMax, s.sdom[b.ID].exit)
   531  		return vi.spill
   532  	}
   533  	// Make a spill for v. We don't know where we want
   534  	// to put it yet, so we leave it blockless for now.
   535  	spill := s.f.newValueNoBlock(OpStoreReg, v.Type, v.Pos)
   536  	// We also don't know what the spill's arg will be.
   537  	// Leave it argless for now.
   538  	s.setOrig(spill, v)
   539  	vi.spill = spill
   540  	vi.restoreMin = s.sdom[b.ID].entry
   541  	vi.restoreMax = s.sdom[b.ID].exit
   542  	return spill
   543  }
   544  
   545  // allocValToReg allocates v to a register selected from regMask and
   546  // returns the register copy of v. Any previous user is kicked out and spilled
   547  // (if necessary). Load code is added at the current pc. If nospill is set the
   548  // allocated register is marked nospill so the assignment cannot be
   549  // undone until the caller allows it by clearing nospill. Returns a
   550  // *Value which is either v or a copy of v allocated to the chosen register.
   551  func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, pos src.XPos) *Value {
   552  	if s.f.Config.ctxt.Arch.Arch == sys.ArchWasm && v.rematerializeable() {
   553  		c := v.copyIntoWithXPos(s.curBlock, pos)
   554  		c.OnWasmStack = true
   555  		s.setOrig(c, v)
   556  		return c
   557  	}
   558  	if v.OnWasmStack {
   559  		return v
   560  	}
   561  
   562  	vi := &s.values[v.ID]
   563  	pos = pos.WithNotStmt()
   564  	// Check if v is already in a requested register.
   565  	if mask&vi.regs != 0 {
   566  		mask &= vi.regs
   567  		r := pickReg(mask)
   568  		if mask.contains(s.SPReg) {
   569  			// Prefer the stack pointer if it is allowed.
   570  			// (Needed because the op might have an Aux symbol
   571  			// that needs SP as its base.)
   572  			r = s.SPReg
   573  		}
   574  		if !s.allocatable.contains(r) {
   575  			return v // v is in a fixed register
   576  		}
   577  		if s.regs[r].v != v || s.regs[r].c == nil {
   578  			panic("bad register state")
   579  		}
   580  		if nospill {
   581  			s.nospill |= regMask(1) << r
   582  		}
   583  		s.usedSinceBlockStart |= regMask(1) << r
   584  		return s.regs[r].c
   585  	}
   586  
   587  	var r register
   588  	// If nospill is set, the value is used immediately, so it can live on the WebAssembly stack.
   589  	onWasmStack := nospill && s.f.Config.ctxt.Arch.Arch == sys.ArchWasm
   590  	if !onWasmStack {
   591  		// Allocate a register.
   592  		r = s.allocReg(mask, v)
   593  	}
   594  
   595  	// Allocate v to the new register.
   596  	var c *Value
   597  	if vi.regs != 0 {
   598  		// Copy from a register that v is already in.
   599  		var current *Value
   600  		if vi.regs&^s.allocatable != 0 {
   601  			// v is in a fixed register, prefer that
   602  			current = v
   603  		} else {
   604  			r2 := pickReg(vi.regs)
   605  			if s.regs[r2].v != v {
   606  				panic("bad register state")
   607  			}
   608  			current = s.regs[r2].c
   609  			s.usedSinceBlockStart |= regMask(1) << r2
   610  		}
   611  		c = s.curBlock.NewValue1(pos, OpCopy, v.Type, current)
   612  	} else if v.rematerializeable() {
   613  		// Rematerialize instead of loading from the spill location.
   614  		c = v.copyIntoWithXPos(s.curBlock, pos)
    615  		// We need to consider its output mask and potentially issue a Copy
    616  		// if there are register mask conflicts.
    617  		// This currently happens only in the SIMD package, between the GP and FP
    618  		// register classes: Intel's vector extensions can place an integer value
    619  		// into an FP register, where it is viewed as a vector (e.g. VPSLL[BWDQ]).
    620  		// Because the GP and FP masks do not overlap, mask & sourceMask == 0
    621  		// detects this situation reliably.
   622  		sourceMask := s.regspec(c).outputs[0].regs
   623  		if mask&sourceMask == 0 && !onWasmStack {
   624  			s.setOrig(c, v)
   625  			s.assignReg(s.allocReg(sourceMask, v), v, c)
    626  			// v.Type for the new OpCopy is likely wrong, and that might delay the problem
    627  			// until SSA-to-asm lowering, which may need the type to generate the right
    628  			// assembly for OpCopy. For Intel's GP-to-FP move it happens to be right,
    629  			// because the MOV instruction has such a variant.
    630  			// But it's unclear for other architectures or situations, and the problem
    631  			// might surface when the assembler sees illegal instructions.
    632  			// For now we still pick v.Type, because at least its size should be correct
    633  			// for the rematerialization case the amd64 SIMD package exposed.
   634  			// TODO: We might need to figure out a way to find the correct type or make
   635  			// the asm lowering use reg info only for OpCopy.
   636  			c = s.curBlock.NewValue1(pos, OpCopy, v.Type, c)
   637  		}
   638  	} else {
   639  		// Load v from its spill location.
   640  		spill := s.makeSpill(v, s.curBlock)
   641  		if s.f.pass.debug > logSpills {
   642  			s.f.Warnl(vi.spill.Pos, "load spill for %v from %v", v, spill)
   643  		}
   644  		c = s.curBlock.NewValue1(pos, OpLoadReg, v.Type, spill)
   645  	}
   646  
   647  	s.setOrig(c, v)
   648  
   649  	if onWasmStack {
   650  		c.OnWasmStack = true
   651  		return c
   652  	}
   653  
   654  	s.assignReg(r, v, c)
   655  	if c.Op == OpLoadReg && s.isGReg(r) {
   656  		s.f.Fatalf("allocValToReg.OpLoadReg targeting g: " + c.LongString())
   657  	}
   658  	if nospill {
   659  		s.nospill |= regMask(1) << r
   660  	}
   661  	return c
   662  }
   663  
    664  // isLeaf reports whether f is a leaf function, i.e. it makes no calls (tail calls don't count).
   665  func isLeaf(f *Func) bool {
   666  	for _, b := range f.Blocks {
   667  		for _, v := range b.Values {
   668  			if v.Op.IsCall() && !v.Op.IsTailCall() {
   669  				// tail call is not counted as it does not save the return PC or need a frame
   670  				return false
   671  			}
   672  		}
   673  	}
   674  	return true
   675  }
   676  
   677  // needRegister reports whether v needs a register.
   678  func (v *Value) needRegister() bool {
   679  	return !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && !v.Type.IsTuple()
   680  }
   681  
   682  func (s *regAllocState) init(f *Func) {
   683  	s.f = f
   684  	s.f.RegAlloc = s.f.Cache.locs[:0]
   685  	s.registers = f.Config.registers
   686  	if nr := len(s.registers); nr == 0 || nr > int(noRegister) || nr > int(unsafe.Sizeof(regMask(0))*8) {
   687  		s.f.Fatalf("bad number of registers: %d", nr)
   688  	} else {
   689  		s.numRegs = register(nr)
   690  	}
   691  	// Locate SP, SB, and g registers.
   692  	s.SPReg = noRegister
   693  	s.SBReg = noRegister
   694  	s.GReg = noRegister
   695  	s.ZeroIntReg = noRegister
   696  	for r := register(0); r < s.numRegs; r++ {
   697  		switch s.registers[r].String() {
   698  		case "SP":
   699  			s.SPReg = r
   700  		case "SB":
   701  			s.SBReg = r
   702  		case "g":
   703  			s.GReg = r
   704  		case "ZERO": // TODO: arch-specific?
   705  			s.ZeroIntReg = r
   706  		}
   707  	}
   708  	// Make sure we found all required registers.
   709  	switch noRegister {
   710  	case s.SPReg:
   711  		s.f.Fatalf("no SP register found")
   712  	case s.SBReg:
   713  		s.f.Fatalf("no SB register found")
   714  	case s.GReg:
   715  		if f.Config.hasGReg {
   716  			s.f.Fatalf("no g register found")
   717  		}
   718  	}
   719  
   720  	// Figure out which registers we're allowed to use.
   721  	s.allocatable = s.f.Config.gpRegMask | s.f.Config.fpRegMask | s.f.Config.specialRegMask
   722  	s.allocatable &^= 1 << s.SPReg
   723  	s.allocatable &^= 1 << s.SBReg
   724  	if s.f.Config.hasGReg {
   725  		s.allocatable &^= 1 << s.GReg
   726  	}
   727  	if s.ZeroIntReg != noRegister {
   728  		s.allocatable &^= 1 << s.ZeroIntReg
   729  	}
   730  	if buildcfg.FramePointerEnabled && s.f.Config.FPReg >= 0 {
   731  		s.allocatable &^= 1 << uint(s.f.Config.FPReg)
   732  	}
   733  	if s.f.Config.LinkReg != -1 {
   734  		if isLeaf(f) {
   735  			// Leaf functions don't save/restore the link register.
   736  			s.allocatable &^= 1 << uint(s.f.Config.LinkReg)
   737  		}
   738  	}
   739  	if s.f.Config.ctxt.Flag_dynlink {
   740  		switch s.f.Config.arch {
   741  		case "386":
   742  			// nothing to do.
   743  			// Note that for Flag_shared (position independent code)
   744  			// we do need to be careful, but that carefulness is hidden
   745  			// in the rewrite rules so we always have a free register
   746  			// available for global load/stores. See _gen/386.rules (search for Flag_shared).
   747  		case "amd64":
   748  			s.allocatable &^= 1 << 15 // R15
   749  		case "arm":
   750  			s.allocatable &^= 1 << 9 // R9
   751  		case "arm64":
   752  			// nothing to do
   753  		case "loong64": // R2 (aka TP) already reserved.
   754  			// nothing to do
   755  		case "ppc64le": // R2 already reserved.
   756  			// nothing to do
   757  		case "riscv64": // X3 (aka GP) and X4 (aka TP) already reserved.
   758  			// nothing to do
   759  		case "s390x":
   760  			s.allocatable &^= 1 << 11 // R11
   761  		default:
   762  			s.f.fe.Fatalf(src.NoXPos, "arch %s not implemented", s.f.Config.arch)
   763  		}
   764  	}
   765  
   766  	// Linear scan register allocation can be influenced by the order in which blocks appear.
   767  	// Decouple the register allocation order from the generated block order.
   768  	// This also creates an opportunity for experiments to find a better order.
   769  	s.visitOrder = layoutRegallocOrder(f)
   770  
   771  	// Compute block order. This array allows us to distinguish forward edges
   772  	// from backward edges and compute how far they go.
   773  	s.blockOrder = make([]int32, f.NumBlocks())
   774  	for i, b := range s.visitOrder {
   775  		s.blockOrder[b.ID] = int32(i)
   776  	}
   777  
   778  	s.regs = make([]regState, s.numRegs)
   779  	nv := f.NumValues()
   780  	if cap(s.f.Cache.regallocValues) >= nv {
   781  		s.f.Cache.regallocValues = s.f.Cache.regallocValues[:nv]
   782  	} else {
   783  		s.f.Cache.regallocValues = make([]valState, nv)
   784  	}
   785  	s.values = s.f.Cache.regallocValues
   786  	s.orig = s.f.Cache.allocValueSlice(nv)
   787  	s.copies = make(map[*Value]bool)
   788  	for _, b := range s.visitOrder {
   789  		for _, v := range b.Values {
   790  			if v.needRegister() {
   791  				s.values[v.ID].needReg = true
   792  				s.values[v.ID].rematerializeable = v.rematerializeable()
   793  				s.orig[v.ID] = v
   794  			}
   795  			// Note: needReg is false for values returning Tuple types.
   796  			// Instead, we mark the corresponding Selects as needReg.
   797  		}
   798  	}
   799  	s.computeLive()
   800  
   801  	s.endRegs = make([][]endReg, f.NumBlocks())
   802  	s.startRegs = make([][]startReg, f.NumBlocks())
   803  	s.spillLive = make([][]ID, f.NumBlocks())
   804  	s.sdom = f.Sdom()
   805  
   806  	// wasm: Mark instructions that can be optimized to have their values only on the WebAssembly stack.
   807  	if f.Config.ctxt.Arch.Arch == sys.ArchWasm {
   808  		canLiveOnStack := f.newSparseSet(f.NumValues())
   809  		defer f.retSparseSet(canLiveOnStack)
   810  		for _, b := range f.Blocks {
   811  			// New block. Clear candidate set.
   812  			canLiveOnStack.clear()
   813  			for _, c := range b.ControlValues() {
   814  				if c.Uses == 1 && !opcodeTable[c.Op].generic {
   815  					canLiveOnStack.add(c.ID)
   816  				}
   817  			}
   818  			// Walking backwards.
   819  			for i := len(b.Values) - 1; i >= 0; i-- {
   820  				v := b.Values[i]
   821  				if canLiveOnStack.contains(v.ID) {
   822  					v.OnWasmStack = true
   823  				} else {
    824  				// Value cannot live on the stack. Values are not allowed to be reordered, so clear the candidate set.
   825  					canLiveOnStack.clear()
   826  				}
   827  				for _, arg := range v.Args {
   828  					// Value can live on the stack if:
   829  					// - it is only used once
   830  					// - it is used in the same basic block
   831  					// - it is not a "mem" value
   832  					// - it is a WebAssembly op
   833  					if arg.Uses == 1 && arg.Block == v.Block && !arg.Type.IsMemory() && !opcodeTable[arg.Op].generic {
   834  						canLiveOnStack.add(arg.ID)
   835  					}
   836  				}
   837  			}
   838  		}
   839  	}
   840  
   841  	// The clobberdeadreg experiment inserts code to clobber dead registers
   842  	// at call sites.
   843  	// Ignore huge functions to avoid doing too much work.
   844  	if base.Flag.ClobberDeadReg && len(s.f.Blocks) <= 10000 {
   845  		// TODO: honor GOCLOBBERDEADHASH, or maybe GOSSAHASH.
   846  		s.doClobber = true
   847  	}
   848  }
   849  
   850  func (s *regAllocState) close() {
   851  	s.f.Cache.freeValueSlice(s.orig)
   852  }
   853  
   854  // Adds a use record for id at distance dist from the start of the block.
   855  // All calls to addUse must happen with nonincreasing dist.
   856  func (s *regAllocState) addUse(id ID, dist int32, pos src.XPos) {
   857  	r := s.freeUseRecords
   858  	if r != nil {
   859  		s.freeUseRecords = r.next
   860  	} else {
   861  		r = &use{}
   862  	}
   863  	r.dist = dist
   864  	r.pos = pos
   865  	r.next = s.values[id].uses
   866  	s.values[id].uses = r
   867  	if r.next != nil && dist > r.next.dist {
   868  		s.f.Fatalf("uses added in wrong order")
   869  	}
   870  }
   871  
   872  // advanceUses advances the uses of v's args from the state before v to the state after v.
   873  // Any values which have no more uses are deallocated from registers.
   874  func (s *regAllocState) advanceUses(v *Value) {
   875  	for _, a := range v.Args {
   876  		if !s.values[a.ID].needReg {
   877  			continue
   878  		}
   879  		ai := &s.values[a.ID]
   880  		r := ai.uses
   881  		ai.uses = r.next
   882  		if r.next == nil || (!opcodeTable[a.Op].fixedReg && r.next.dist > s.nextCall[s.curIdx]) {
   883  			// Value is dead (or is not used again until after a call), free all registers that hold it.
   884  			s.freeRegs(ai.regs)
   885  		}
   886  		r.next = s.freeUseRecords
   887  		s.freeUseRecords = r
   888  	}
   889  	s.dropIfUnused(v)
   890  }
   891  
   892  // Drop v from registers if it isn't used again, or its only uses are after
   893  // a call instruction.
   894  func (s *regAllocState) dropIfUnused(v *Value) {
   895  	if !s.values[v.ID].needReg {
   896  		return
   897  	}
   898  	vi := &s.values[v.ID]
   899  	r := vi.uses
   900  	if r == nil || (!opcodeTable[v.Op].fixedReg && r.dist > s.nextCall[s.curIdx]) {
   901  		s.freeRegs(vi.regs)
   902  	}
   903  }
   904  
   905  // liveAfterCurrentInstruction reports whether v is live after
   906  // the current instruction is completed.  v must be used by the
   907  // current instruction.
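         // For example, if v's remaining use distances are [5 5 9], v is still live
         // after the instruction at distance 5; with only [5 5] it is not.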
   908  func (s *regAllocState) liveAfterCurrentInstruction(v *Value) bool {
   909  	u := s.values[v.ID].uses
   910  	if u == nil {
   911  		panic(fmt.Errorf("u is nil, v = %s, s.values[v.ID] = %v", v.LongString(), s.values[v.ID]))
   912  	}
   913  	d := u.dist
   914  	for u != nil && u.dist == d {
   915  		u = u.next
   916  	}
   917  	return u != nil && u.dist > d
   918  }
   919  
   920  // Sets the state of the registers to that encoded in regs.
   921  func (s *regAllocState) setState(regs []endReg) {
   922  	s.freeRegs(s.used)
   923  	for _, x := range regs {
   924  		s.assignReg(x.r, x.v, x.c)
   925  	}
   926  }
   927  
   928  // compatRegs returns the set of registers which can store a type t.
   929  func (s *regAllocState) compatRegs(t *types.Type) regMask {
   930  	var m regMask
   931  	if t.IsTuple() || t.IsFlags() {
   932  		return 0
   933  	}
   934  	if t.IsSIMD() {
   935  		if t.Size() > 8 {
   936  			return s.f.Config.fpRegMask & s.allocatable
   937  		} else {
   938  			// K mask
   939  			return s.f.Config.gpRegMask & s.allocatable
   940  		}
   941  	}
   942  	if t.IsFloat() || t == types.TypeInt128 {
   943  		if t.Kind() == types.TFLOAT32 && s.f.Config.fp32RegMask != 0 {
   944  			m = s.f.Config.fp32RegMask
   945  		} else if t.Kind() == types.TFLOAT64 && s.f.Config.fp64RegMask != 0 {
   946  			m = s.f.Config.fp64RegMask
   947  		} else {
   948  			m = s.f.Config.fpRegMask
   949  		}
   950  	} else {
   951  		m = s.f.Config.gpRegMask
   952  	}
   953  	return m & s.allocatable
   954  }
   955  
   956  // regspec returns the regInfo for operation op.
   957  func (s *regAllocState) regspec(v *Value) regInfo {
   958  	op := v.Op
   959  	if op == OpConvert {
   960  		// OpConvert is a generic op, so it doesn't have a
   961  		// register set in the static table. It can use any
   962  		// allocatable integer register.
   963  		m := s.allocatable & s.f.Config.gpRegMask
   964  		return regInfo{inputs: []inputInfo{{regs: m}}, outputs: []outputInfo{{regs: m}}}
   965  	}
   966  	if op == OpArgIntReg {
   967  		reg := v.Block.Func.Config.intParamRegs[v.AuxInt8()]
   968  		return regInfo{outputs: []outputInfo{{regs: 1 << uint(reg)}}}
   969  	}
   970  	if op == OpArgFloatReg {
   971  		reg := v.Block.Func.Config.floatParamRegs[v.AuxInt8()]
   972  		return regInfo{outputs: []outputInfo{{regs: 1 << uint(reg)}}}
   973  	}
   974  	if op.IsCall() {
   975  		if ac, ok := v.Aux.(*AuxCall); ok && ac.reg != nil {
   976  			return *ac.Reg(&opcodeTable[op].reg, s.f.Config)
   977  		}
   978  	}
   979  	if op == OpMakeResult && s.f.OwnAux.reg != nil {
   980  		return *s.f.OwnAux.ResultReg(s.f.Config)
   981  	}
   982  	return opcodeTable[op].reg
   983  }
   984  
   985  func (s *regAllocState) isGReg(r register) bool {
   986  	return s.f.Config.hasGReg && s.GReg == r
   987  }
   988  
   989  // Dummy value used to represent the value being held in a temporary register.
   990  var tmpVal Value
   991  
   992  func (s *regAllocState) regalloc(f *Func) {
   993  	regValLiveSet := f.newSparseSet(f.NumValues()) // set of values that may be live in register
   994  	defer f.retSparseSet(regValLiveSet)
   995  	var oldSched []*Value
   996  	var phis []*Value
   997  	var phiRegs []register
   998  	var args []*Value
   999  
  1000  	// Data structure used for computing desired registers.
  1001  	var desired desiredState
  1002  	desiredSecondReg := map[ID][4]register{} // desired register allocation for 2nd part of a tuple
  1003  
  1004  	// Desired registers for inputs & outputs for each instruction in the block.
  1005  	type dentry struct {
  1006  		out [4]register    // desired output registers
  1007  		in  [3][4]register // desired input registers (for inputs 0,1, and 2)
  1008  	}
  1009  	var dinfo []dentry
  1010  
  1011  	if f.Entry != f.Blocks[0] {
  1012  		f.Fatalf("entry block must be first")
  1013  	}
  1014  
  1015  	for _, b := range s.visitOrder {
  1016  		if s.f.pass.debug > regDebug {
  1017  			fmt.Printf("Begin processing block %v\n", b)
  1018  		}
  1019  		s.curBlock = b
  1020  		s.startRegsMask = 0
  1021  		s.usedSinceBlockStart = 0
  1022  		clear(desiredSecondReg)
  1023  
  1024  		// Initialize regValLiveSet and uses fields for this block.
  1025  		// Walk backwards through the block doing liveness analysis.
  1026  		regValLiveSet.clear()
  1027  		if s.live != nil {
  1028  			for _, e := range s.live[b.ID] {
  1029  				s.addUse(e.ID, int32(len(b.Values))+e.dist, e.pos) // pseudo-uses from beyond end of block
  1030  				regValLiveSet.add(e.ID)
  1031  			}
  1032  		}
  1033  		for _, v := range b.ControlValues() {
  1034  			if s.values[v.ID].needReg {
  1035  				s.addUse(v.ID, int32(len(b.Values)), b.Pos) // pseudo-use by control values
  1036  				regValLiveSet.add(v.ID)
  1037  			}
  1038  		}
  1039  		if len(s.nextCall) < len(b.Values) {
  1040  			s.nextCall = append(s.nextCall, make([]int32, len(b.Values)-len(s.nextCall))...)
  1041  		}
  1042  		var nextCall int32 = math.MaxInt32
  1043  		for i := len(b.Values) - 1; i >= 0; i-- {
  1044  			v := b.Values[i]
  1045  			regValLiveSet.remove(v.ID)
  1046  			if v.Op == OpPhi {
  1047  				// Remove v from the live set, but don't add
  1048  				// any inputs. This is the state the len(b.Preds)>1
  1049  				// case below desires; it wants to process phis specially.
  1050  				s.nextCall[i] = nextCall
  1051  				continue
  1052  			}
  1053  			if opcodeTable[v.Op].call {
  1054  				// Function call clobbers all the registers but SP and SB.
  1055  				regValLiveSet.clear()
  1056  				if s.sp != 0 && s.values[s.sp].uses != nil {
  1057  					regValLiveSet.add(s.sp)
  1058  				}
  1059  				if s.sb != 0 && s.values[s.sb].uses != nil {
  1060  					regValLiveSet.add(s.sb)
  1061  				}
  1062  				nextCall = int32(i)
  1063  			}
  1064  			for _, a := range v.Args {
  1065  				if !s.values[a.ID].needReg {
  1066  					continue
  1067  				}
  1068  				s.addUse(a.ID, int32(i), v.Pos)
  1069  				regValLiveSet.add(a.ID)
  1070  			}
  1071  			s.nextCall[i] = nextCall
  1072  		}
  1073  		if s.f.pass.debug > regDebug {
  1074  			fmt.Printf("use distances for %s\n", b)
  1075  			for i := range s.values {
  1076  				vi := &s.values[i]
  1077  				u := vi.uses
  1078  				if u == nil {
  1079  					continue
  1080  				}
  1081  				fmt.Printf("  v%d:", i)
  1082  				for u != nil {
  1083  					fmt.Printf(" %d", u.dist)
  1084  					u = u.next
  1085  				}
  1086  				fmt.Println()
  1087  			}
  1088  		}
  1089  
  1090  		// Make a copy of the block schedule so we can generate a new one in place.
  1091  		// We make a separate copy for phis and regular values.
  1092  		nphi := 0
  1093  		for _, v := range b.Values {
  1094  			if v.Op != OpPhi {
  1095  				break
  1096  			}
  1097  			nphi++
  1098  		}
  1099  		phis = append(phis[:0], b.Values[:nphi]...)
  1100  		oldSched = append(oldSched[:0], b.Values[nphi:]...)
  1101  		b.Values = b.Values[:0]
  1102  
  1103  		// Initialize start state of block.
  1104  		if b == f.Entry {
  1105  			// Regalloc state is empty to start.
  1106  			if nphi > 0 {
  1107  				f.Fatalf("phis in entry block")
  1108  			}
  1109  		} else if len(b.Preds) == 1 {
  1110  			// Start regalloc state with the end state of the previous block.
  1111  			s.setState(s.endRegs[b.Preds[0].b.ID])
  1112  			if nphi > 0 {
  1113  				f.Fatalf("phis in single-predecessor block")
  1114  			}
  1115  			// Drop any values which are no longer live.
  1116  			// This may happen because at the end of p, a value may be
  1117  			// live but only used by some other successor of p.
  1118  			for r := register(0); r < s.numRegs; r++ {
  1119  				v := s.regs[r].v
  1120  				if v != nil && !regValLiveSet.contains(v.ID) {
  1121  					s.freeReg(r)
  1122  				}
  1123  			}
  1124  		} else {
  1125  			// This is the complicated case. We have more than one predecessor,
  1126  			// which means we may have Phi ops.
  1127  
   1128  			// Start with the final register state of the predecessor that has the fewest
   1129  			// live spills. This is based on the following points:
   1130  			// 1. Fewer live spills indicate lower register pressure along that path,
   1131  			//    so this block's values are more likely to be allocated to registers.
   1132  			// 2. It avoids predecessors that contain function calls, because such
   1133  			//    predecessors usually generate many spills and lose the previous
   1134  			//    allocation state.
  1135  			// TODO: Improve this part. At least the size of endRegs of the predecessor also has
  1136  			// an impact on the code size and compiler speed. But it is not easy to find a simple
  1137  			// and efficient method that combines multiple factors.
  1138  			idx := -1
  1139  			for i, p := range b.Preds {
  1140  				// If the predecessor has not been visited yet, skip it because its end state
   1141  				// (endRegs and spillLive) has not been computed yet.
  1142  				pb := p.b
  1143  				if s.blockOrder[pb.ID] >= s.blockOrder[b.ID] {
  1144  					continue
  1145  				}
  1146  				if idx == -1 {
  1147  					idx = i
  1148  					continue
  1149  				}
  1150  				pSel := b.Preds[idx].b
  1151  				if len(s.spillLive[pb.ID]) < len(s.spillLive[pSel.ID]) {
  1152  					idx = i
  1153  				} else if len(s.spillLive[pb.ID]) == len(s.spillLive[pSel.ID]) {
  1154  					// Use a bit of likely information. After critical pass, pb and pSel must
   1155  					// be plain blocks, so check the edge pb.Preds->pb instead of the edge pb->b.
  1156  					// TODO: improve the prediction of the likely predecessor. The following
  1157  					// method is only suitable for the simplest cases. For complex cases,
  1158  					// the prediction may be inaccurate, but this does not affect the
  1159  					// correctness of the program.
  1160  					// According to the layout algorithm, the predecessor with the
  1161  					// smaller blockOrder is the true branch, and the test results show
  1162  					// that it is better to choose the predecessor with a smaller
  1163  					// blockOrder than no choice.
  1164  					if pb.likelyBranch() && !pSel.likelyBranch() || s.blockOrder[pb.ID] < s.blockOrder[pSel.ID] {
  1165  						idx = i
  1166  					}
  1167  				}
  1168  			}
  1169  			if idx < 0 {
  1170  				f.Fatalf("bad visitOrder, no predecessor of %s has been visited before it", b)
  1171  			}
  1172  			p := b.Preds[idx].b
  1173  			s.setState(s.endRegs[p.ID])
  1174  
  1175  			if s.f.pass.debug > regDebug {
  1176  				fmt.Printf("starting merge block %s with end state of %s:\n", b, p)
  1177  				for _, x := range s.endRegs[p.ID] {
  1178  					fmt.Printf("  %s: orig:%s cache:%s\n", &s.registers[x.r], x.v, x.c)
  1179  				}
  1180  			}
  1181  
  1182  			// Decide on registers for phi ops. Use the registers determined
  1183  			// by the primary predecessor if we can.
  1184  			// TODO: pick best of (already processed) predecessors?
  1185  			// Majority vote? Deepest nesting level?
  1186  			phiRegs = phiRegs[:0]
  1187  			var phiUsed regMask
  1188  
  1189  			for _, v := range phis {
  1190  				if !s.values[v.ID].needReg {
  1191  					phiRegs = append(phiRegs, noRegister)
  1192  					continue
  1193  				}
  1194  				a := v.Args[idx]
  1195  				// Some instructions target not-allocatable registers.
  1196  				// They're not suitable for further (phi-function) allocation.
  1197  				m := s.values[a.ID].regs &^ phiUsed & s.allocatable
  1198  				if m != 0 {
  1199  					r := pickReg(m)
  1200  					phiUsed |= regMask(1) << r
  1201  					phiRegs = append(phiRegs, r)
  1202  				} else {
  1203  					phiRegs = append(phiRegs, noRegister)
  1204  				}
  1205  			}
  1206  
  1207  			// Second pass - deallocate all in-register phi inputs.
  1208  			for i, v := range phis {
  1209  				if !s.values[v.ID].needReg {
  1210  					continue
  1211  				}
  1212  				a := v.Args[idx]
  1213  				r := phiRegs[i]
  1214  				if r == noRegister {
  1215  					continue
  1216  				}
  1217  				if regValLiveSet.contains(a.ID) {
  1218  					// Input value is still live (it is used by something other than Phi).
  1219  					// Try to move it around before kicking out, if there is a free register.
  1220  					// We generate a Copy in the predecessor block and record it. It will be
  1221  					// deleted later if never used.
  1222  					//
  1223  					// Pick a free register. At this point some registers used in the predecessor
  1224  					// block may have been deallocated. Those are the ones used for Phis. Exclude
  1225  					// them (and they are not going to be helpful anyway).
  1226  					m := s.compatRegs(a.Type) &^ s.used &^ phiUsed
  1227  					if m != 0 && !s.values[a.ID].rematerializeable && countRegs(s.values[a.ID].regs) == 1 {
  1228  						r2 := pickReg(m)
  1229  						c := p.NewValue1(a.Pos, OpCopy, a.Type, s.regs[r].c)
  1230  						s.copies[c] = false
  1231  						if s.f.pass.debug > regDebug {
  1232  							fmt.Printf("copy %s to %s : %s\n", a, c, &s.registers[r2])
  1233  						}
  1234  						s.setOrig(c, a)
  1235  						s.assignReg(r2, a, c)
  1236  						s.endRegs[p.ID] = append(s.endRegs[p.ID], endReg{r2, a, c})
  1237  					}
  1238  				}
  1239  				s.freeReg(r)
  1240  			}
  1241  
  1242  			// Copy phi ops into new schedule.
  1243  			b.Values = append(b.Values, phis...)
  1244  
  1245  			// Third pass - pick registers for phis whose input
  1246  			// was not in a register in the primary predecessor.
  1247  			for i, v := range phis {
  1248  				if !s.values[v.ID].needReg {
  1249  					continue
  1250  				}
  1251  				if phiRegs[i] != noRegister {
  1252  					continue
  1253  				}
  1254  				m := s.compatRegs(v.Type) &^ phiUsed &^ s.used
  1255  				// If one of the other inputs of v is in a register, and the register is available,
  1256  				// select this register, which can save some unnecessary copies.
  1257  				for i, pe := range b.Preds {
  1258  					if i == idx {
  1259  						continue
  1260  					}
  1261  					ri := noRegister
  1262  					for _, er := range s.endRegs[pe.b.ID] {
  1263  						if er.v == s.orig[v.Args[i].ID] {
  1264  							ri = er.r
  1265  							break
  1266  						}
  1267  					}
  1268  					if ri != noRegister && m>>ri&1 != 0 {
  1269  						m = regMask(1) << ri
  1270  						break
  1271  					}
  1272  				}
  1273  				if m != 0 {
  1274  					r := pickReg(m)
  1275  					phiRegs[i] = r
  1276  					phiUsed |= regMask(1) << r
  1277  				}
  1278  			}
  1279  
  1280  			// Set registers for phis. Add phi spill code.
  1281  			for i, v := range phis {
  1282  				if !s.values[v.ID].needReg {
  1283  					continue
  1284  				}
  1285  				r := phiRegs[i]
  1286  				if r == noRegister {
  1287  					// stack-based phi
  1288  					// Spills will be inserted in all the predecessors below.
  1289  					s.values[v.ID].spill = v // v starts life spilled
  1290  					continue
  1291  				}
  1292  				// register-based phi
  1293  				s.assignReg(r, v, v)
  1294  			}
  1295  
  1296  			// Deallocate any values which are no longer live. Phis are excluded.
  1297  			for r := register(0); r < s.numRegs; r++ {
  1298  				if phiUsed>>r&1 != 0 {
  1299  					continue
  1300  				}
  1301  				v := s.regs[r].v
  1302  				if v != nil && !regValLiveSet.contains(v.ID) {
  1303  					s.freeReg(r)
  1304  				}
  1305  			}
  1306  
  1307  			// Save the starting state for use by merge edges.
  1308  			// We append to a stack allocated variable that we'll
  1309  			// later copy into s.startRegs in one fell swoop, to save
  1310  			// on allocations.
  1311  			regList := make([]startReg, 0, 32)
  1312  			for r := register(0); r < s.numRegs; r++ {
  1313  				v := s.regs[r].v
  1314  				if v == nil {
  1315  					continue
  1316  				}
  1317  				if phiUsed>>r&1 != 0 {
  1318  					// Skip registers that phis used, we'll handle those
  1319  					// specially during merge edge processing.
  1320  					continue
  1321  				}
  1322  				regList = append(regList, startReg{r, v, s.regs[r].c, s.values[v.ID].uses.pos})
  1323  				s.startRegsMask |= regMask(1) << r
  1324  			}
  1325  			s.startRegs[b.ID] = make([]startReg, len(regList))
  1326  			copy(s.startRegs[b.ID], regList)
  1327  
  1328  			if s.f.pass.debug > regDebug {
  1329  				fmt.Printf("after phis\n")
  1330  				for _, x := range s.startRegs[b.ID] {
  1331  					fmt.Printf("  %s: v%d\n", &s.registers[x.r], x.v.ID)
  1332  				}
  1333  			}
  1334  		}
  1335  
  1336  		// Drop phis from registers if they immediately go dead.
  1337  		for i, v := range phis {
  1338  			s.curIdx = i
  1339  			s.dropIfUnused(v)
  1340  		}
  1341  
  1342  		// Allocate space to record the desired registers for each value.
  1343  		if l := len(oldSched); cap(dinfo) < l {
  1344  			dinfo = make([]dentry, l)
  1345  		} else {
  1346  			dinfo = dinfo[:l]
  1347  			clear(dinfo)
  1348  		}
  1349  
  1350  		// Load static desired register info at the end of the block.
  1351  		if s.desired != nil {
  1352  			desired.copy(&s.desired[b.ID])
  1353  		}
  1354  
  1355  		// Check actual assigned registers at the start of the next block(s).
  1356  		// Dynamically assigned registers will trump the static
  1357  		// desired registers computed during liveness analysis.
  1358  		// Note that we do this phase after startRegs is set above, so that
  1359  		// we get the right behavior for a block which branches to itself.
  1360  		for _, e := range b.Succs {
  1361  			succ := e.b
  1362  			// TODO: prioritize likely successor?
  1363  			for _, x := range s.startRegs[succ.ID] {
  1364  				desired.add(x.v.ID, x.r)
  1365  			}
  1366  			// Process phi ops in succ.
  1367  			pidx := e.i
  1368  			for _, v := range succ.Values {
  1369  				if v.Op != OpPhi {
  1370  					break
  1371  				}
  1372  				if !s.values[v.ID].needReg {
  1373  					continue
  1374  				}
  1375  				rp, ok := s.f.getHome(v.ID).(*Register)
  1376  				if !ok {
  1377  					// If v is not assigned a register, pick a register assigned to one of v's inputs.
  1378  					// Hopefully v will get assigned that register later.
  1379  					// If the inputs have allocated register information, add it to desired,
  1380  					// which may reduce spill or copy operations when the register is available.
  1381  					for _, a := range v.Args {
  1382  						rp, ok = s.f.getHome(a.ID).(*Register)
  1383  						if ok {
  1384  							break
  1385  						}
  1386  					}
  1387  					if !ok {
  1388  						continue
  1389  					}
  1390  				}
  1391  				desired.add(v.Args[pidx].ID, register(rp.num))
  1392  			}
  1393  		}
  1394  		// Walk values backwards computing desired register info.
  1395  		// See computeDesired for more comments.
  1396  		for i := len(oldSched) - 1; i >= 0; i-- {
  1397  			v := oldSched[i]
  1398  			prefs := desired.remove(v.ID)
  1399  			regspec := s.regspec(v)
  1400  			desired.clobber(regspec.clobbers)
  1401  			for _, j := range regspec.inputs {
  1402  				if countRegs(j.regs) != 1 {
  1403  					continue
  1404  				}
  1405  				desired.clobber(j.regs)
  1406  				desired.add(v.Args[j.idx].ID, pickReg(j.regs))
  1407  			}
  1408  			if opcodeTable[v.Op].resultInArg0 || v.Op == OpAMD64ADDQconst || v.Op == OpAMD64ADDLconst || v.Op == OpSelect0 {
  1409  				if opcodeTable[v.Op].commutative {
  1410  					desired.addList(v.Args[1].ID, prefs)
  1411  				}
  1412  				desired.addList(v.Args[0].ID, prefs)
  1413  			}
  1414  			// Save desired registers for this value.
  1415  			dinfo[i].out = prefs
  1416  			for j, a := range v.Args {
  1417  				if j >= len(dinfo[i].in) {
  1418  					break
  1419  				}
  1420  				dinfo[i].in[j] = desired.get(a.ID)
  1421  			}
  1422  			if v.Op == OpSelect1 && prefs[0] != noRegister {
  1423  				// Save desired registers of select1 for
  1424  				// use by the tuple generating instruction.
  1425  				desiredSecondReg[v.Args[0].ID] = prefs
  1426  			}
  1427  		}
  1428  
  1429  		// Process all the non-phi values.
  1430  		for idx, v := range oldSched {
  1431  			s.curIdx = nphi + idx
  1432  			tmpReg := noRegister
  1433  			if s.f.pass.debug > regDebug {
  1434  				fmt.Printf("  processing %s\n", v.LongString())
  1435  			}
  1436  			regspec := s.regspec(v)
  1437  			if v.Op == OpPhi {
  1438  				f.Fatalf("phi %s not at start of block", v)
  1439  			}
  1440  			if opcodeTable[v.Op].fixedReg {
  1441  				switch v.Op {
  1442  				case OpSP:
  1443  					s.assignReg(s.SPReg, v, v)
  1444  					s.sp = v.ID
  1445  				case OpSB:
  1446  					s.assignReg(s.SBReg, v, v)
  1447  					s.sb = v.ID
  1448  				case OpARM64ZERO, OpLOONG64ZERO, OpMIPS64ZERO:
  1449  					s.assignReg(s.ZeroIntReg, v, v)
  1450  				case OpAMD64Zero128, OpAMD64Zero256, OpAMD64Zero512:
  1451  					regspec := s.regspec(v)
  1452  					m := regspec.outputs[0].regs
  1453  					if countRegs(m) != 1 {
  1454  						f.Fatalf("bad fixed-register op %s", v)
  1455  					}
  1456  					s.assignReg(pickReg(m), v, v)
  1457  				default:
  1458  					f.Fatalf("unknown fixed-register op %s", v)
  1459  				}
  1460  				b.Values = append(b.Values, v)
  1461  				s.advanceUses(v)
  1462  				continue
  1463  			}
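        			// Selector ops don't allocate a fresh register; they reuse the register
        			// already recorded for the corresponding slot of the tuple or results
        			// value they select from.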
  1464  			if v.Op == OpSelect0 || v.Op == OpSelect1 || v.Op == OpSelectN {
  1465  				if s.values[v.ID].needReg {
  1466  					if v.Op == OpSelectN {
  1467  						s.assignReg(register(s.f.getHome(v.Args[0].ID).(LocResults)[int(v.AuxInt)].(*Register).num), v, v)
  1468  					} else {
  1469  						var i = 0
  1470  						if v.Op == OpSelect1 {
  1471  							i = 1
  1472  						}
  1473  						s.assignReg(register(s.f.getHome(v.Args[0].ID).(LocPair)[i].(*Register).num), v, v)
  1474  					}
  1475  				}
  1476  				b.Values = append(b.Values, v)
  1477  				s.advanceUses(v)
  1478  				continue
  1479  			}
  1480  			if v.Op == OpGetG && s.f.Config.hasGReg {
  1481  				// use hardware g register
  1482  				if s.regs[s.GReg].v != nil {
  1483  					s.freeReg(s.GReg) // kick out the old value
  1484  				}
  1485  				s.assignReg(s.GReg, v, v)
  1486  				b.Values = append(b.Values, v)
  1487  				s.advanceUses(v)
  1488  				continue
  1489  			}
  1490  			if v.Op == OpArg {
  1491  				// Args are "pre-spilled" values. We don't allocate
  1492  				// any register here. We just set the spill pointer to
  1493  				// point at the arg itself; any later user will issue a restore to use it.
  1494  				s.values[v.ID].spill = v
  1495  				b.Values = append(b.Values, v)
  1496  				s.advanceUses(v)
  1497  				continue
  1498  			}
  1499  			if v.Op == OpKeepAlive {
  1500  				// Make sure the argument to v is still live here.
  1501  				s.advanceUses(v)
  1502  				a := v.Args[0]
  1503  				vi := &s.values[a.ID]
  1504  				if vi.regs == 0 && !vi.rematerializeable {
  1505  					// Use the spill location.
  1506  					// This forces later liveness analysis to make the
  1507  					// value live at this point.
  1508  					v.SetArg(0, s.makeSpill(a, b))
  1509  				} else if _, ok := a.Aux.(*ir.Name); ok && vi.rematerializeable {
  1510  					// Rematerializeable value with a *ir.Name. This is the address of
  1511  					// a stack object (e.g. an LEAQ). Keep the object live.
  1512  					// Change it to VarLive, which is what plive expects for locals.
  1513  					v.Op = OpVarLive
  1514  					v.SetArgs1(v.Args[1])
  1515  					v.Aux = a.Aux
  1516  				} else {
  1517  					// In-register and rematerializeable values are already live.
  1518  					// These are typically rematerializeable constants like nil,
  1519  					// or values of a variable that were modified since the last call.
  1520  					v.Op = OpCopy
  1521  					v.SetArgs1(v.Args[1])
  1522  				}
  1523  				b.Values = append(b.Values, v)
  1524  				continue
  1525  			}
  1526  			if len(regspec.inputs) == 0 && len(regspec.outputs) == 0 {
  1527  				// No register allocation required (or none specified yet)
  1528  				if s.doClobber && v.Op.IsCall() {
  1529  					s.clobberRegs(regspec.clobbers)
  1530  				}
  1531  				s.freeRegs(regspec.clobbers)
  1532  				b.Values = append(b.Values, v)
  1533  				s.advanceUses(v)
  1534  				continue
  1535  			}
  1536  
  1537  			if s.values[v.ID].rematerializeable {
  1538  				// Value is rematerializeable, don't issue it here.
  1539  				// It will get issued just before each use (see
  1540  				// allocValueToReg).
  1541  				for _, a := range v.Args {
  1542  					a.Uses--
  1543  				}
  1544  				s.advanceUses(v)
  1545  				continue
  1546  			}
  1547  
  1548  			if s.f.pass.debug > regDebug {
  1549  				fmt.Printf("value %s\n", v.LongString())
  1550  				fmt.Printf("  out:")
  1551  				for _, r := range dinfo[idx].out {
  1552  					if r != noRegister {
  1553  						fmt.Printf(" %s", &s.registers[r])
  1554  					}
  1555  				}
  1556  				fmt.Println()
  1557  				for i := 0; i < len(v.Args) && i < 3; i++ {
  1558  					fmt.Printf("  in%d:", i)
  1559  					for _, r := range dinfo[idx].in[i] {
  1560  						if r != noRegister {
  1561  							fmt.Printf(" %s", &s.registers[r])
  1562  						}
  1563  					}
  1564  					fmt.Println()
  1565  				}
  1566  			}
  1567  
  1568  			// Move arguments to registers.
  1569  			// First, if an arg must be in a specific register and it is already
  1570  			// in place, keep it.
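        			// (A nil entry in args means that input has not been placed yet.)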
  1571  			args = append(args[:0], make([]*Value, len(v.Args))...)
  1572  			for i, a := range v.Args {
  1573  				if !s.values[a.ID].needReg {
  1574  					args[i] = a
  1575  				}
  1576  			}
  1577  			for _, i := range regspec.inputs {
  1578  				mask := i.regs
  1579  				if countRegs(mask) == 1 && mask&s.values[v.Args[i.idx].ID].regs != 0 {
  1580  					args[i.idx] = s.allocValToReg(v.Args[i.idx], mask, true, v.Pos)
  1581  				}
  1582  			}
  1583  			// Then, if an arg must be in a specific register and that
  1584  			// register is free, allocate that one. Otherwise when processing
  1585  			// another input we may kick a value into the free register, which
  1586  			// then will be kicked out again.
  1587  			// This is a common case for passing-in-register arguments for
  1588  			// function calls.
  1589  			for {
  1590  				freed := false
  1591  				for _, i := range regspec.inputs {
  1592  					if args[i.idx] != nil {
  1593  						continue // already allocated
  1594  					}
  1595  					mask := i.regs
  1596  					if countRegs(mask) == 1 && mask&^s.used != 0 {
  1597  						args[i.idx] = s.allocValToReg(v.Args[i.idx], mask, true, v.Pos)
  1598  						// If the input is in other registers that will be clobbered by v,
  1599  						// or the input is dead, free the registers. This may make room
  1600  						// for other inputs.
  1601  						oldregs := s.values[v.Args[i.idx].ID].regs
  1602  						if oldregs&^regspec.clobbers == 0 || !s.liveAfterCurrentInstruction(v.Args[i.idx]) {
  1603  							s.freeRegs(oldregs &^ mask &^ s.nospill)
  1604  							freed = true
  1605  						}
  1606  					}
  1607  				}
  1608  				if !freed {
  1609  					break
  1610  				}
  1611  			}
  1612  			// Last, allocate remaining ones, in an ordering defined
  1613  			// by the register specification (most constrained first).
  1614  			for _, i := range regspec.inputs {
  1615  				if args[i.idx] != nil {
  1616  					continue // already allocated
  1617  				}
  1618  				mask := i.regs
  1619  				if mask&s.values[v.Args[i.idx].ID].regs == 0 {
  1620  					// Need a new register for the input.
  1621  					mask &= s.allocatable
  1622  					mask &^= s.nospill
  1623  					// Use the desired register if available.
  1624  					if i.idx < 3 {
  1625  						for _, r := range dinfo[idx].in[i.idx] {
  1626  							if r != noRegister && (mask&^s.used)>>r&1 != 0 {
  1627  								// Desired register is allowed and unused.
  1628  								mask = regMask(1) << r
  1629  								break
  1630  							}
  1631  						}
  1632  					}
  1633  					// Avoid registers we're saving for other values.
  1634  					if mask&^desired.avoid != 0 {
  1635  						mask &^= desired.avoid
  1636  					}
  1637  				}
  1638  				if mask&s.values[v.Args[i.idx].ID].regs&(1<<s.SPReg) != 0 {
  1639  					// Prefer SP register. This ensures that local variables
  1640  					// use SP as their base register (instead of a copy of the
  1641  					// stack pointer living in another register). See issue 74836.
  1642  					mask = 1 << s.SPReg
  1643  				}
  1644  				args[i.idx] = s.allocValToReg(v.Args[i.idx], mask, true, v.Pos)
  1645  			}
  1646  
  1647  			// If the output clobbers the input register, make sure we have
  1648  			// at least two copies of the input register so we don't
  1649  			// have to reload the value from the spill location.
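        			// For example (a sketch, on AMD64):
        			//   t = ADDQ x y   // resultInArg0: t is written into x's register
        			// If x is still live after t and we hold only one copy of x, copy x
        			// into a spare register first so later uses of x don't need a reload
        			// from its spill slot.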
  1650  			if opcodeTable[v.Op].resultInArg0 {
  1651  				var m regMask
  1652  				if !s.liveAfterCurrentInstruction(v.Args[0]) {
  1653  					// arg0 is dead.  We can clobber its register.
  1654  					goto ok
  1655  				}
  1656  				if opcodeTable[v.Op].commutative && !s.liveAfterCurrentInstruction(v.Args[1]) {
  1657  					args[0], args[1] = args[1], args[0]
  1658  					goto ok
  1659  				}
  1660  				if s.values[v.Args[0].ID].rematerializeable {
  1661  					// We can rematerialize the input, don't worry about clobbering it.
  1662  					goto ok
  1663  				}
  1664  				if opcodeTable[v.Op].commutative && s.values[v.Args[1].ID].rematerializeable {
  1665  					args[0], args[1] = args[1], args[0]
  1666  					goto ok
  1667  				}
  1668  				if countRegs(s.values[v.Args[0].ID].regs) >= 2 {
  1669  					// we have at least 2 copies of arg0.  We can afford to clobber one.
  1670  					goto ok
  1671  				}
  1672  				if opcodeTable[v.Op].commutative && countRegs(s.values[v.Args[1].ID].regs) >= 2 {
  1673  					args[0], args[1] = args[1], args[0]
  1674  					goto ok
  1675  				}
  1676  
  1677  				// We can't overwrite arg0 (or arg1, if commutative).  So we
  1678  				// need to make a copy of an input so we have a register we can modify.
  1679  
  1680  				// Possible new registers to copy into.
  1681  				m = s.compatRegs(v.Args[0].Type) &^ s.used
  1682  				if m == 0 {
  1683  					// No free registers.  In this case we'll just clobber
  1684  					// an input and future uses of that input must use a restore.
  1685  					// TODO(khr): We should really do this like allocReg does it,
  1686  					// spilling the value with the most distant next use.
  1687  					goto ok
  1688  				}
  1689  
  1690  				// Try to move an input to the desired output, if allowed.
  1691  				for _, r := range dinfo[idx].out {
  1692  					if r != noRegister && (m&regspec.outputs[0].regs)>>r&1 != 0 {
  1693  						m = regMask(1) << r
  1694  						args[0] = s.allocValToReg(v.Args[0], m, true, v.Pos)
  1695  						// Note: we update args[0] so the instruction will
  1696  						// use the register copy we just made.
  1697  						goto ok
  1698  					}
  1699  				}
  1700  				// Try to copy input to its desired location & use its old
  1701  				// location as the result register.
  1702  				for _, r := range dinfo[idx].in[0] {
  1703  					if r != noRegister && m>>r&1 != 0 {
  1704  						m = regMask(1) << r
  1705  						c := s.allocValToReg(v.Args[0], m, true, v.Pos)
  1706  						s.copies[c] = false
  1707  						// Note: no update to args[0] so the instruction will
  1708  						// use the original copy.
  1709  						goto ok
  1710  					}
  1711  				}
  1712  				if opcodeTable[v.Op].commutative {
  1713  					for _, r := range dinfo[idx].in[1] {
  1714  						if r != noRegister && m>>r&1 != 0 {
  1715  							m = regMask(1) << r
  1716  							c := s.allocValToReg(v.Args[1], m, true, v.Pos)
  1717  							s.copies[c] = false
  1718  							args[0], args[1] = args[1], args[0]
  1719  							goto ok
  1720  						}
  1721  					}
  1722  				}
  1723  
  1724  				// Avoid future fixed uses if we can.
  1725  				if m&^desired.avoid != 0 {
  1726  					m &^= desired.avoid
  1727  				}
  1728  				// Save input 0 to a new register so we can clobber it.
  1729  				c := s.allocValToReg(v.Args[0], m, true, v.Pos)
  1730  				s.copies[c] = false
  1731  
  1732  				// Normally we use the register of the old copy of input 0 as the target.
  1733  				// However, if input 0 is already in its desired register then we use
  1734  				// the register of the new copy instead.
  1735  				if regspec.outputs[0].regs>>s.f.getHome(c.ID).(*Register).num&1 != 0 {
  1736  					if rp, ok := s.f.getHome(args[0].ID).(*Register); ok {
  1737  						r := register(rp.num)
  1738  						for _, r2 := range dinfo[idx].in[0] {
  1739  							if r == r2 {
  1740  								args[0] = c
  1741  								break
  1742  							}
  1743  						}
  1744  					}
  1745  				}
  1746  			}
  1747  		ok:
  1748  			for i := 0; i < 2; i++ {
  1749  				if !(i == 0 && regspec.clobbersArg0 || i == 1 && regspec.clobbersArg1) {
  1750  					continue
  1751  				}
  1752  				if !s.liveAfterCurrentInstruction(v.Args[i]) {
  1753  					// arg is dead.  We can clobber its register.
  1754  					continue
  1755  				}
  1756  				if s.values[v.Args[i].ID].rematerializeable {
  1757  					// We can rematerialize the input, don't worry about clobbering it.
  1758  					continue
  1759  				}
  1760  				if countRegs(s.values[v.Args[i].ID].regs) >= 2 {
  1761  					// We have at least 2 copies of arg.  We can afford to clobber one.
  1762  					continue
  1763  				}
  1764  				// Possible new registers to copy into.
  1765  				m := s.compatRegs(v.Args[i].Type) &^ s.used
  1766  				if m == 0 {
  1767  					// No free registers.  In this case we'll just clobber the
  1768  					// input and future uses of that input must use a restore.
  1769  					// TODO(khr): We should really do this like allocReg does it,
  1770  					// spilling the value with the most distant next use.
  1771  					continue
  1772  				}
  1773  				// Copy input to a different register that won't be clobbered.
  1774  				c := s.allocValToReg(v.Args[i], m, true, v.Pos)
  1775  				s.copies[c] = false
  1776  			}
  1777  
  1778  			// Pick a temporary register if needed.
  1779  			// It should be distinct from all the input registers, so we
  1780  			// allocate it after all the input registers, but before
  1781  			// the input registers are freed via advanceUses below.
  1782  			// (Not all instructions need the temporary to be distinct from the inputs, but it is conservative.)
  1783  			// We also ensure it is not any of the single-choice output registers.
  1784  			if opcodeTable[v.Op].needIntTemp {
  1785  				m := s.allocatable & s.f.Config.gpRegMask
  1786  				for _, out := range regspec.outputs {
  1787  					if countRegs(out.regs) == 1 {
  1788  						m &^= out.regs
  1789  					}
  1790  				}
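        				// Avoid registers we're saving for other values, if we still have a choice.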
  1791  				if m&^desired.avoid&^s.nospill != 0 {
  1792  					m &^= desired.avoid
  1793  				}
  1794  				tmpReg = s.allocReg(m, &tmpVal)
  1795  				s.nospill |= regMask(1) << tmpReg
  1796  				s.tmpused |= regMask(1) << tmpReg
  1797  			}
  1798  
  1799  			if regspec.clobbersArg0 {
  1800  				s.freeReg(register(s.f.getHome(args[0].ID).(*Register).num))
  1801  			}
  1802  			if regspec.clobbersArg1 {
  1803  				s.freeReg(register(s.f.getHome(args[1].ID).(*Register).num))
  1804  			}
  1805  
  1806  			// Now that all args are in regs, we're ready to issue the value itself.
  1807  			// Before we pick a register for the output value, allow input registers
  1808  			// to be deallocated. We do this here so that the output can use the
  1809  			// same register as a dying input.
  1810  			if !opcodeTable[v.Op].resultNotInArgs {
  1811  				s.tmpused = s.nospill
  1812  				s.nospill = 0
  1813  				s.advanceUses(v) // frees any registers holding args that are no longer live
  1814  			}
  1815  
  1816  			// Dump any registers which will be clobbered
  1817  			if s.doClobber && v.Op.IsCall() {
  1818  				// clobber registers that are marked as clobber in regmask, but
  1819  				// don't clobber inputs.
  1820  				s.clobberRegs(regspec.clobbers &^ s.tmpused &^ s.nospill)
  1821  			}
  1822  			s.freeRegs(regspec.clobbers)
  1823  			s.tmpused |= regspec.clobbers
  1824  
  1825  			// Pick registers for outputs.
  1826  			{
  1827  				outRegs := noRegisters // TODO if this is costly, hoist and clear incrementally below.
  1828  				maxOutIdx := -1
  1829  				var used regMask
  1830  				if tmpReg != noRegister {
  1831  					// Ensure output registers are distinct from the temporary register.
  1832  					// (Not all instructions need the temporary to be distinct from the outputs, but it is conservative.)
  1833  					used |= regMask(1) << tmpReg
  1834  				}
  1835  				for _, out := range regspec.outputs {
  1836  					if out.regs == 0 {
  1837  						continue
  1838  					}
  1839  					mask := out.regs & s.allocatable &^ used
  1840  					if mask == 0 {
  1841  						s.f.Fatalf("can't find any output register %s", v.LongString())
  1842  					}
  1843  					if opcodeTable[v.Op].resultInArg0 && out.idx == 0 {
  1844  						if !opcodeTable[v.Op].commutative {
  1845  							// Output must use the same register as input 0.
  1846  							r := register(s.f.getHome(args[0].ID).(*Register).num)
  1847  							if mask>>r&1 == 0 {
  1848  								s.f.Fatalf("resultInArg0 value's input %v cannot be an output of %s", s.f.getHome(args[0].ID).(*Register), v.LongString())
  1849  							}
  1850  							mask = regMask(1) << r
  1851  						} else {
  1852  							// Output must use the same register as input 0 or 1.
  1853  							r0 := register(s.f.getHome(args[0].ID).(*Register).num)
  1854  							r1 := register(s.f.getHome(args[1].ID).(*Register).num)
  1855  							// Check r0 and r1 for desired output register.
  1856  							found := false
  1857  							for _, r := range dinfo[idx].out {
  1858  								if (r == r0 || r == r1) && (mask&^s.used)>>r&1 != 0 {
  1859  									mask = regMask(1) << r
  1860  									found = true
  1861  									if r == r1 {
  1862  										args[0], args[1] = args[1], args[0]
  1863  									}
  1864  									break
  1865  								}
  1866  							}
  1867  							if !found {
  1868  								// Neither are desired, pick r0.
  1869  								mask = regMask(1) << r0
  1870  							}
  1871  						}
  1872  					}
  1873  					if out.idx == 0 { // desired registers only apply to the first element of a tuple result
  1874  						for _, r := range dinfo[idx].out {
  1875  							if r != noRegister && (mask&^s.used)>>r&1 != 0 {
  1876  								// Desired register is allowed and unused.
  1877  								mask = regMask(1) << r
  1878  								break
  1879  							}
  1880  						}
  1881  					}
  1882  					if out.idx == 1 {
  1883  						if prefs, ok := desiredSecondReg[v.ID]; ok {
  1884  							for _, r := range prefs {
  1885  								if r != noRegister && (mask&^s.used)>>r&1 != 0 {
  1886  									// Desired register is allowed and unused.
  1887  									mask = regMask(1) << r
  1888  									break
  1889  								}
  1890  							}
  1891  						}
  1892  					}
  1893  					// Avoid registers we're saving for other values.
  1894  					if mask&^desired.avoid&^s.nospill&^s.used != 0 {
  1895  						mask &^= desired.avoid
  1896  					}
  1897  					r := s.allocReg(mask, v)
  1898  					if out.idx > maxOutIdx {
  1899  						maxOutIdx = out.idx
  1900  					}
  1901  					outRegs[out.idx] = r
  1902  					used |= regMask(1) << r
  1903  					s.tmpused |= regMask(1) << r
  1904  				}
  1905  				// Record register choices
  1906  				if v.Type.IsTuple() {
  1907  					var outLocs LocPair
  1908  					if r := outRegs[0]; r != noRegister {
  1909  						outLocs[0] = &s.registers[r]
  1910  					}
  1911  					if r := outRegs[1]; r != noRegister {
  1912  						outLocs[1] = &s.registers[r]
  1913  					}
  1914  					s.f.setHome(v, outLocs)
  1915  					// Note that subsequent SelectX instructions will do the assignReg calls.
  1916  				} else if v.Type.IsResults() {
  1917  					// preallocate outLocs to the right size, which is maxOutIdx+1
  1918  					outLocs := make(LocResults, maxOutIdx+1)
  1919  					for i := 0; i <= maxOutIdx; i++ {
  1920  						if r := outRegs[i]; r != noRegister {
  1921  							outLocs[i] = &s.registers[r]
  1922  						}
  1923  					}
  1924  					s.f.setHome(v, outLocs)
  1925  				} else {
  1926  					if r := outRegs[0]; r != noRegister {
  1927  						s.assignReg(r, v, v)
  1928  					}
  1929  				}
  1930  				if tmpReg != noRegister {
  1931  					// Remember the temp register allocation, if any.
  1932  					if s.f.tempRegs == nil {
  1933  						s.f.tempRegs = map[ID]*Register{}
  1934  					}
  1935  					s.f.tempRegs[v.ID] = &s.registers[tmpReg]
  1936  				}
  1937  			}
  1938  
  1939  			// deallocate dead args, if we have not done so
  1940  			if opcodeTable[v.Op].resultNotInArgs {
  1941  				s.nospill = 0
  1942  				s.advanceUses(v) // frees any registers holding args that are no longer live
  1943  			}
  1944  			s.tmpused = 0
  1945  
  1946  			// Issue the Value itself.
  1947  			for i, a := range args {
  1948  				v.SetArg(i, a) // use register version of arguments
  1949  			}
  1950  			b.Values = append(b.Values, v)
  1951  			s.dropIfUnused(v)
  1952  		}
  1953  
  1954  		// Copy the control values - we need this so we can reduce the
  1955  		// uses property of these values later.
  1956  		controls := append(make([]*Value, 0, 2), b.ControlValues()...)
  1957  
  1958  		// Load control values into registers.
  1959  		for i, v := range b.ControlValues() {
  1960  			if !s.values[v.ID].needReg {
  1961  				continue
  1962  			}
  1963  			if s.f.pass.debug > regDebug {
  1964  				fmt.Printf("  processing control %s\n", v.LongString())
  1965  			}
  1966  			// We assume that a control input can be passed in any
  1967  			// type-compatible register. If this turns out not to be true,
  1968  			// we'll need to introduce a regspec for a block's control value.
  1969  			b.ReplaceControl(i, s.allocValToReg(v, s.compatRegs(v.Type), false, b.Pos))
  1970  		}
  1971  
  1972  		// Reduce the uses of the control values once registers have been loaded.
  1973  		// This loop is equivalent to the advanceUses method.
  1974  		for _, v := range controls {
  1975  			vi := &s.values[v.ID]
  1976  			if !vi.needReg {
  1977  				continue
  1978  			}
  1979  			// Remove this use from the uses list.
  1980  			u := vi.uses
  1981  			vi.uses = u.next
  1982  			if u.next == nil {
  1983  				s.freeRegs(vi.regs) // value is dead
  1984  			}
  1985  			u.next = s.freeUseRecords
  1986  			s.freeUseRecords = u
  1987  		}
  1988  
  1989  		// If we are approaching a merge point and we are the primary
  1990  		// predecessor of it, find live values that we use soon after
  1991  		// the merge point and promote them to registers now.
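        		// (The register preloading below only happens when b is the entry to a
        		// loop header we haven't allocated yet; see the checks that follow.)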
  1992  		if len(b.Succs) == 1 {
  1993  			if s.f.Config.hasGReg && s.regs[s.GReg].v != nil {
  1994  				s.freeReg(s.GReg) // Spill value in G register before any merge.
  1995  			}
  1996  			if s.blockOrder[b.ID] > s.blockOrder[b.Succs[0].b.ID] {
  1997  				// No point if we've already regalloc'd the destination.
  1998  				goto badloop
  1999  			}
  2000  			// For this to be worthwhile, the loop must have no calls in it.
  2001  			top := b.Succs[0].b
  2002  			loop := s.loopnest.b2l[top.ID]
  2003  			if loop == nil || loop.header != top || loop.containsUnavoidableCall {
  2004  				goto badloop
  2005  			}
  2006  
  2007  			// TODO: sort by distance, pick the closest ones?
  2008  			for _, live := range s.live[b.ID] {
  2009  				if live.dist >= unlikelyDistance {
  2010  					// Don't preload anything live after the loop.
  2011  					continue
  2012  				}
  2013  				vid := live.ID
  2014  				vi := &s.values[vid]
  2015  				if vi.regs != 0 {
  2016  					continue
  2017  				}
  2018  				if vi.rematerializeable {
  2019  					continue
  2020  				}
  2021  				v := s.orig[vid]
  2022  				m := s.compatRegs(v.Type) &^ s.used
  2023  				// Use the desired register if available.
  2024  			outerloop:
  2025  				for _, e := range desired.entries {
  2026  					if e.ID != v.ID {
  2027  						continue
  2028  					}
  2029  					for _, r := range e.regs {
  2030  						if r != noRegister && m>>r&1 != 0 {
  2031  							m = regMask(1) << r
  2032  							break outerloop
  2033  						}
  2034  					}
  2035  				}
  2036  				if m&^desired.avoid != 0 {
  2037  					m &^= desired.avoid
  2038  				}
  2039  				if m != 0 {
  2040  					s.allocValToReg(v, m, false, b.Pos)
  2041  				}
  2042  			}
  2043  		}
  2044  	badloop:
  2045  		;
  2046  
  2047  		// Save end-of-block register state.
  2048  		// First count how many; this cuts allocations in half.
  2049  		k := 0
  2050  		for r := register(0); r < s.numRegs; r++ {
  2051  			v := s.regs[r].v
  2052  			if v == nil {
  2053  				continue
  2054  			}
  2055  			k++
  2056  		}
  2057  		regList := make([]endReg, 0, k)
  2058  		for r := register(0); r < s.numRegs; r++ {
  2059  			v := s.regs[r].v
  2060  			if v == nil {
  2061  				continue
  2062  			}
  2063  			regList = append(regList, endReg{r, v, s.regs[r].c})
  2064  		}
  2065  		s.endRegs[b.ID] = regList
  2066  
  2067  		if checkEnabled {
  2068  			regValLiveSet.clear()
  2069  			if s.live != nil {
  2070  				for _, x := range s.live[b.ID] {
  2071  					regValLiveSet.add(x.ID)
  2072  				}
  2073  			}
  2074  			for r := register(0); r < s.numRegs; r++ {
  2075  				v := s.regs[r].v
  2076  				if v == nil {
  2077  					continue
  2078  				}
  2079  				if !regValLiveSet.contains(v.ID) {
  2080  					s.f.Fatalf("val %s is in reg but not live at end of %s", v, b)
  2081  				}
  2082  			}
  2083  		}
  2084  
  2085  		// If a value is live at the end of the block and
  2086  		// isn't in a register, generate a use for the spill location.
  2087  		// We need to remember this information so that
  2088  		// the liveness analysis in stackalloc is correct.
  2089  		if s.live != nil {
  2090  			for _, e := range s.live[b.ID] {
  2091  				vi := &s.values[e.ID]
  2092  				if vi.regs != 0 {
  2093  					// in a register, we'll use that source for the merge.
  2094  					continue
  2095  				}
  2096  				if vi.rematerializeable {
  2097  					// we'll rematerialize during the merge.
  2098  					continue
  2099  				}
  2100  				if s.f.pass.debug > regDebug {
  2101  					fmt.Printf("live-at-end spill for %s at %s\n", s.orig[e.ID], b)
  2102  				}
  2103  				spill := s.makeSpill(s.orig[e.ID], b)
  2104  				s.spillLive[b.ID] = append(s.spillLive[b.ID], spill.ID)
  2105  			}
  2106  
  2107  			// Clear any final uses.
  2108  			// All that is left should be the pseudo-uses added for values which
  2109  			// are live at the end of b.
  2110  			for _, e := range s.live[b.ID] {
  2111  				u := s.values[e.ID].uses
  2112  				if u == nil {
  2113  					f.Fatalf("live at end, no uses v%d", e.ID)
  2114  				}
  2115  				if u.next != nil {
  2116  					f.Fatalf("live at end, too many uses v%d", e.ID)
  2117  				}
  2118  				s.values[e.ID].uses = nil
  2119  				u.next = s.freeUseRecords
  2120  				s.freeUseRecords = u
  2121  			}
  2122  		}
  2123  
  2124  		// allocReg may have dropped registers from startRegsMask that
  2125  		// aren't actually needed in startRegs. Synchronize back to
  2126  		// startRegs.
  2127  		//
  2128  		// This must be done before placing spills, which will look at
  2129  		// startRegs to decide if a block is a valid block for a spill.
  2130  		if c := countRegs(s.startRegsMask); c != len(s.startRegs[b.ID]) {
  2131  			regs := make([]startReg, 0, c)
  2132  			for _, sr := range s.startRegs[b.ID] {
  2133  				if s.startRegsMask&(regMask(1)<<sr.r) == 0 {
  2134  					continue
  2135  				}
  2136  				regs = append(regs, sr)
  2137  			}
  2138  			s.startRegs[b.ID] = regs
  2139  		}
  2140  	}
  2141  
  2142  	// Decide where the spills we generated will go.
  2143  	s.placeSpills()
  2144  
  2145  	// Anything that didn't get a register gets a stack location here.
  2146  	// (StoreReg, stack-based phis, inputs, ...)
  2147  	stacklive := stackalloc(s.f, s.spillLive)
  2148  
  2149  	// Fix up all merge edges.
  2150  	s.shuffle(stacklive)
  2151  
  2152  	// Erase any copies we never used.
  2153  	// Also, an unused copy might be the only use of another copy,
  2154  	// so continue erasing until we reach a fixed point.
  2155  	for {
  2156  		progress := false
  2157  		for c, used := range s.copies {
  2158  			if !used && c.Uses == 0 {
  2159  				if s.f.pass.debug > regDebug {
  2160  					fmt.Printf("delete copied value %s\n", c.LongString())
  2161  				}
  2162  				c.resetArgs()
  2163  				f.freeValue(c)
  2164  				delete(s.copies, c)
  2165  				progress = true
  2166  			}
  2167  		}
  2168  		if !progress {
  2169  			break
  2170  		}
  2171  	}
  2172  
  2173  	for _, b := range s.visitOrder {
  2174  		i := 0
  2175  		for _, v := range b.Values {
  2176  			if v.Op == OpInvalid {
  2177  				continue
  2178  			}
  2179  			b.Values[i] = v
  2180  			i++
  2181  		}
  2182  		b.Values = b.Values[:i]
  2183  	}
  2184  }
  2185  
  2186  func (s *regAllocState) placeSpills() {
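        	// mustBeFirst reports whether an op must stay at the very start of its
        	// block, so we cannot insert spills before it.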
  2187  	mustBeFirst := func(op Op) bool {
  2188  		return op.isLoweredGetClosurePtr() || op == OpPhi || op == OpArgIntReg || op == OpArgFloatReg
  2189  	}
  2190  
  2191  	// Start maps block IDs to the list of spills
  2192  	// that go at the start of the block (but after any phis).
  2193  	start := map[ID][]*Value{}
  2194  	// After maps value IDs to the list of spills
  2195  	// that go immediately after that value ID.
  2196  	after := map[ID][]*Value{}
  2197  
  2198  	for i := range s.values {
  2199  		vi := s.values[i]
  2200  		spill := vi.spill
  2201  		if spill == nil {
  2202  			continue
  2203  		}
  2204  		if spill.Block != nil {
  2205  			// Some spills are already fully set up,
  2206  			// like OpArgs and stack-based phis.
  2207  			continue
  2208  		}
  2209  		v := s.orig[i]
  2210  
  2211  		// Walk down the dominator tree looking for a good place to
  2212  		// put the spill of v.  At the start "best" is the best place
  2213  		// we have found so far.
  2214  		// TODO: find a way to make this O(1) without arbitrary cutoffs.
  2215  		if v == nil {
  2216  			panic(fmt.Errorf("nil v, s.orig[%d], vi = %v, spill = %s", i, vi, spill.LongString()))
  2217  		}
  2218  		best := v.Block
  2219  		bestArg := v
  2220  		var bestDepth int16
  2221  		if s.loopnest != nil && s.loopnest.b2l[best.ID] != nil {
  2222  			bestDepth = s.loopnest.b2l[best.ID].depth
  2223  		}
  2224  		b := best
  2225  		const maxSpillSearch = 100
  2226  		for i := 0; i < maxSpillSearch; i++ {
  2227  			// Find the child of b in the dominator tree which
  2228  			// dominates all restores.
  2229  			p := b
  2230  			b = nil
  2231  			for c := s.sdom.Child(p); c != nil && i < maxSpillSearch; c, i = s.sdom.Sibling(c), i+1 {
  2232  				if s.sdom[c.ID].entry <= vi.restoreMin && s.sdom[c.ID].exit >= vi.restoreMax {
  2233  					// c also dominates all restores.  Walk down into c.
  2234  					b = c
  2235  					break
  2236  				}
  2237  			}
  2238  			if b == nil {
  2239  				// Ran out of blocks which dominate all restores.
  2240  				break
  2241  			}
  2242  
  2243  			var depth int16
  2244  			if s.loopnest != nil && s.loopnest.b2l[b.ID] != nil {
  2245  				depth = s.loopnest.b2l[b.ID].depth
  2246  			}
  2247  			if depth > bestDepth {
  2248  				// Don't push the spill into a deeper loop.
  2249  				continue
  2250  			}
  2251  
  2252  			// If v is in a register at the start of b, we can
  2253  			// place the spill here (after the phis).
  2254  			if len(b.Preds) == 1 {
  2255  				for _, e := range s.endRegs[b.Preds[0].b.ID] {
  2256  					if e.v == v {
  2257  						// Found a better spot for the spill.
  2258  						best = b
  2259  						bestArg = e.c
  2260  						bestDepth = depth
  2261  						break
  2262  					}
  2263  				}
  2264  			} else {
  2265  				for _, e := range s.startRegs[b.ID] {
  2266  					if e.v == v {
  2267  						// Found a better spot for the spill.
  2268  						best = b
  2269  						bestArg = e.c
  2270  						bestDepth = depth
  2271  						break
  2272  					}
  2273  				}
  2274  			}
  2275  		}
  2276  
  2277  		// Put the spill in the best block we found.
  2278  		spill.Block = best
  2279  		spill.AddArg(bestArg)
  2280  		if best == v.Block && !mustBeFirst(v.Op) {
  2281  			// Place immediately after v.
  2282  			after[v.ID] = append(after[v.ID], spill)
  2283  		} else {
  2284  			// Place at the start of best block.
  2285  			start[best.ID] = append(start[best.ID], spill)
  2286  		}
  2287  	}
  2288  
  2289  	// Insert spill instructions into the block schedules.
  2290  	var oldSched []*Value
  2291  	for _, b := range s.visitOrder {
  2292  		nfirst := 0
  2293  		for _, v := range b.Values {
  2294  			if !mustBeFirst(v.Op) {
  2295  				break
  2296  			}
  2297  			nfirst++
  2298  		}
  2299  		oldSched = append(oldSched[:0], b.Values[nfirst:]...)
  2300  		b.Values = b.Values[:nfirst]
  2301  		b.Values = append(b.Values, start[b.ID]...)
  2302  		for _, v := range oldSched {
  2303  			b.Values = append(b.Values, v)
  2304  			b.Values = append(b.Values, after[v.ID]...)
  2305  		}
  2306  	}
  2307  }
  2308  
  2309  // shuffle fixes up all the merge edges (those going into blocks of indegree > 1).
  2310  func (s *regAllocState) shuffle(stacklive [][]ID) {
  2311  	var e edgeState
  2312  	e.s = s
  2313  	e.cache = map[ID][]*Value{}
  2314  	e.contents = map[Location]contentRecord{}
  2315  	if s.f.pass.debug > regDebug {
  2316  		fmt.Printf("shuffle %s\n", s.f.Name)
  2317  		fmt.Println(s.f.String())
  2318  	}
  2319  
  2320  	for _, b := range s.visitOrder {
  2321  		if len(b.Preds) <= 1 {
  2322  			continue
  2323  		}
  2324  		e.b = b
  2325  		for i, edge := range b.Preds {
  2326  			p := edge.b
  2327  			e.p = p
  2328  			e.setup(i, s.endRegs[p.ID], s.startRegs[b.ID], stacklive[p.ID])
  2329  			e.process()
  2330  		}
  2331  	}
  2332  
  2333  	if s.f.pass.debug > regDebug {
  2334  		fmt.Printf("post shuffle %s\n", s.f.Name)
  2335  		fmt.Println(s.f.String())
  2336  	}
  2337  }
  2338  
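        // edgeState holds the state used while shuffling values into place along
        // a single merge edge p->b.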
  2339  type edgeState struct {
  2340  	s    *regAllocState
  2341  	p, b *Block // edge goes from p->b.
  2342  
  2343  	// for each pre-regalloc value, a list of equivalent cached values
  2344  	cache      map[ID][]*Value
  2345  	cachedVals []ID // (superset of) keys of the above map, for deterministic iteration
  2346  
  2347  	// map from location to the value it contains
  2348  	contents map[Location]contentRecord
  2349  
  2350  	// desired destination locations
  2351  	destinations []dstRecord
  2352  	extra        []dstRecord
  2353  
  2354  	usedRegs              regMask // registers currently holding something
  2355  	uniqueRegs            regMask // registers holding the only copy of a value
  2356  	finalRegs             regMask // registers holding final target
  2357  	rematerializeableRegs regMask // registers that hold rematerializeable values
  2358  }
  2359  
  2360  type contentRecord struct {
  2361  	vid   ID       // pre-regalloc value
  2362  	c     *Value   // cached value
  2363  	final bool     // this is a satisfied destination
  2364  	pos   src.XPos // source position of use of the value
  2365  }
  2366  
  2367  type dstRecord struct {
  2368  	loc    Location // register or stack slot
  2369  	vid    ID       // pre-regalloc value it should contain
  2370  	splice **Value  // place to store reference to the generating instruction
  2371  	pos    src.XPos // source position of use of this location
  2372  }
  2373  
  2374  // setup initializes the edge state for shuffling.
  2375  func (e *edgeState) setup(idx int, srcReg []endReg, dstReg []startReg, stacklive []ID) {
  2376  	if e.s.f.pass.debug > regDebug {
  2377  		fmt.Printf("edge %s->%s\n", e.p, e.b)
  2378  	}
  2379  
  2380  	// Clear state.
  2381  	clear(e.cache)
  2382  	e.cachedVals = e.cachedVals[:0]
  2383  	clear(e.contents)
  2384  	e.usedRegs = 0
  2385  	e.uniqueRegs = 0
  2386  	e.finalRegs = 0
  2387  	e.rematerializeableRegs = 0
  2388  
  2389  	// Live registers can be sources.
  2390  	for _, x := range srcReg {
  2391  		e.set(&e.s.registers[x.r], x.v.ID, x.c, false, src.NoXPos) // we don't care about the source position
  2392  	}
  2393  	// So can all of the spill locations.
  2394  	for _, spillID := range stacklive {
  2395  		v := e.s.orig[spillID]
  2396  		spill := e.s.values[v.ID].spill
  2397  		if !e.s.sdom.IsAncestorEq(spill.Block, e.p) {
  2398  			// Spills were placed so that they only dominate the uses found
  2399  			// during the first regalloc pass. The edge fixup code
  2400  			// can't use a spill location if the spill doesn't dominate
  2401  			// the edge.
  2402  			// We are guaranteed that if the spill doesn't dominate this edge,
  2403  			// then the value is available in a register (because we called
  2404  			// makeSpill for every value not in a register at the start
  2405  			// of an edge).
  2406  			continue
  2407  		}
  2408  		e.set(e.s.f.getHome(spillID), v.ID, spill, false, src.NoXPos) // we don't care about the source position
  2409  	}
  2410  
  2411  	// Figure out all the destinations we need.
  2412  	dsts := e.destinations[:0]
  2413  	for _, x := range dstReg {
  2414  		dsts = append(dsts, dstRecord{&e.s.registers[x.r], x.v.ID, nil, x.pos})
  2415  	}
  2416  	// Phis need their args to end up in a specific location.
  2417  	for _, v := range e.b.Values {
  2418  		if v.Op != OpPhi {
  2419  			break
  2420  		}
  2421  		loc := e.s.f.getHome(v.ID)
  2422  		if loc == nil {
  2423  			continue
  2424  		}
  2425  		dsts = append(dsts, dstRecord{loc, v.Args[idx].ID, &v.Args[idx], v.Pos})
  2426  	}
  2427  	e.destinations = dsts
  2428  
  2429  	if e.s.f.pass.debug > regDebug {
  2430  		for _, vid := range e.cachedVals {
  2431  			a := e.cache[vid]
  2432  			for _, c := range a {
  2433  				fmt.Printf("src %s: v%d cache=%s\n", e.s.f.getHome(c.ID), vid, c)
  2434  			}
  2435  		}
  2436  		for _, d := range e.destinations {
  2437  			fmt.Printf("dst %s: v%d\n", d.loc, d.vid)
  2438  		}
  2439  	}
  2440  }
  2441  
  2442  // process generates code to move all the values to the right destination locations.
  2443  func (e *edgeState) process() {
  2444  	dsts := e.destinations
  2445  
  2446  	// Process the destinations until they are all satisfied.
  2447  	for len(dsts) > 0 {
  2448  		i := 0
  2449  		for _, d := range dsts {
  2450  			if !e.processDest(d.loc, d.vid, d.splice, d.pos) {
  2451  				// Failed - save for next iteration.
  2452  				dsts[i] = d
  2453  				i++
  2454  			}
  2455  		}
  2456  		if i < len(dsts) {
  2457  			// Made some progress. Go around again.
  2458  			dsts = dsts[:i]
  2459  
  2460  			// Append any extra destinations we generated.
  2461  			dsts = append(dsts, e.extra...)
  2462  			e.extra = e.extra[:0]
  2463  			continue
  2464  		}
  2465  
  2466  		// We made no progress. That means that any
  2467  		// remaining unsatisfied moves are in simple cycles.
  2468  		// For example, A -> B -> C -> D -> A.
  2469  		//   A ----> B
  2470  		//   ^       |
  2471  		//   |       |
  2472  		//   |       v
  2473  		//   D <---- C
  2474  
  2475  		// To break the cycle, we pick an unused register, say R,
  2476  		// and put a copy of B there.
  2477  		//   A ----> B
  2478  		//   ^       |
  2479  		//   |       |
  2480  		//   |       v
  2481  		//   D <---- C <---- R=copyofB
  2482  		// When we resume the outer loop, the A->B move can now proceed,
  2483  		// and eventually the whole cycle completes.
  2484  
  2485  		// Copy any cycle location to a temp register. This duplicates
  2486  		// one of the cycle entries, allowing the just duplicated value
  2487  		// to be overwritten and the cycle to proceed.
  2488  		d := dsts[0]
  2489  		loc := d.loc
  2490  		vid := e.contents[loc].vid
  2491  		c := e.contents[loc].c
  2492  		r := e.findRegFor(c.Type)
  2493  		if e.s.f.pass.debug > regDebug {
  2494  			fmt.Printf("breaking cycle with v%d in %s:%s\n", vid, loc, c)
  2495  		}
  2496  		e.erase(r)
  2497  		pos := d.pos.WithNotStmt()
  2498  		if _, isReg := loc.(*Register); isReg {
  2499  			c = e.p.NewValue1(pos, OpCopy, c.Type, c)
  2500  		} else {
  2501  			c = e.p.NewValue1(pos, OpLoadReg, c.Type, c)
  2502  		}
  2503  		e.set(r, vid, c, false, pos)
  2504  		if c.Op == OpLoadReg && e.s.isGReg(register(r.(*Register).num)) {
  2505  			e.s.f.Fatalf("process.OpLoadReg targeting g: " + c.LongString())
  2506  		}
  2507  	}
  2508  }
  2509  
  2510  // processDest generates code to put value vid into location loc. Returns true
  2511  // if progress was made.
  2512  func (e *edgeState) processDest(loc Location, vid ID, splice **Value, pos src.XPos) bool {
  2513  	pos = pos.WithNotStmt()
  2514  	occupant := e.contents[loc]
  2515  	if occupant.vid == vid {
  2516  		// Value is already in the correct place.
  2517  		e.contents[loc] = contentRecord{vid, occupant.c, true, pos}
  2518  		if splice != nil {
  2519  			(*splice).Uses--
  2520  			*splice = occupant.c
  2521  			occupant.c.Uses++
  2522  		}
  2523  		// Note: if splice==nil then occupant.c will appear dead. This is
  2524  		// non-SSA-form code, so be careful after this pass not to run
  2525  		// deadcode elimination.
  2526  		if _, ok := e.s.copies[occupant.c]; ok {
  2527  			// The copy at occupant.c was used to avoid spill.
  2528  			e.s.copies[occupant.c] = true
  2529  		}
  2530  		return true
  2531  	}
  2532  
  2533  	// Check if we're allowed to clobber the destination location.
  2534  	if len(e.cache[occupant.vid]) == 1 && !e.s.values[occupant.vid].rematerializeable && !opcodeTable[e.s.orig[occupant.vid].Op].fixedReg {
  2535  		// We can't overwrite the last copy
  2536  		// of a value that needs to survive.
  2537  		return false
  2538  	}
  2539  
  2540  	// Copy from a source of v, register preferred.
  2541  	v := e.s.orig[vid]
  2542  	var c *Value
  2543  	var src Location
  2544  	if e.s.f.pass.debug > regDebug {
  2545  		fmt.Printf("moving v%d to %s\n", vid, loc)
  2546  		fmt.Printf("sources of v%d:", vid)
  2547  	}
  2548  	if opcodeTable[v.Op].fixedReg {
  2549  		c = v
  2550  		src = e.s.f.getHome(v.ID)
  2551  	} else {
  2552  		for _, w := range e.cache[vid] {
  2553  			h := e.s.f.getHome(w.ID)
  2554  			if e.s.f.pass.debug > regDebug {
  2555  				fmt.Printf(" %s:%s", h, w)
  2556  			}
  2557  			_, isreg := h.(*Register)
  2558  			if src == nil || isreg {
  2559  				c = w
  2560  				src = h
  2561  			}
  2562  		}
  2563  	}
  2564  	if e.s.f.pass.debug > regDebug {
  2565  		if src != nil {
  2566  			fmt.Printf(" [use %s]\n", src)
  2567  		} else {
  2568  			fmt.Printf(" [no source]\n")
  2569  		}
  2570  	}
  2571  	_, dstReg := loc.(*Register)
  2572  
  2573  	// Pre-clobber destination. This avoids the
  2574  	// following situation:
  2575  	//   - v is currently held in R0 and stacktmp0.
  2576  	//   - We want to copy stacktmp1 to stacktmp0.
  2577  	//   - We choose R0 as the temporary register.
  2578  	// During the copy, both R0 and stacktmp0 are
  2579  	// clobbered, losing both copies of v. Oops!
  2580  	// Erasing the destination early means R0 will not
  2581  	// be chosen as the temp register, as it will then
  2582  	// be the last copy of v.
  2583  	e.erase(loc)
  2584  	var x *Value
  2585  	if c == nil || e.s.values[vid].rematerializeable {
  2586  		if !e.s.values[vid].rematerializeable {
  2587  			e.s.f.Fatalf("can't find source for %s->%s: %s\n", e.p, e.b, v.LongString())
  2588  		}
  2589  		if dstReg {
  2590  			// We want to rematerialize v into a register that is incompatible with v's op's register mask.
  2591  			// Instead of setting the wrong register for the rematerialized v, we should find the right register
  2592  			// for it and emit an additional copy to move to the desired register.
  2593  			// For #70451.
  2594  			if e.s.regspec(v).outputs[0].regs&regMask(1<<register(loc.(*Register).num)) == 0 {
  2595  				_, srcReg := src.(*Register)
  2596  				if srcReg {
  2597  					// It exists in a valid register already, so just copy it to the desired register
  2598  					// If src is a Register, c must have already been set.
  2599  					x = e.p.NewValue1(pos, OpCopy, c.Type, c)
  2600  				} else {
  2601  					// We need a tmp register
  2602  					x = v.copyInto(e.p)
  2603  					r := e.findRegFor(x.Type)
  2604  					e.erase(r)
  2605  					// Rematerialize to the tmp register
  2606  					e.set(r, vid, x, false, pos)
  2607  					// Copy from tmp to the desired register
  2608  					x = e.p.NewValue1(pos, OpCopy, x.Type, x)
  2609  				}
  2610  			} else {
  2611  				x = v.copyInto(e.p)
  2612  			}
  2613  		} else {
  2614  			// Rematerialize into stack slot. Need a free
  2615  			// register to accomplish this.
  2616  			r := e.findRegFor(v.Type)
  2617  			e.erase(r)
  2618  			x = v.copyIntoWithXPos(e.p, pos)
  2619  			e.set(r, vid, x, false, pos)
  2620  			// Make sure we spill with the size of the slot, not the
  2621  			// size of x (which might be wider due to our dropping
  2622  			// of narrowing conversions).
  2623  			x = e.p.NewValue1(pos, OpStoreReg, loc.(LocalSlot).Type, x)
  2624  		}
  2625  	} else {
  2626  		// Emit move from src to dst.
  2627  		_, srcReg := src.(*Register)
  2628  		if srcReg {
  2629  			if dstReg {
  2630  				x = e.p.NewValue1(pos, OpCopy, c.Type, c)
  2631  			} else {
  2632  				x = e.p.NewValue1(pos, OpStoreReg, loc.(LocalSlot).Type, c)
  2633  			}
  2634  		} else {
  2635  			if dstReg {
  2636  				x = e.p.NewValue1(pos, OpLoadReg, c.Type, c)
  2637  			} else {
  2638  				// mem->mem. Use temp register.
  2639  				r := e.findRegFor(c.Type)
  2640  				e.erase(r)
  2641  				t := e.p.NewValue1(pos, OpLoadReg, c.Type, c)
  2642  				e.set(r, vid, t, false, pos)
  2643  				x = e.p.NewValue1(pos, OpStoreReg, loc.(LocalSlot).Type, t)
  2644  			}
  2645  		}
  2646  	}
  2647  	e.set(loc, vid, x, true, pos)
  2648  	if x.Op == OpLoadReg && e.s.isGReg(register(loc.(*Register).num)) {
  2649  		e.s.f.Fatalf("processDest.OpLoadReg targeting g: " + x.LongString())
  2650  	}
  2651  	if splice != nil {
  2652  		(*splice).Uses--
  2653  		*splice = x
  2654  		x.Uses++
  2655  	}
  2656  	return true
  2657  }
  2658  
  2659  // set changes the contents of location loc to hold the given value and its cached representative.
  2660  func (e *edgeState) set(loc Location, vid ID, c *Value, final bool, pos src.XPos) {
  2661  	e.s.f.setHome(c, loc)
  2662  	e.contents[loc] = contentRecord{vid, c, final, pos}
  2663  	a := e.cache[vid]
  2664  	if len(a) == 0 {
  2665  		e.cachedVals = append(e.cachedVals, vid)
  2666  	}
  2667  	a = append(a, c)
  2668  	e.cache[vid] = a
  2669  	if r, ok := loc.(*Register); ok {
  2670  		if e.usedRegs&(regMask(1)<<uint(r.num)) != 0 {
  2671  			e.s.f.Fatalf("%v is already set (v%d/%v)", r, vid, c)
  2672  		}
  2673  		e.usedRegs |= regMask(1) << uint(r.num)
  2674  		if final {
  2675  			e.finalRegs |= regMask(1) << uint(r.num)
  2676  		}
  2677  		if len(a) == 1 {
  2678  			e.uniqueRegs |= regMask(1) << uint(r.num)
  2679  		}
  2680  		if len(a) == 2 {
  2681  			if t, ok := e.s.f.getHome(a[0].ID).(*Register); ok {
  2682  				e.uniqueRegs &^= regMask(1) << uint(t.num)
  2683  			}
  2684  		}
  2685  		if e.s.values[vid].rematerializeable {
  2686  			e.rematerializeableRegs |= regMask(1) << uint(r.num)
  2687  		}
  2688  	}
  2689  	if e.s.f.pass.debug > regDebug {
  2690  		fmt.Printf("%s\n", c.LongString())
  2691  		fmt.Printf("v%d now available in %s:%s\n", vid, loc, c)
  2692  	}
  2693  }
  2694  
  2695  // erase removes any user of loc.
  2696  func (e *edgeState) erase(loc Location) {
  2697  	cr := e.contents[loc]
  2698  	if cr.c == nil {
  2699  		return
  2700  	}
  2701  	vid := cr.vid
  2702  
  2703  	if cr.final {
  2704  		// Add a destination to move this value back into place.
  2705  		// Make sure it gets added to the tail of the destination queue
  2706  		// so we make progress on other moves first.
  2707  		e.extra = append(e.extra, dstRecord{loc, cr.vid, nil, cr.pos})
  2708  	}
  2709  
  2710  	// Remove c from the list of cached values.
  2711  	a := e.cache[vid]
  2712  	for i, c := range a {
  2713  		if e.s.f.getHome(c.ID) == loc {
  2714  			if e.s.f.pass.debug > regDebug {
  2715  				fmt.Printf("v%d no longer available in %s:%s\n", vid, loc, c)
  2716  			}
  2717  			a[i], a = a[len(a)-1], a[:len(a)-1]
  2718  			break
  2719  		}
  2720  	}
  2721  	e.cache[vid] = a
  2722  
  2723  	// Update register masks.
  2724  	if r, ok := loc.(*Register); ok {
  2725  		e.usedRegs &^= regMask(1) << uint(r.num)
  2726  		if cr.final {
  2727  			e.finalRegs &^= regMask(1) << uint(r.num)
  2728  		}
  2729  		e.rematerializeableRegs &^= regMask(1) << uint(r.num)
  2730  	}
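        	// If only one copy of the value remains and it lives in a register,
        	// that register now holds the only remaining copy.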
  2731  	if len(a) == 1 {
  2732  		if r, ok := e.s.f.getHome(a[0].ID).(*Register); ok {
  2733  			e.uniqueRegs |= regMask(1) << uint(r.num)
  2734  		}
  2735  	}
  2736  }
  2737  
  2738  // findRegFor finds a register we can use to make a temp copy of type typ.
  2739  func (e *edgeState) findRegFor(typ *types.Type) Location {
  2740  	// Which registers are possibilities.
  2741  	m := e.s.compatRegs(typ)
  2742  
  2743  	// Pick a register. In priority order:
  2744  	// 1) an unused register
  2745  	// 2) a non-unique register not holding a final value
  2746  	// 3) a non-unique register
  2747  	// 4) a register holding a rematerializeable value
  2748  	x := m &^ e.usedRegs
  2749  	if x != 0 {
  2750  		return &e.s.registers[pickReg(x)]
  2751  	}
  2752  	x = m &^ e.uniqueRegs &^ e.finalRegs
  2753  	if x != 0 {
  2754  		return &e.s.registers[pickReg(x)]
  2755  	}
  2756  	x = m &^ e.uniqueRegs
  2757  	if x != 0 {
  2758  		return &e.s.registers[pickReg(x)]
  2759  	}
  2760  	x = m & e.rematerializeableRegs
  2761  	if x != 0 {
  2762  		return &e.s.registers[pickReg(x)]
  2763  	}
  2764  
  2765  	// No register is available.
  2766  	// Pick a register to spill.
  2767  	for _, vid := range e.cachedVals {
  2768  		a := e.cache[vid]
  2769  		for _, c := range a {
  2770  			if r, ok := e.s.f.getHome(c.ID).(*Register); ok && m>>uint(r.num)&1 != 0 {
  2771  				if !c.rematerializeable() {
  2772  					x := e.p.NewValue1(c.Pos, OpStoreReg, c.Type, c)
  2773  					// Allocate a temp location to spill a register to.
  2774  					t := LocalSlot{N: e.s.f.NewLocal(c.Pos, c.Type), Type: c.Type}
  2775  					// TODO: reuse these slots. They'll need to be erased first.
  2776  					e.set(t, vid, x, false, c.Pos)
  2777  					if e.s.f.pass.debug > regDebug {
  2778  						fmt.Printf("  SPILL %s->%s %s\n", r, t, x.LongString())
  2779  					}
  2780  				}
  2781  				// r will now be overwritten by the caller. At some point
  2782  				// later, the newly saved value will be moved back to its
  2783  				// final destination in processDest.
  2784  				return r
  2785  			}
  2786  		}
  2787  	}
  2788  
  2789  	fmt.Printf("m:%d unique:%d final:%d rematerializeable:%d\n", m, e.uniqueRegs, e.finalRegs, e.rematerializeableRegs)
  2790  	for _, vid := range e.cachedVals {
  2791  		a := e.cache[vid]
  2792  		for _, c := range a {
  2793  			fmt.Printf("v%d: %s %s\n", vid, c, e.s.f.getHome(c.ID))
  2794  		}
  2795  	}
  2796  	e.s.f.Fatalf("can't find empty register on edge %s->%s", e.p, e.b)
  2797  	return nil
  2798  }
  2799  
  2800  // rematerializeable reports whether the register allocator should recompute
  2801  // a value instead of spilling/restoring it.
  2802  func (v *Value) rematerializeable() bool {
  2803  	if !opcodeTable[v.Op].rematerializeable {
  2804  		return false
  2805  	}
  2806  	for _, a := range v.Args {
  2807  		// Fixed-register allocations (SP, SB, etc.) are always available.
  2808  		// Any other argument of an opcode makes it not rematerializeable.
  2809  		if !opcodeTable[a.Op].fixedReg {
  2810  			return false
  2811  		}
  2812  	}
  2813  	return true
  2814  }
  2815  
  2816  type liveInfo struct {
  2817  	ID   ID       // ID of value
  2818  	dist int32    // # of instructions before next use
  2819  	pos  src.XPos // source position of next use
  2820  }
  2821  
  2822  // computeLive computes a map from block ID to a list of value IDs live at the end
  2823  // of that block. Together with the value ID is a count of how many instructions
  2824  // to the next use of that value. The resulting map is stored in s.live.
  2825  func (s *regAllocState) computeLive() {
  2826  	f := s.f
  2827  	// Single-block functions do not have variables that are live across
  2828  	// branches.
  2829  	if len(f.Blocks) == 1 {
  2830  		return
  2831  	}
  2832  	po := f.postorder()
  2833  	s.live = make([][]liveInfo, f.NumBlocks())
  2834  	s.desired = make([]desiredState, f.NumBlocks())
  2835  	s.loopnest = f.loopnest()
  2836  
  2837  	rematIDs := make([]ID, 0, 64)
  2838  
  2839  	live := f.newSparseMapPos(f.NumValues())
  2840  	defer f.retSparseMapPos(live)
  2841  	t := f.newSparseMapPos(f.NumValues())
  2842  	defer f.retSparseMapPos(t)
  2843  
  2844  	s.loopnest.computeUnavoidableCalls()
  2845  
  2846  	// Liveness analysis.
  2847  	// This is an adapted version of the algorithm described in chapter 2.4.2
  2848  	// of Fabrice Rastello's On Sparse Intermediate Representations.
  2849  	//   https://web.archive.org/web/20240417212122if_/https://inria.hal.science/hal-00761555/file/habilitation.pdf#section.50
  2850  	//
  2851  	// For our implementation, we fall back to a traditional iterative algorithm when we encounter
  2852  	// irreducible CFGs. They are very uncommon in Go code because they need to be constructed with
  2853  	// gotos, and our current loopnest definition does not compute all the information that
  2854  	// we'd need to determine the loop ancestors for that step of the algorithm.
  2855  	//
  2856  	// Additionally, instead of only considering non-loop successors in the initial DFS phase,
  2857  	// we compute the live-out set as the union over all successors. This larger live-out set is
  2858  	// still a subset of the final live-out for the block, and adding this information in the DFS
  2859  	// phase gives us slightly more accurate distance information.
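        	//
        	// Distances are counted in instructions: a value whose next use is
        	// b.Values[i] is live at distance i from the start of b, and each
        	// predecessor sees it at that distance plus a branch-likeliness
        	// penalty (see branchDistance) at its end.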
  2860  	var loopLiveIn map[*loop][]liveInfo
  2861  	var numCalls []int32
  2862  	if len(s.loopnest.loops) > 0 && !s.loopnest.hasIrreducible {
  2863  		loopLiveIn = make(map[*loop][]liveInfo)
  2864  		numCalls = f.Cache.allocInt32Slice(f.NumBlocks())
  2865  		defer f.Cache.freeInt32Slice(numCalls)
  2866  	}
  2867  
  2868  	for {
  2869  		changed := false
  2870  
  2871  		for _, b := range po {
  2872  			// Start with known live values at the end of the block.
  2873  			live.clear()
  2874  			for _, e := range s.live[b.ID] {
  2875  				live.set(e.ID, e.dist, e.pos)
  2876  			}
  2877  			update := false
  2878  			// Arguments to phi nodes are live at the end of this block.
  2879  			for _, e := range b.Succs {
  2880  				succ := e.b
  2881  				delta := branchDistance(b, succ)
  2882  				for _, v := range succ.Values {
  2883  					if v.Op != OpPhi {
  2884  						break
  2885  					}
  2886  					arg := v.Args[e.i]
  2887  					if s.values[arg.ID].needReg && (!live.contains(arg.ID) || delta < live.get(arg.ID)) {
  2888  						live.set(arg.ID, delta, v.Pos)
  2889  						update = true
  2890  					}
  2891  				}
  2892  			}
  2893  			if update {
  2894  				s.live[b.ID] = updateLive(live, s.live[b.ID])
  2895  			}
  2896  			// Add len(b.Values) to adjust from end-of-block distance
  2897  			// to beginning-of-block distance.
  2898  			c := live.contents()
  2899  			for i := range c {
  2900  				c[i].val += int32(len(b.Values))
  2901  			}
  2902  
  2903  			// Mark control values as live
  2904  			for _, c := range b.ControlValues() {
  2905  				if s.values[c.ID].needReg {
  2906  					live.set(c.ID, int32(len(b.Values)), b.Pos)
  2907  				}
  2908  			}
  2909  
  2910  			for i := len(b.Values) - 1; i >= 0; i-- {
  2911  				v := b.Values[i]
  2912  				live.remove(v.ID)
  2913  				if v.Op == OpPhi {
  2914  					continue
  2915  				}
  2916  				if opcodeTable[v.Op].call {
  2917  					if numCalls != nil {
  2918  						numCalls[b.ID]++
  2919  					}
  2920  					rematIDs = rematIDs[:0]
  2921  					c := live.contents()
  2922  					for i := range c {
  2923  						c[i].val += unlikelyDistance
  2924  						vid := c[i].key
  2925  						if s.values[vid].rematerializeable {
  2926  							rematIDs = append(rematIDs, vid)
  2927  						}
  2928  					}
  2929  					// We don't spill rematerializeable values, and assuming they
  2930  					// are live across a call would only force shuffle to add some
  2931  					// (dead) constant rematerialization. Remove them.
  2932  					for _, r := range rematIDs {
  2933  						live.remove(r)
  2934  					}
  2935  				}
  2936  				for _, a := range v.Args {
  2937  					if s.values[a.ID].needReg {
  2938  						live.set(a.ID, int32(i), v.Pos)
  2939  					}
  2940  				}
  2941  			}
  2942  			// If this block is a loop header, save its live-in set so that
  2943  			// we can use it to fill in the loop bodies later.
  2944  			if loopLiveIn != nil {
  2945  				loop := s.loopnest.b2l[b.ID]
  2946  				if loop != nil && loop.header.ID == b.ID {
  2947  					loopLiveIn[loop] = updateLive(live, nil)
  2948  				}
  2949  			}
  2950  			// For each predecessor of b, expand its list of live-at-end values.
  2951  			// invariant: live contains the values live at the start of b
  2952  			for _, e := range b.Preds {
  2953  				p := e.b
  2954  				delta := branchDistance(p, b)
  2955  
  2956  				// Start t off with the previously known live values at the end of p.
  2957  				t.clear()
  2958  				for _, e := range s.live[p.ID] {
  2959  					t.set(e.ID, e.dist, e.pos)
  2960  				}
  2961  				update := false
  2962  
  2963  				// Add new live values from scanning this block.
  2964  				for _, e := range live.contents() {
  2965  					d := e.val + delta
  2966  					if !t.contains(e.key) || d < t.get(e.key) {
  2967  						update = true
  2968  						t.set(e.key, d, e.pos)
  2969  					}
  2970  				}
  2971  
  2972  				if !update {
  2973  					continue
  2974  				}
  2975  				s.live[p.ID] = updateLive(t, s.live[p.ID])
  2976  				changed = true
  2977  			}
  2978  		}
  2979  
  2980  		// If we are doing the traditional iterative algorithm, stop
  2981  		// once we have run out of changes.
  2982  		if !changed {
  2983  			break
  2984  		}
  2985  
  2986  		// If we are only doing the loop pre-pass, one iteration is enough;
  2987  		// the remaining liveness information is filled in later.
  2988  		if loopLiveIn != nil {
  2989  			break
  2990  		}
  2991  		// For loopless code, we have full liveness info after a single
  2992  		// iteration
  2993  		if len(s.loopnest.loops) == 0 {
  2994  			break
  2995  		}
  2996  	}
  2997  	if f.pass.debug > regDebug {
  2998  		s.debugPrintLive("after dfs walk", f, s.live, s.desired)
  2999  	}
  3000  
  3001  	// Irreducible CFGs and functions without loops are already
  3002  	// done; compute their desired registers and return.
  3003  	if loopLiveIn == nil {
  3004  		s.computeDesired()
  3005  		return
  3006  	}
  3007  
  3008  	// Walk the loopnest from outer to inner, adding to each loop
  3009  	// all live-in values from its parent. Instead of a recursive
  3010  	// algorithm, iterate in depth order.
  3011  	// TODO(dmo): can we permute the loopnest? can we avoid this copy?
  3012  	loops := slices.Clone(s.loopnest.loops)
  3013  	slices.SortFunc(loops, func(a, b *loop) int {
  3014  		return cmp.Compare(a.depth, b.depth)
  3015  	})
  3016  
  3017  	loopset := f.newSparseMapPos(f.NumValues())
  3018  	defer f.retSparseMapPos(loopset)
  3019  	for _, loop := range loops {
  3020  		if loop.outer == nil {
  3021  			continue
  3022  		}
  3023  		livein := loopLiveIn[loop]
  3024  		loopset.clear()
  3025  		for _, l := range livein {
  3026  			loopset.set(l.ID, l.dist, l.pos)
  3027  		}
  3028  		update := false
  3029  		for _, l := range loopLiveIn[loop.outer] {
  3030  			if !loopset.contains(l.ID) {
  3031  				loopset.set(l.ID, l.dist, l.pos)
  3032  				update = true
  3033  			}
  3034  		}
  3035  		if update {
  3036  			loopLiveIn[loop] = updateLive(loopset, livein)
  3037  		}
  3038  	}
  3039  	// unknownDistance is a sentinel value used when we know a value
  3040  	// is live at a given block, but we do not yet know how far away its next
  3041  	// use is. The distance will be computed later.
  3042  	const unknownDistance = -1
  3043  
  3044  	// Add the live-in values of the loop headers to their children.
  3045  	// This includes the loop headers themselves, since they can have values
  3046  	// that die in the middle of the block and aren't live-out.
  3047  	for _, b := range po {
  3048  		loop := s.loopnest.b2l[b.ID]
  3049  		if loop == nil {
  3050  			continue
  3051  		}
  3052  		headerLive := loopLiveIn[loop]
  3053  		loopset.clear()
  3054  		for _, l := range s.live[b.ID] {
  3055  			loopset.set(l.ID, l.dist, l.pos)
  3056  		}
  3057  		update := false
  3058  		for _, l := range headerLive {
  3059  			if !loopset.contains(l.ID) {
  3060  				loopset.set(l.ID, unknownDistance, src.NoXPos)
  3061  				update = true
  3062  			}
  3063  		}
  3064  		if update {
  3065  			s.live[b.ID] = updateLive(loopset, s.live[b.ID])
  3066  		}
  3067  	}
  3068  	if f.pass.debug > regDebug {
  3069  		s.debugPrintLive("after live loop prop", f, s.live, s.desired)
  3070  	}
  3071  	// Filling in liveness from loops leaves some blocks with no distance information.
  3072  	// Run over them and fill in the information from their successors.
  3073  	// To converge faster, we quit when no block has missing values, and in
  3074  	// subsequent iterations we only look at blocks that still have missing values.
  3075  	unfinishedBlocks := f.Cache.allocBlockSlice(len(po))
  3076  	defer f.Cache.freeBlockSlice(unfinishedBlocks)
  3077  	copy(unfinishedBlocks, po)
  3078  
  3079  	for len(unfinishedBlocks) > 0 {
  3080  		n := 0
  3081  		for _, b := range unfinishedBlocks {
  3082  			live.clear()
  3083  			unfinishedValues := 0
  3084  			for _, l := range s.live[b.ID] {
  3085  				if l.dist == unknownDistance {
  3086  					unfinishedValues++
  3087  				}
  3088  				live.set(l.ID, l.dist, l.pos)
  3089  			}
  3090  			update := false
  3091  			for _, e := range b.Succs {
  3092  				succ := e.b
  3093  				for _, l := range s.live[succ.ID] {
  3094  					if !live.contains(l.ID) || l.dist == unknownDistance {
  3095  						continue
  3096  					}
  3097  					dist := int32(len(succ.Values)) + l.dist + branchDistance(b, succ)
  3098  					dist += numCalls[succ.ID] * unlikelyDistance
  3099  					val := live.get(l.ID)
  3100  					switch {
  3101  					case val == unknownDistance:
  3102  						unfinishedValues--
  3103  						fallthrough
  3104  					case dist < val:
  3105  						update = true
  3106  						live.set(l.ID, dist, l.pos)
  3107  					}
  3108  				}
  3109  			}
  3110  			if update {
  3111  				s.live[b.ID] = updateLive(live, s.live[b.ID])
  3112  			}
  3113  			if unfinishedValues > 0 {
  3114  				unfinishedBlocks[n] = b
  3115  				n++
  3116  			}
  3117  		}
  3118  		unfinishedBlocks = unfinishedBlocks[:n]
  3119  	}
  3120  
  3121  	s.computeDesired()
  3122  
  3123  	if f.pass.debug > regDebug {
  3124  		s.debugPrintLive("final", f, s.live, s.desired)
  3125  	}
  3126  }
  3127  
  3128  // computeDesired computes the desired register information at the end of each block.
  3129  // It is essentially a liveness analysis on machine registers instead of SSA values.
  3130  // The desired register information is stored in s.desired.
  3131  func (s *regAllocState) computeDesired() {
  3132  
  3133  	// TODO: Can we speed this up using the liveness information we have already
  3134  	// from computeLive?
  3135  	// TODO: Since we don't propagate information through phi nodes, can we do
  3136  	// this as a single dominator tree walk instead of the iterative solution?
  3137  	var desired desiredState
  3138  	f := s.f
  3139  	po := f.postorder()
  3140  	for {
  3141  		changed := false
  3142  		for _, b := range po {
  3143  			desired.copy(&s.desired[b.ID])
  3144  			for i := len(b.Values) - 1; i >= 0; i-- {
  3145  				v := b.Values[i]
  3146  				prefs := desired.remove(v.ID)
  3147  				if v.Op == OpPhi {
  3148  					// TODO: if v is a phi, save desired register for phi inputs.
  3149  					// For now, we just drop it and don't propagate
  3150  					// desired registers back through phi nodes.
  3151  					continue
  3152  				}
  3153  				regspec := s.regspec(v)
  3154  				// Cancel desired registers if they get clobbered.
  3155  				desired.clobber(regspec.clobbers)
  3156  				// Update desired registers if there are any fixed register inputs.
  3157  				for _, j := range regspec.inputs {
  3158  					if countRegs(j.regs) != 1 {
  3159  						continue
  3160  					}
  3161  					desired.clobber(j.regs)
  3162  					desired.add(v.Args[j.idx].ID, pickReg(j.regs))
  3163  				}
  3164  				// Set desired register of input 0 if this is a 2-operand instruction.
  3165  				if opcodeTable[v.Op].resultInArg0 || v.Op == OpAMD64ADDQconst || v.Op == OpAMD64ADDLconst || v.Op == OpSelect0 {
  3166  					// ADDQconst is added here because we want to treat it as resultInArg0 for
  3167  					// the purposes of desired registers, even though it is not an absolute requirement.
  3168  					// This is because we'd rather implement it as ADDQ instead of LEAQ.
  3169  					// Same for ADDLconst
  3170  					// Select0 is added here to propagate the desired register to the tuple-generating instruction.
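					// For example (hypothetical values): if x = ADDQconst [1] y and x is
					// desired in AX, recording AX as desired for y too makes it more likely
					// that y is allocated to AX, so the op can be emitted as ADDQ $1, AX
					// rather than LEAQ 1(CX), AX (with y left in some other register CX).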
  3171  					if opcodeTable[v.Op].commutative {
  3172  						desired.addList(v.Args[1].ID, prefs)
  3173  					}
  3174  					desired.addList(v.Args[0].ID, prefs)
  3175  				}
  3176  			}
  3177  			for _, e := range b.Preds {
  3178  				p := e.b
  3179  				changed = s.desired[p.ID].merge(&desired) || changed
  3180  			}
  3181  		}
  3182  		if !changed || (!s.loopnest.hasIrreducible && len(s.loopnest.loops) == 0) {
  3183  			break
  3184  		}
  3185  	}
  3186  }
  3187  
  3188  // updateLive replaces the contents of the live slice with the contents of t, reusing its storage when possible.
  3189  func updateLive(t *sparseMapPos, live []liveInfo) []liveInfo {
  3190  	live = live[:0]
  3191  	if cap(live) < t.size() {
  3192  		live = make([]liveInfo, 0, t.size())
  3193  	}
  3194  	for _, e := range t.contents() {
  3195  		live = append(live, liveInfo{e.key, e.val, e.pos})
  3196  	}
  3197  	return live
  3198  }
  3199  
  3200  // branchDistance calculates the distance between a block and a
  3201  // successor in pseudo-instructions. This is used to encode branch
  3202  // likelihood.
  3203  func branchDistance(b *Block, s *Block) int32 {
  3204  	if len(b.Succs) == 2 {
  3205  		if b.Succs[0].b == s && b.Likely == BranchLikely ||
  3206  			b.Succs[1].b == s && b.Likely == BranchUnlikely {
  3207  			return likelyDistance
  3208  		}
  3209  		if b.Succs[0].b == s && b.Likely == BranchUnlikely ||
  3210  			b.Succs[1].b == s && b.Likely == BranchLikely {
  3211  			return unlikelyDistance
  3212  		}
  3213  	}
  3214  	// Note: the branch distance must be at least 1 to distinguish the control
  3215  	// value use from the first user in a successor block.
  3216  	return normalDistance
  3217  }
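// For example: assuming likelyDistance < normalDistance < unlikelyDistance
// (as the names suggest), distances to uses reached through unlikely edges are
// inflated relative to uses reached through likely edges, so when the allocator
// must evict a register it prefers values that are only needed on unlikely paths.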
  3218  
  3219  func (s *regAllocState) debugPrintLive(stage string, f *Func, live [][]liveInfo, desired []desiredState) {
  3220  	fmt.Printf("%s: live values at end of each block: %s\n", stage, f.Name)
  3221  	for _, b := range f.Blocks {
  3222  		s.debugPrintLiveBlock(b, live[b.ID], &desired[b.ID])
  3223  	}
  3224  }
  3225  
  3226  func (s *regAllocState) debugPrintLiveBlock(b *Block, live []liveInfo, desired *desiredState) {
  3227  	fmt.Printf("  %s:", b)
  3228  	slices.SortFunc(live, func(a, b liveInfo) int {
  3229  		return cmp.Compare(a.ID, b.ID)
  3230  	})
  3231  	for _, x := range live {
  3232  		fmt.Printf(" v%d(%d)", x.ID, x.dist)
  3233  		for _, e := range desired.entries {
  3234  			if e.ID != x.ID {
  3235  				continue
  3236  			}
  3237  			fmt.Printf("[")
  3238  			first := true
  3239  			for _, r := range e.regs {
  3240  				if r == noRegister {
  3241  					continue
  3242  				}
  3243  				if !first {
  3244  					fmt.Printf(",")
  3245  				}
  3246  				fmt.Print(&s.registers[r])
  3247  				first = false
  3248  			}
  3249  			fmt.Printf("]")
  3250  		}
  3251  	}
  3252  	if avoid := desired.avoid; avoid != 0 {
  3253  		fmt.Printf(" avoid=%v", s.RegMaskString(avoid))
  3254  	}
  3255  	fmt.Println()
  3256  }
  3257  
  3258  // A desiredState represents desired register assignments.
  3259  type desiredState struct {
  3260  	// Desired assignments will be small, so we just use a list
  3261  	// of valueID+registers entries.
  3262  	entries []desiredStateEntry
  3263  	// Registers that other values want to be in.  This value will
  3264  	// contain at least the union of the regs fields of entries, but
  3265  	// may contain additional entries for values that were once in
  3266  	// this data structure but are no longer.
  3267  	avoid regMask
  3268  }
  3269  type desiredStateEntry struct {
  3270  	// (pre-regalloc) value
  3271  	ID ID
  3272  	// Registers it would like to be in, in priority order.
  3273  	// Unused slots are filled with noRegister.
  3274  	// For opcodes that return tuples, we track desired registers only
  3275  	// for the first element of the tuple (see desiredSecondReg for
  3276  	// tracking the desired register for second part of a tuple).
  3277  	regs [4]register
  3278  }
  3279  
  3280  // get returns a list of desired registers for value vid.
  3281  func (d *desiredState) get(vid ID) [4]register {
  3282  	for _, e := range d.entries {
  3283  		if e.ID == vid {
  3284  			return e.regs
  3285  		}
  3286  	}
  3287  	return [4]register{noRegister, noRegister, noRegister, noRegister}
  3288  }
  3289  
  3290  // add records that we'd like value vid to be in register r.
  3291  func (d *desiredState) add(vid ID, r register) {
  3292  	d.avoid |= regMask(1) << r
  3293  	for i := range d.entries {
  3294  		e := &d.entries[i]
  3295  		if e.ID != vid {
  3296  			continue
  3297  		}
  3298  		if e.regs[0] == r {
  3299  			// Already known and highest priority
  3300  			return
  3301  		}
  3302  		for j := 1; j < len(e.regs); j++ {
  3303  			if e.regs[j] == r {
  3304  				// Move from lower priority to top priority
  3305  				copy(e.regs[1:], e.regs[:j])
  3306  				e.regs[0] = r
  3307  				return
  3308  			}
  3309  		}
  3310  		copy(e.regs[1:], e.regs[:])
  3311  		e.regs[0] = r
  3312  		return
  3313  	}
  3314  	d.entries = append(d.entries, desiredStateEntry{vid, [4]register{r, noRegister, noRegister, noRegister}})
  3315  }
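// For example (illustrative): after add(v, AX) followed by add(v, CX), the
// entry for v is [CX, AX, noRegister, noRegister]: the most recently requested
// register gets the highest priority and earlier requests are shifted down.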
  3316  
// addList records that we'd like value vid to be in any of the registers
// in regs, which are given in priority order.
  3317  func (d *desiredState) addList(vid ID, regs [4]register) {
  3318  	// regs is in priority order, so iterate in reverse order.
  3319  	for i := len(regs) - 1; i >= 0; i-- {
  3320  		r := regs[i]
  3321  		if r != noRegister {
  3322  			d.add(vid, r)
  3323  		}
  3324  	}
  3325  }
  3326  
  3327  // clobber erases any desired registers in the set m.
  3328  func (d *desiredState) clobber(m regMask) {
  3329  	for i := 0; i < len(d.entries); {
  3330  		e := &d.entries[i]
  3331  		j := 0
  3332  		for _, r := range e.regs {
  3333  			if r != noRegister && m>>r&1 == 0 {
  3334  				e.regs[j] = r
  3335  				j++
  3336  			}
  3337  		}
  3338  		if j == 0 {
  3339  			// No more desired registers for this value.
  3340  			d.entries[i] = d.entries[len(d.entries)-1]
  3341  			d.entries = d.entries[:len(d.entries)-1]
  3342  			continue
  3343  		}
  3344  		for ; j < len(e.regs); j++ {
  3345  			e.regs[j] = noRegister
  3346  		}
  3347  		i++
  3348  	}
  3349  	d.avoid &^= m
  3350  }
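// For example (illustrative): if v's desired registers are [AX, CX, noRegister,
// noRegister] and AX is clobbered, the entry becomes [CX, noRegister, noRegister,
// noRegister]; if every desired register is clobbered, the entry is dropped entirely.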
  3351  
  3352  // copy copies a desired state from another desiredState x.
  3353  func (d *desiredState) copy(x *desiredState) {
  3354  	d.entries = append(d.entries[:0], x.entries...)
  3355  	d.avoid = x.avoid
  3356  }
  3357  
  3358  // remove removes the desired registers for vid and returns them.
  3359  func (d *desiredState) remove(vid ID) [4]register {
  3360  	for i := range d.entries {
  3361  		if d.entries[i].ID == vid {
  3362  			regs := d.entries[i].regs
  3363  			d.entries[i] = d.entries[len(d.entries)-1]
  3364  			d.entries = d.entries[:len(d.entries)-1]
  3365  			return regs
  3366  		}
  3367  	}
  3368  	return [4]register{noRegister, noRegister, noRegister, noRegister}
  3369  }
  3370  
  3371  // merge merges another desired state x into d. It reports whether the
  3372  // set of registers to avoid changed.
  3373  func (d *desiredState) merge(x *desiredState) bool {
  3374  	oldAvoid := d.avoid
  3375  	d.avoid |= x.avoid
  3376  	// There should only be a few desired registers, so
  3377  	// linear insert is ok.
  3378  	for _, e := range x.entries {
  3379  		d.addList(e.ID, e.regs)
  3380  	}
  3381  	return oldAvoid != d.avoid
  3382  }
  3383  
  3384  // computeUnavoidableCalls computes the containsUnavoidableCall fields in the loop nest.
  3385  func (loopnest *loopnest) computeUnavoidableCalls() {
  3386  	f := loopnest.f
  3387  
  3388  	hasCall := f.Cache.allocBoolSlice(f.NumBlocks())
  3389  	defer f.Cache.freeBoolSlice(hasCall)
  3390  	for _, b := range f.Blocks {
  3391  		if b.containsCall() {
  3392  			hasCall[b.ID] = true
  3393  		}
  3394  	}
  3395  	found := f.Cache.allocSparseSet(f.NumBlocks())
  3396  	defer f.Cache.freeSparseSet(found)
  3397  	// Run a DFS to find a path through the loop that avoids all calls.
  3398  	// Such a path either escapes the loop or returns back to the header.
  3399  	// It isn't enough for an exit to not be dominated by any call; for example:
  3400  	// ... some loop
  3401  	// call1    call2
  3402  	//   \       /
  3403  	//     block
  3404  	// ...
  3405  	// block is not dominated by any single call, but there is no call-free path to it.
  3406  loopLoop:
  3407  	for _, l := range loopnest.loops {
  3408  		found.clear()
  3409  		tovisit := make([]*Block, 0, 8)
  3410  		tovisit = append(tovisit, l.header)
  3411  		for len(tovisit) > 0 {
  3412  			cur := tovisit[len(tovisit)-1]
  3413  			tovisit = tovisit[:len(tovisit)-1]
  3414  			if hasCall[cur.ID] {
  3415  				continue
  3416  			}
  3417  			for _, s := range cur.Succs {
  3418  				nb := s.Block()
  3419  				if nb == l.header {
  3420  					// Found a call-free path around the loop.
  3421  					continue loopLoop
  3422  				}
  3423  				if found.contains(nb.ID) {
  3424  					// Already found via another path.
  3425  					continue
  3426  				}
  3427  				nl := loopnest.b2l[nb.ID]
  3428  				if nl == nil || (nl.depth <= l.depth && nl != l) {
  3429  					// Left the loop.
  3430  					continue
  3431  				}
  3432  				tovisit = append(tovisit, nb)
  3433  				found.add(nb.ID)
  3434  			}
  3435  		}
  3436  		// No call-free path was found.
  3437  		l.containsUnavoidableCall = true
  3438  	}
  3439  }
  3440  
  3441  func (b *Block) containsCall() bool {
  3442  	if b.Kind == BlockDefer {
  3443  		return true
  3444  	}
  3445  	for _, v := range b.Values {
  3446  		if opcodeTable[v.Op].call {
  3447  			return true
  3448  		}
  3449  	}
  3450  	return false
  3451  }
  3452  
