Source file src/cmd/compile/internal/ssa/rewritetern.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package ssa
     6  
     7  import (
     8  	"fmt"
     9  	"internal/goarch"
    10  	"slices"
    11  )
    12  
    13  var truthTableValues [3]uint8 = [3]uint8{0b1111_0000, 0b1100_1100, 0b1010_1010}
    14  
    15  func (slop SIMDLogicalOP) String() string {
    16  	if slop == sloInterior {
    17  		return "leaf"
    18  	}
    19  	interior := ""
    20  	if slop&sloInterior != 0 {
    21  		interior = "+interior"
    22  	}
    23  	switch slop &^ sloInterior {
    24  	case sloAnd:
    25  		return "and" + interior
    26  	case sloXor:
    27  		return "xor" + interior
    28  	case sloOr:
    29  		return "or" + interior
    30  	case sloAndNot:
    31  		return "andNot" + interior
    32  	case sloNot:
    33  		return "not" + interior
    34  	}
    35  	return "wrong"
    36  }
    37  
    38  func rewriteTern(f *Func) {
    39  	if f.maxCPUFeatures == CPUNone {
    40  		return
    41  	}
    42  
    43  	arch := f.Config.Ctxt().Arch.Family
    44  	// TODO there are other SIMD architectures
    45  	if arch != goarch.AMD64 {
    46  		return
    47  	}
    48  
    49  	boolExprTrees := make(map[*Value]SIMDLogicalOP)
    50  
    51  	// Find logical-expr expression trees, including leaves.
    52  	// interior nodes will be marked sloInterior,
    53  	// root nodes will not be marked sloInterior,
    54  	// leaf nodes are only marked sloInterior.
    55  	for _, b := range f.Blocks {
    56  		for _, v := range b.Values {
    57  			slo := classifyBooleanSIMD(v)
    58  			switch slo {
    59  			case sloOr,
    60  				sloAndNot,
    61  				sloXor,
    62  				sloAnd:
    63  				boolExprTrees[v.Args[1]] |= sloInterior
    64  				fallthrough
    65  			case sloNot:
    66  				boolExprTrees[v.Args[0]] |= sloInterior
    67  				boolExprTrees[v] |= slo
    68  			}
    69  		}
    70  	}
    71  
    72  	// get a canonical sorted set of roots
    73  	var roots []*Value
    74  	for v, slo := range boolExprTrees {
    75  		if f.pass.debug > 1 {
    76  			f.Warnl(v.Pos, "%s has SLO %v", v.LongString(), slo)
    77  		}
    78  
    79  		if slo&sloInterior == 0 && v.Block.CPUfeatures.hasFeature(CPUavx512) {
    80  			roots = append(roots, v)
    81  		}
    82  	}
    83  	slices.SortFunc(roots, func(u, v *Value) int { return int(u.ID - v.ID) }) // IDs are small enough to not care about overflow.
    84  
    85  	// This rewrite works by iterating over the root set.
    86  	// For each boolean expression, it walks the expression
    87  	// bottom up accumulating sets of variables mentioned in
    88  	// subexpressions, lazy-greedily finding the largest subexpressions
    89  	// of 3 inputs that can be rewritten to use ternary-truth-table instructions.
    90  
    91  	// rewrite recursively attempts to replace v and v's subexpressions with
    92  	// ternary-logic truth-table operations, returning a set of not more than 3
    93  	// subexpressions within v that may be combined into a parent's replacement.
    94  	// V need not have the CPU features that allow a ternary-logic operation;
    95  	// in that case, v will not be rewritten.  Replacements also require
    96  	// exactly 3 different variable inputs to a boolean expression.
    97  	//
    98  	// Given the CPU feature and 3 inputs, v is replaced in the following
    99  	// cases:
   100  	//
   101  	// 1) v is a root
   102  	// 2) u = NOT(v) and u lacks the CPU feature
   103  	// 3) u = OP(v, w) and u lacks the CPU feature
   104  	// 4) u = OP(v, w) and u has more than 3 variable inputs.	var rewrite func(v *Value) [3]*Value
   105  	var rewrite func(v *Value) [3]*Value
   106  
   107  	// computeTT returns the truth table for a boolean expression
   108  	// over the variables in vars, where vars[0] varies slowest in
   109  	// the truth table and vars[2] varies fastest.
   110  	// e.g. computeTT( "and(x, or(y, not(z)))", {x,y,z} ) returns
   111  	// (bit 0 first) 0 0 0 0 1 0 1 1 = (reversed) 1101_0000 = 0xD0
   112  	//            x: 0 0 0 0 1 1 1 1
   113  	//            y: 0 0 1 1 0 0 1 1
   114  	//            z: 0 1 0 1 0 1 0 1
   115  	var computeTT func(v *Value, vars [3]*Value) uint8
   116  
   117  	// combine two sets of variables into one, returning ok/not
   118  	// if the two sets contained 3 or fewer elements.  Combine
   119  	// ensures that the sets of Values never contain duplicates.
   120  	// (Duplicates would create less-efficient code, not incorrect code.)
   121  	combine := func(a, b [3]*Value) ([3]*Value, bool) {
   122  		var c [3]*Value
   123  		i := 0
   124  		for _, v := range a {
   125  			if v == nil {
   126  				break
   127  			}
   128  			c[i] = v
   129  			i++
   130  		}
   131  	bloop:
   132  		for _, v := range b {
   133  			if v == nil {
   134  				break
   135  			}
   136  			for _, u := range a {
   137  				if v == u {
   138  					continue bloop
   139  				}
   140  			}
   141  			if i == 3 {
   142  				return [3]*Value{}, false
   143  			}
   144  			c[i] = v
   145  			i++
   146  		}
   147  		return c, true
   148  	}
   149  
   150  	computeTT = func(v *Value, vars [3]*Value) uint8 {
   151  		i := 0
   152  		for ; i < len(vars); i++ {
   153  			if vars[i] == v {
   154  				return truthTableValues[i]
   155  			}
   156  		}
   157  		slo := boolExprTrees[v] &^ sloInterior
   158  		a := computeTT(v.Args[0], vars)
   159  		switch slo {
   160  		case sloNot:
   161  			return ^a
   162  		case sloAnd:
   163  			return a & computeTT(v.Args[1], vars)
   164  		case sloXor:
   165  			return a ^ computeTT(v.Args[1], vars)
   166  		case sloOr:
   167  			return a | computeTT(v.Args[1], vars)
   168  		case sloAndNot:
   169  			return a & ^computeTT(v.Args[1], vars)
   170  		}
   171  		panic("switch should have covered all cases, or unknown var in logical expression")
   172  	}
   173  
   174  	replace := func(a0 *Value, vars0 [3]*Value) {
   175  		imm := computeTT(a0, vars0)
   176  		op := ternOpForLogical(a0.Op)
   177  		if op == a0.Op {
   178  			panic(fmt.Errorf("should have mapped away from input op, a0 is %s", a0.LongString()))
   179  		}
   180  		if f.pass.debug > 0 {
   181  			f.Warnl(a0.Pos, "Rewriting %s into %v of 0b%b %v %v %v", a0.LongString(), op, imm,
   182  				vars0[0], vars0[1], vars0[2])
   183  		}
   184  		a0.reset(op)
   185  		a0.SetArgs3(vars0[0], vars0[1], vars0[2])
   186  		a0.AuxInt = int64(int8(imm))
   187  	}
   188  
   189  	// addOne ensures the no-duplicates addition of a single value
   190  	// to a set that is not full.  It seems possible that a shared
   191  	// subexpression in tricky combination with blocks lacking the
   192  	// AVX512 feature might permit this.
   193  	addOne := func(vars [3]*Value, v *Value) [3]*Value {
   194  		if vars[2] != nil {
   195  			panic("rewriteTern.addOne, vars[2] should be nil")
   196  		}
   197  		if v == vars[0] || v == vars[1] {
   198  			return vars
   199  		}
   200  		if vars[1] == nil {
   201  			vars[1] = v
   202  		} else {
   203  			vars[2] = v
   204  		}
   205  		return vars
   206  	}
   207  
   208  	rewrite = func(v *Value) [3]*Value {
   209  		slo := boolExprTrees[v]
   210  		if slo == sloInterior { // leaf node, i.e., a "variable"
   211  			return [3]*Value{v, nil, nil}
   212  		}
   213  		var vars [3]*Value
   214  		hasFeature := v.Block.CPUfeatures.hasFeature(CPUavx512)
   215  		if slo&sloNot == sloNot {
   216  			vars = rewrite(v.Args[0])
   217  			if !hasFeature {
   218  				if vars[2] != nil {
   219  					replace(v.Args[0], vars)
   220  					return [3]*Value{v, nil, nil}
   221  				}
   222  				return vars
   223  			}
   224  		} else {
   225  			var ok bool
   226  			a0, a1 := v.Args[0], v.Args[1]
   227  			vars0 := rewrite(a0)
   228  			vars1 := rewrite(a1)
   229  			vars, ok = combine(vars0, vars1)
   230  
   231  			if f.pass.debug > 1 {
   232  				f.Warnl(a0.Pos, "combine(%v, %v) -> %v, %v", vars0, vars1, vars, ok)
   233  			}
   234  
   235  			if !(ok && v.Block.CPUfeatures.hasFeature(CPUavx512)) {
   236  				// too many variables, or cannot rewrite current values.
   237  				// rewrite one or both subtrees if possible
   238  				if vars0[2] != nil && a0.Block.CPUfeatures.hasFeature(CPUavx512) {
   239  					replace(a0, vars0)
   240  				}
   241  				if vars1[2] != nil && a1.Block.CPUfeatures.hasFeature(CPUavx512) {
   242  					replace(a1, vars1)
   243  				}
   244  
   245  				// 3-element var arrays are either rewritten, or unable to be rewritten
   246  				// because of the features in effect in their block.  Either way, they
   247  				// are treated as a "new var" if 3 elements are present.
   248  
   249  				if vars0[2] == nil {
   250  					if vars1[2] == nil {
   251  						// both subtrees are 2-element and were not rewritten.
   252  						//
   253  						// TODO a clever person would look at subtrees of inputs,
   254  						// e.g. rewrite
   255  						//        ((a AND b) XOR b) XOR (d  XOR (c AND d))
   256  						// to    (((a AND b) XOR b) XOR  d) XOR (c AND d)
   257  						// to v = TERNLOG(truthtable, a, b, d) XOR (c AND d)
   258  						// and return the variable set {v, c, d}
   259  						//
   260  						// But for now, just restart with a0 and a1.
   261  						return [3]*Value{a0, a1, nil}
   262  					} else {
   263  						// a1 (maybe) rewrote, a0 has room for another var
   264  						vars = addOne(vars0, a1)
   265  					}
   266  				} else if vars1[2] == nil {
   267  					// a0 (maybe) rewrote, a1 has room for another var
   268  					vars = addOne(vars1, a0)
   269  				} else if !ok {
   270  					// both (maybe) rewrote
   271  					// a0 and a1 are different because otherwise their variable
   272  					// sets would have combined "ok".
   273  					return [3]*Value{a0, a1, nil}
   274  				}
   275  				// continue with either the vars from "ok" or the updated set of vars.
   276  			}
   277  		}
   278  		// if root and 3 vars and hasFeature, rewrite.
   279  		if slo&sloInterior == 0 && vars[2] != nil && hasFeature {
   280  			replace(v, vars)
   281  			return [3]*Value{v, nil, nil}
   282  		}
   283  		return vars
   284  	}
   285  
   286  	for _, v := range roots {
   287  		if f.pass.debug > 1 {
   288  			f.Warnl(v.Pos, "SLO root %s", v.LongString())
   289  		}
   290  		rewrite(v)
   291  	}
   292  }
   293  

View as plain text