// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package ssa

import (
	"fmt"
	"internal/goarch"
	"slices"
)

var truthTableValues [3]uint8 = [3]uint8{0b1111_0000, 0b1100_1100, 0b1010_1010}

func (slop SIMDLogicalOP) String() string {
	if slop == sloInterior {
		return "leaf"
	}
	interior := ""
	if slop&sloInterior != 0 {
		interior = "+interior"
	}
	switch slop &^ sloInterior {
	case sloAnd:
		return "and" + interior
	case sloXor:
		return "xor" + interior
	case sloOr:
		return "or" + interior
	case sloAndNot:
		return "andNot" + interior
	case sloNot:
		return "not" + interior
	}
	return "wrong"
}

func rewriteTern(f *Func) {
	if f.maxCPUFeatures == CPUNone {
		return
	}

	arch := f.Config.Ctxt().Arch.Family
	// TODO there are other SIMD architectures
	if arch != goarch.AMD64 {
		return
	}

	boolExprTrees := make(map[*Value]SIMDLogicalOP)

	// Find logical-expr expression trees, including leaves.
	// interior nodes will be marked sloInterior,
	// root nodes will not be marked sloInterior,
	// leaf nodes are only marked sloInterior.
	for _, b := range f.Blocks {
		for _, v := range b.Values {
			slo := classifyBooleanSIMD(v)
			switch slo {
			case sloOr,
				sloAndNot,
				sloXor,
				sloAnd:
				boolExprTrees[v.Args[1]] |= sloInterior
				fallthrough
			case sloNot:
				boolExprTrees[v.Args[0]] |= sloInterior
				boolExprTrees[v] |= slo
			}
		}
	}

	// get a canonical sorted set of roots
	var roots []*Value
	for v, slo := range boolExprTrees {
		if f.pass.debug > 1 {
			f.Warnl(v.Pos, "%s has SLO %v", v.LongString(), slo)
		}

		if slo&sloInterior == 0 && v.Block.CPUfeatures.hasFeature(CPUavx512) {
			roots = append(roots, v)
		}
	}
	slices.SortFunc(roots, func(u, v *Value) int { return int(u.ID - v.ID) }) // IDs are small enough to not care about overflow.

	// This rewrite works by iterating over the root set.
	// For each boolean expression, it walks the expression
	// bottom up accumulating sets of variables mentioned in
	// subexpressions, lazy-greedily finding the largest subexpressions
	// of 3 inputs that can be rewritten to use ternary-truth-table instructions.

	// rewrite recursively attempts to replace v and v's subexpressions with
	// ternary-logic truth-table operations, returning a set of not more than 3
	// subexpressions within v that may be combined into a parent's replacement.
	// V need not have the CPU features that allow a ternary-logic operation;
	// in that case, v will not be rewritten.  Replacements also require
	// exactly 3 different variable inputs to a boolean expression.
	//
	// Given the CPU feature and 3 inputs, v is replaced in the following
	// cases:
	//
	// 1) v is a root
	// 2) u = NOT(v) and u lacks the CPU feature
	// 3) u = OP(v, w) and u lacks the CPU feature
	// 4) u = OP(v, w) and u has more than 3 variable inputs.	var rewrite func(v *Value) [3]*Value
	var rewrite func(v *Value) [3]*Value

	// computeTT returns the truth table for a boolean expression
	// over the variables in vars, where vars[0] varies slowest in
	// the truth table and vars[2] varies fastest.
	// e.g. computeTT( "and(x, or(y, not(z)))", {x,y,z} ) returns
	// (bit 0 first) 0 0 0 0 1 0 1 1 = (reversed) 1101_0000 = 0xD0
	//            x: 0 0 0 0 1 1 1 1
	//            y: 0 0 1 1 0 0 1 1
	//            z: 0 1 0 1 0 1 0 1
	var computeTT func(v *Value, vars [3]*Value) uint8

	// combine two sets of variables into one, returning ok/not
	// if the two sets contained 3 or fewer elements.  Combine
	// ensures that the sets of Values never contain duplicates.
	// (Duplicates would create less-efficient code, not incorrect code.)
	combine := func(a, b [3]*Value) ([3]*Value, bool) {
		var c [3]*Value
		i := 0
		for _, v := range a {
			if v == nil {
				break
			}
			c[i] = v
			i++
		}
	bloop:
		for _, v := range b {
			if v == nil {
				break
			}
			for _, u := range a {
				if v == u {
					continue bloop
				}
			}
			if i == 3 {
				return [3]*Value{}, false
			}
			c[i] = v
			i++
		}
		return c, true
	}

	computeTT = func(v *Value, vars [3]*Value) uint8 {
		i := 0
		for ; i < len(vars); i++ {
			if vars[i] == v {
				return truthTableValues[i]
			}
		}
		slo := boolExprTrees[v] &^ sloInterior
		a := computeTT(v.Args[0], vars)
		switch slo {
		case sloNot:
			return ^a
		case sloAnd:
			return a & computeTT(v.Args[1], vars)
		case sloXor:
			return a ^ computeTT(v.Args[1], vars)
		case sloOr:
			return a | computeTT(v.Args[1], vars)
		case sloAndNot:
			return a & ^computeTT(v.Args[1], vars)
		}
		panic("switch should have covered all cases, or unknown var in logical expression")
	}

	replace := func(a0 *Value, vars0 [3]*Value) {
		imm := computeTT(a0, vars0)
		op := ternOpForLogical(a0.Op)
		if op == a0.Op {
			panic(fmt.Errorf("should have mapped away from input op, a0 is %s", a0.LongString()))
		}
		if f.pass.debug > 0 {
			f.Warnl(a0.Pos, "Rewriting %s into %v of 0b%b %v %v %v", a0.LongString(), op, imm,
				vars0[0], vars0[1], vars0[2])
		}
		a0.reset(op)
		a0.SetArgs3(vars0[0], vars0[1], vars0[2])
		a0.AuxInt = int64(int8(imm))
	}

	// addOne ensures the no-duplicates addition of a single value
	// to a set that is not full.  It seems possible that a shared
	// subexpression in tricky combination with blocks lacking the
	// AVX512 feature might permit this.
	addOne := func(vars [3]*Value, v *Value) [3]*Value {
		if vars[2] != nil {
			panic("rewriteTern.addOne, vars[2] should be nil")
		}
		if v == vars[0] || v == vars[1] {
			return vars
		}
		if vars[1] == nil {
			vars[1] = v
		} else {
			vars[2] = v
		}
		return vars
	}

	rewrite = func(v *Value) [3]*Value {
		slo := boolExprTrees[v]
		if slo == sloInterior { // leaf node, i.e., a "variable"
			return [3]*Value{v, nil, nil}
		}
		var vars [3]*Value
		hasFeature := v.Block.CPUfeatures.hasFeature(CPUavx512)
		if slo&sloNot == sloNot {
			vars = rewrite(v.Args[0])
			if !hasFeature {
				if vars[2] != nil {
					replace(v.Args[0], vars)
					return [3]*Value{v, nil, nil}
				}
				return vars
			}
		} else {
			var ok bool
			a0, a1 := v.Args[0], v.Args[1]
			vars0 := rewrite(a0)
			vars1 := rewrite(a1)
			vars, ok = combine(vars0, vars1)

			if f.pass.debug > 1 {
				f.Warnl(a0.Pos, "combine(%v, %v) -> %v, %v", vars0, vars1, vars, ok)
			}

			if !(ok && v.Block.CPUfeatures.hasFeature(CPUavx512)) {
				// too many variables, or cannot rewrite current values.
				// rewrite one or both subtrees if possible
				if vars0[2] != nil && a0.Block.CPUfeatures.hasFeature(CPUavx512) {
					replace(a0, vars0)
				}
				if vars1[2] != nil && a1.Block.CPUfeatures.hasFeature(CPUavx512) {
					replace(a1, vars1)
				}

				// 3-element var arrays are either rewritten, or unable to be rewritten
				// because of the features in effect in their block.  Either way, they
				// are treated as a "new var" if 3 elements are present.

				if vars0[2] == nil {
					if vars1[2] == nil {
						// both subtrees are 2-element and were not rewritten.
						//
						// TODO a clever person would look at subtrees of inputs,
						// e.g. rewrite
						//        ((a AND b) XOR b) XOR (d  XOR (c AND d))
						// to    (((a AND b) XOR b) XOR  d) XOR (c AND d)
						// to v = TERNLOG(truthtable, a, b, d) XOR (c AND d)
						// and return the variable set {v, c, d}
						//
						// But for now, just restart with a0 and a1.
						return [3]*Value{a0, a1, nil}
					} else {
						// a1 (maybe) rewrote, a0 has room for another var
						vars = addOne(vars0, a1)
					}
				} else if vars1[2] == nil {
					// a0 (maybe) rewrote, a1 has room for another var
					vars = addOne(vars1, a0)
				} else if !ok {
					// both (maybe) rewrote
					// a0 and a1 are different because otherwise their variable
					// sets would have combined "ok".
					return [3]*Value{a0, a1, nil}
				}
				// continue with either the vars from "ok" or the updated set of vars.
			}
		}
		// if root and 3 vars and hasFeature, rewrite.
		if slo&sloInterior == 0 && vars[2] != nil && hasFeature {
			replace(v, vars)
			return [3]*Value{v, nil, nil}
		}
		return vars
	}

	for _, v := range roots {
		if f.pass.debug > 1 {
			f.Warnl(v.Pos, "SLO root %s", v.LongString())
		}
		rewrite(v)
	}
}