/*	scheduler.c

	Tuple scheduler and register allocator.
*/



#undef DEBUG
#undef DEBUG_BEFOREBB
#undef DEBUG_ENDBB
#undef DEBUG_SEARCH
#undef DEBUG_PERF
#undef DEBUG_TUPLIST
#undef DEBUG_m2rmode
#undef DEBUG_SYMNAME
#undef DEBUG_SCHED
#undef DEBUG_FSCHED
#undef DEBUG_CODEGEN

#undef NEWSTUFF

#include <time.h>

#include "stdpccts.h"
#include "swartypes.h"
#include "tuple.h"
#include "oputils.h"
#include "scheduler.h"
#include "cpool.h"
#include "spool.h"
#include "showir.h"
#include "output.h"
#include "messages.h"

extern char     *funcname;      /* current function name */


/* Backing storage for the tuple buffer; the rest of this file reaches
   it only through "tup" below. */
static tuple	safetup[MAXTUP];	/* tuple buffer */
/* tup starts one element into safetup, so tup[-1] aliases safetup[0]
   and the highest in-bounds index is MAXTUP-2.
   NOTE(review): presumably the offset keeps an index of -1 ("no tuple")
   inside the array -- confirm against the callers. */
tuple	*tup = &(safetup[1]);	/* tuple buffer */
int	tupsp;			/* tuple stack pointer */

/* sched[k] is the tuple number placed in schedule slot k; the loops in
   this file treat slots 0..schedsp-1 as the valid ones. */
static int	sched[MAXTUP];		/* tuple schedule */
static int	schedsp = 0;

/* Value of sbestval while no complete schedule has been recorded */
#define NOTFOUND -1
static int	sbest[MAXTUP];		/* best schedule so far */
static int	sbestval = NOTFOUND;	/* value of best schedule */
static int	schedval = 0;		/* value of current schedule */

/* Compared against time(0), so this is a wall-clock deadline */
static int	schedtim;		/* time to stop scheduler */


void
begin_bb(void)
{
	/* Start a new basic block: no fields counted yet and an
	   empty tuple stack.
	*/
	fieldcount = 0;
	tupsp = 0;
}

static void
incref(int i)
{
	/* Increment the reference count of tuple i.  The first time a
	   tuple becomes referenced (refs goes 0 -> 1) its operands
	   arg[0..2] become live too, so they are incremented
	   recursively.  Negative tuple numbers (-1 = no operand) are
	   ignored.
	*/
	int a;

	/* tup points at safetup[1], so the last usable index is
	   MAXTUP-2; tup[MAXTUP-1] would already be one past the end
	   of the buffer.
	*/
	if (i >= MAXTUP - 1) {
		bug("tuple number is greater than MAXTUP in tuple.h");
		/* Other callers in this file carry on after bug(), so it
		   may return: do not touch tup[i] in that case.
		*/
		return;
	}

	if (i < 0) return;

	/* Already live: its operands were counted the first time */
	if (++(tup[i].refs) != 1) return;

	/* Prerequisite tuples are live */
	for (a = 0; a < 3; ++a) {
		if (tup[i].arg[a] != -1) incref(tup[i].arg[a]);
	}
}

static int
before_bb(int here,
int there,
int init)
{
	/* Returns 1 if tuple "here" is a prerequisite for tuple "there":
	   "there" is referenced (refs >= 1) and "here" is either "there"
	   itself or reachable from it through the arg[] links.
	   Returns 0 otherwise, including when "there" is negative
	   (-1 = no tuple).

	   When "init" is nonzero the memo table is reset for tuples
	   0..tupsp-1 and 0 is returned; the incoming "here" and "there"
	   values are ignored.

	   Side effect: the arg[] slots that the operand count of
	   tup[there].op leaves unused are forced to -1.
	*/
	static int cache[MAXTUP][MAXTUP];	/* -1 = not computed yet */
	int rval;

	#ifdef DEBUG_BEFOREBB
		info(0, "Start before_bb");
	#endif

	if (init) {
		#ifdef DEBUG_BEFOREBB
			info(0, "before_bb():start init loop");
		#endif

		for (here=0; here<tupsp; ++here)
			for (there=0; there<tupsp; ++there)
				cache[here][there] = -1;

		#ifdef DEBUG_BEFOREBB
			info(0, "before_bb():end init loop");
		#endif

		return 0;
	}

	/* A missing operand can never have a prerequisite.  Answer this
	   before touching the memo table: cache[here][-1] lies outside
	   row "here" (and outside the table altogether when here is 0).
	*/
	if (there < 0) return(0);

	if (cache[here][there] != -1) return(cache[here][there]);

	/* Unreferenced tuples have no prerequisites */
	if (tup[there].refs < 1) {
		cache[here][there] = 0;
		return(0);
	}

	#ifdef DEBUG_BEFOREBB
		info(0, "before_bb: a");
	#endif
	if (here == there) {
		cache[here][there] = 1;
		return(1);
	}

	#ifdef DEBUG_BEFOREBB
		info(0, "before_bb: b");
	#endif
	switch (tup[there].op) {
	case TPERM:
		/* Trinary op */
		#ifdef DEBUG_BEFOREBB
			info(0, "before_bb: c");
		#endif
		rval = (before_bb(here, tup[there].arg[0], 0) ||
		        before_bb(here, tup[there].arg[1], 0) ||
		        before_bb(here, tup[there].arg[2], 0));
		cache[here][there] = rval;
		return rval;
	case ADD:
	case ADDH:
	case AND:
	case ANDN:
	case AVG:
	case DIV:
	case EQ:
	case EQ_C:
	case GE:
	case GT:
	case GT_C:
	case MOD:
	case MUL:
	case MULH:
	case MULEVEN:
	case MULODD:
	case OR:
	case SHL:
	case SHLBIT:
	case SHLBYTE:
	case SHR:
	case SHRBIT:
	case SHRBYTE:
	case SUB:
	case XOR:
	case PACK:
	case PACKS2U:
	case INTRLVLOW:
	case INTRLVHIGH:
	case INTRLVEVEN:
	case INTRLVODD:
	case PERM:
	case MAX:
	case MIN:
	case RCP1:
	case RCP2:
	case REPL:
	case STORER:
	case STORERR:
		/* Binary op */
		#ifdef DEBUG_BEFOREBB
			info(0, "before_bb: d");
		#endif
		tup[there].arg[2] = -1;
		rval = (before_bb(here, tup[there].arg[0], 0) ||
		        before_bb(here, tup[there].arg[1], 0));
		cache[here][there] = rval;
		return rval;
	case I2F:
	case F2I:
	case LNOT:
	case NEG:
	case NOT:
	case LEA:
	case LOADR:
	case LOADRR:
	case STORE:
	case RCP:
	case UNPACKH:
	case UNPACKL:
		/* Unary op */
		#ifdef DEBUG_BEFOREBB
			info(0, "before_bb: e");
		#endif
		tup[there].arg[1] = -1;
		tup[there].arg[2] = -1;
		rval = before_bb(here, tup[there].arg[0], 0);
		cache[here][there] = rval;
		return rval;

	default:
		{
			char buf[64];
			snprintf(buf,
				sizeof(buf),
				"before_bb op=%s (this cannot happen?)",
				opname(tup[there].op));
			bug(buf);
		}
		/* Fall through: treat an unknown op as a leaf */
	case LOAD:
	case LVSL:
	case NUM:
		/* Leaf op: no operands at all */
		tup[there].arg[0] = -1;
		tup[there].arg[1] = -1;
		tup[there].arg[2] = -1;
		break;
	}

	#ifdef DEBUG_BEFOREBB
		info(0, "before_bb: f");
	#endif
	cache[here][there] = 0;
	return(0);
}

int
regsavail(void)
{
	/* Number of registers offered by the selected target:
	   32 when optcpu has the CPU_AltiVec bit set or is exactly
	   GenericIA32, 8 for every other target.
	*/
	int wide = ((optcpu & CPU_AltiVec) || (optcpu == GenericIA32));

	return (wide ? 32 : 8);
}

static int
pickreg(register int pos)
{
	/* Find and use a free register for the tuple in schedule slot pos.
	   Returns 0 if a register was found, 1 if not.

	   Register numbers 0..regsavail()-1 and the spill
	   pseudo-registers above them are searched as one index space;
	   the search stops at numspills+regsavail().

	   On success, for LOAD/LVSL/NUM the tuple itself receives the
	   chosen register.  For any other op the tuple takes over its
	   first argument's register, the first argument is moved to the
	   chosen register, and that register is recorded in .antidep.
	*/

	register int i, j, k;
	/* NOTE(review): the tables are sized with maxspills() but searched
	   up to numspills -- presumably numspills <= maxspills(); confirm. */
	int regbusy[maxspills()+regsavail()];	/* -1 if not in use */
	int regwho[maxspills()+regsavail()];	/* Tuple using this register */
	/* Starts as an index no register has, i.e. "exclude nothing" */
	register int dontuse = maxspills()+regsavail();

	/* Mark all registers as not busy and never used */
	for (i=0; i<maxspills()+regsavail(); ++i) {
		regbusy[i] = -1;
		regwho[i] = -1;
	}

	/* If there is a 1st argument to this tuple, don't use its register? */
	if (tup[ sched[pos] ].arg[0] != -1) {
		dontuse = tup[ tup[ sched[pos] ].arg[0] ].reg;
	}

	/* Find/Store information about previous tuples in the list */
	/* NOTE(review): the .reg values below index regbusy/regwho without
	   a check for -1 -- presumably every argument already holds a
	   register by the time it is referenced; confirm. */
	for (i=0; i<=pos; ++i) {
		/* Mark busy registers with the time (i.e. sched[]) slot in
		   which they are used */
		if ((tup[ sched[i] ].reg != -1) &&
		    (tup[ sched[i] ].trefs > 0)) {
			regbusy[ tup[ sched[i] ].reg ] = i;
		}

		/* Record in regwho the FIRST slot that reads each register:
		   an entry is written only while it is still -1 */
		if ((tup[ sched[i] ].arg[0] != -1) &&
		    (regwho[ tup[ tup[ sched[i] ].arg[0] ].reg ] == -1)) {
			regwho[ tup[ tup[ sched[i] ].arg[0] ].reg ] = i;
		}
		if ((tup[ sched[i] ].arg[1] != -1) &&
		    (regwho[ tup[ tup[ sched[i] ].arg[1] ].reg ] == -1)) {
			regwho[ tup[ tup[ sched[i] ].arg[1] ].reg ] = i;
		}
		if ((tup[ sched[i] ].arg[2] != -1) &&
		    (regwho[ tup[ tup[ sched[i] ].arg[2] ].reg ] == -1)) {
			regwho[ tup[ tup[ sched[i] ].arg[2] ].reg ] = i;
		}
	}

	/* Set j to the free register with the smallest regwho value.
	   The comparison is strict, so ties keep the lower-numbered
	   register; registers never read (regwho == -1) beat all others. */
	/* Uses lowest numbered when all are free */
	k = pos;
	j = numspills+regsavail();
	for (i=0; i<numspills+regsavail(); ++i) {
		if (regbusy[i] == -1) {
			if ((regwho[i] < k) &&
			    (dontuse != i)) {
				j = i;
				k = regwho[i];
			}
		}
	}

	/* If j is a valid register or pseudoregister, use it */
	if (j < numspills+regsavail()) {
		switch (tup[ sched[pos] ].op) {
		case LOAD:
		case LVSL:
		case NUM:
			/* Use j for this tuple */
			tup[ sched[pos] ].reg = j;
			return(0);
		default:
			/* Give j to the first argument and use its
			   register instead */
			tup[ sched[pos] ].reg =
			  tup[ tup[ sched[pos] ].arg[0] ].reg;
			tup[ tup[ sched[pos] ].arg[0] ].reg = j;
			tup[ sched[pos] ].antidep = j;
			return(0);
		}
	}

	/* If j is not valid, no registers are available */
	return(1);
}

static int
argregs(register int n)
{
	/* Allocate the result register for the tuple in position n of
	   the candidate schedule and consume one temporary reference
	   (trefs) of each of its operands.
	   Returns 0 if register allocation was okay.
	   Returns 1 if pickreg() could not find a register.

	   sched_perm() calls unregs(n) after a failed argregs(n) as
	   well as after a successful one, and unregs() gives one
	   reference back to EVERY operand.  The operand counts are
	   therefore consumed here on the failure path too; returning
	   early would leave arg[1]/arg[2] one reference too high after
	   each failed attempt.
	*/

	register int s = sched[n];
	int failed = 0;

	/* Temporary reference count.  We'll decrement this as we allocate
	   time slots to referencing tuples.
	*/
	tup[s].trefs = tup[s].refs;

	/* Clear previous register assignment info */
	tup[s].oreg = -1;
	tup[s].reg = -1;

	if ((tup[s].op==STORE) || (tup[s].op==STORER) || (tup[s].op==STORERR)) {
		/* stores do not need registers (to hold the result?) */
		--(tup[ tup[s].arg[0] ].trefs);
	} else {
		tup[s].antidep = -1;

		if (tup[s].arg[0] != -1) {
			/* Plan to use arg0's register... */
			tup[s].reg = tup[ tup[s].arg[0] ].reg;

			if (--(tup[ tup[s].arg[0] ].trefs) > 0) {
				/* ...but if it is referenced by later tuples,
				   pick a new register */
				failed = pickreg(n);
			}
		} else {
			failed = pickreg(n);
		}

		/* free registers that were not overwritten; this must
		   stay after pickreg() so the operands still count as
		   busy while a register is being chosen */
		if (tup[s].arg[1] != -1) {
			--(tup[ tup[s].arg[1] ].trefs);
		}
		if (tup[s].arg[2] != -1) {
			--(tup[ tup[s].arg[2] ].trefs);
		}

		/* oreg stays -1 when no register could be found */
		if (failed) return(1);
	}
	tup[s].oreg = tup[s].reg;

	return(0);
}

static void
unregs(register int n)
{
	/* Undo the reference counting done for schedule position n:
	   every operand gets one temporary reference back, and a first
	   operand that was renamed (antidep set) gets the register
	   saved in oreg.
	*/
	register int t = sched[n];
	int is_store = ((tup[t].op==STORE) ||
			(tup[t].op==STORER) ||
			(tup[t].op==STORERR));

	if (is_store) {
		/* stores only ever consumed their first operand */
		++(tup[ tup[t].arg[0] ].trefs);
		return;
	}

	/* unfree registers not overwritten */
	if (tup[t].arg[1] != -1) ++(tup[ tup[t].arg[1] ].trefs);
	if (tup[t].arg[2] != -1) ++(tup[ tup[t].arg[2] ].trefs);

	/* nothing more to release for operand-less tuples */
	if (tup[t].arg[0] == -1) return;

	/* free the destination register */
	++(tup[ tup[t].arg[0] ].trefs);
	if (tup[t].antidep != -1) {
		tup[ tup[t].arg[0] ].reg = tup[t].oreg;
	}
}


inline int
sched_eval(void)
{
	/* Record the candidate schedule as the new best when none has
	   been recorded yet or when it is strictly cheaper.
	   Returns 1 when the search should stop: the deadline schedtim
	   has passed, or the best value is 0 and cannot be improved.
	   Returns 0 otherwise.
	*/

	register int slot;
	int improved = ((sbestval == NOTFOUND) || (schedval < sbestval));

	if (improved) {
		slot = 0;
		while (slot < schedsp) {
			sbest[slot] = sched[slot];
			++slot;
		}
		sbestval = schedval;
#ifdef NOTDEFD
/* I don't think this should be done... */
		schedtim = (time(0) + opttime);
#endif
	}

	/* Out of time, or as good as it gets? */
	return((time(0) >= schedtim) || (sbestval == 0));
}

inline int
sched_pos(register int i)
{
	/* Return the schedule slot that holds tuple i.
	   If tuple i is not in sched[0..schedsp-1], report it through
	   bug() and return -1.
	*/

	register int slot = 0;
	char msg[64];

	while (slot < schedsp) {
		if (sched[slot] == i) {
			return(slot);
		}
		++slot;
	}

	snprintf(msg, 64, "sched_pos could not find instruction %d "
			  "(this cannot happen?)", i);
	bug(msg);
	return(-1);
}

static sym *
conflict_symbol(int t)
{
	/* Symbol touched by memory tuple t.  The register-addressed
	   forms LOADR/STORER take it from their address operand arg[0];
	   every other op uses the tuple's own symbol field.
	*/
	switch (tup[t].op) {
	case LOADR:
	case STORER:
		return tup[ tup[t].arg[0] ].symbol;
	default:
		return tup[t].symbol;
	}
}

static int
possible_conflict(register int i,
register int j)
{
	/* Returns 1 if tuples i and j may be memory accesses of
	   the same fragment of the same vector; 0 otherwise.
	   This assumes that i!=j.

	   Only LOAD, LOADR, STORE and STORER are examined as tuple i;
	   any other op for i yields 0.  Tuple j may additionally be
	   LOADRR or STORERR.
	*/

	/* Is i one of the access kinds that are checked at all? */
	switch (tup[i].op) {
	case LOAD:
	case LOADR:
	case STORE:
	case STORER:
		break;
	default:
		return 0;
	}

	/* Is j a memory access? */
	switch (tup[j].op) {
	case LOAD:
	case LOADR:
	case LOADRR:
	case STORE:
	case STORER:
	case STORERR:
		break;
	default:
		return 0;
	}

	/* LOADs are reused unless the fragments or offsets differ,
	   so two plain LOADs never conflict */
	if ((tup[i].op == LOAD) && (tup[j].op == LOAD)) {
		return 0;
	}

	/* A plain LOAD paired with a plain STORE conflicts only if the
	   fragments match as well as the symbols */
	if (((tup[i].op == LOAD) && (tup[j].op == STORE)) ||
	    ((tup[i].op == STORE) && (tup[j].op == LOAD))) {
		if (tup[i].fragment != tup[j].fragment) {
			return 0;
		}
	}

	/* Everything else: assume conflict if symbols match */
	return ((conflict_symbol(i) == conflict_symbol(j)) ? 1 : 0);
}

inline int
sched_ok(register int i,
register int pos)
{
	/* Returns 1 if the tuple currently in schedule slot i may be
	   placed in slot pos.
	   Returns 0 otherwise.
	*/

	register int j, k;
	register int me = sched[i];	/* the tuple being placed */
	int a;

	/* Do the quick before and after checks */
	if (pos < tup[me].before) return(0);
	if (pos >= (schedsp - tup[me].after)) return(0);

	/* Are the immediate predecessors scheduled before this? */
	for (a = 0; a < 3; ++a) {
		k = tup[me].arg[a];
		if ((k != -1) && (sched_pos(k) >= pos)) return(0);
	}

	/* If this is a store, is anti-dependence ok? */
	if (tup[me].op == STORE) {
		/* no load of the same object should follow this */
		for (j=i; j<schedsp; ++j) {
			register int other = sched[j];

			/* Is this an explicit load of the same object? */
			if ((tup[other].op == LOAD) &&
			    (tup[other].symbol == tup[me].symbol) &&
			    (tup[other].fragment == tup[me].fragment) &&
			    (tup[other].offset == tup[me].offset)) {
				return(0);
			}

			/* Is this an m2r (implicit) load of the same object? */
			if ((tup[other].m2rmode != -1) &&
			    (tup[ tup[other].m2rmode ].symbol ==
			     tup[me].symbol) &&
			    (tup[ tup[other].m2rmode ].fragment ==
			     tup[me].fragment) &&
			    (tup[ tup[other].m2rmode ].offset ==
			     tup[me].offset)) {
				return(0);
			}
		}
	}

	/* Don't schedule this tuple ahead of any previous tuple with which
	   it may have a conflict.  Effectively, this means that no lower-
	   numbered conflicting tuple should follow this in the schedule. */
	for (j=i+1; j<schedsp; ++j) {
		if (me <= sched[j]) continue;
		if (possible_conflict(me, sched[j])) return(0);
	}

	/* Must be ok...? */
	return(1);
}

inline int
simcost(register int s)
{
	/* Return the estimated cost of tuple s when its op is simulated
	   by the target-specific header file: DIV and MOD always, MUL
	   only for 32-bit fields.  Returns 0 for everything else.
	   Without the CPU_2PIPE flag an odd cost is rounded up to the
	   next even value.
	*/
	register int cost = 0;

	if ((tup[s].op == DIV) || (tup[s].op == MOD)) {
		/* Approximate cost as per serial code estimate */
		cost = 2 + 3 * (64 / bitsperfield(tup[s].type.bits));
	} else if ((tup[s].op == MUL) &&
		   (bitsperfield(tup[s].type.bits) == 32)) {
		/* This is also faked by serial code */
		cost = 2 + 3 * 2;
	}

	if (((optcpu & CPU_2PIPE) == 0) && (cost & 1)) {
		++cost;
	}
	return(cost);
}

inline int
sched_class(int op)
{
	/* Map an op to its MMX internal resource class, used to estimate
	   conflicts and hence performance.  sched_cost() ANDs the classes
	   of neighbouring ops together, so the values act as bit masks.
	   An op that is not listed is reported through bug() and gets
	   class 0.
	*/
	int cls;

	switch (op) {
	case MUL:
	case MULH:
	case MULEVEN:
	case MULODD:
		cls = 1;
		break;

	case SHL:
	case SHLBIT:
	case SHLBYTE:
	case SHR:
	case SHRBIT:
	case SHRBYTE:
	case PACK:
	case PACKS2U:
	case UNPACKH:
	case UNPACKL:
	case INTRLVLOW:
	case INTRLVHIGH:
	case INTRLVEVEN:
	case INTRLVODD:
	case PERM:
	case TPERM:
	case REPL:
		cls = 2;
		break;

	case LOAD:
	case LOADR:
	case LOADRR:
	case LVSL:
	case NUM:
		cls = 4;
		break;

	case MIN:
	case MAX:
		cls = 8;
		break;

	case AND:
	case ANDN:
	case OR:
	case XOR:
	case NOT:
	case ADD:
	case ADDH:
	case EQ:
	case EQ_C:
	case GE:
	case GT:
	case GT_C:
	case SUB:
	case NEG:
	case LEA:
	case STORE:
	case STORER:
	case STORERR:
	case AVG:
	case DIV:
	case MOD:
	case I2F:
	case F2I:
	case RCP:
	case RCP1:
	case RCP2:
		cls = 0;
		break;

	default:
		{
			char msg[64];
			snprintf(msg,
				64,
				"sched_class op=%s (this cannot happen?)",
				opname(op));
			bug(msg);
		}
		cls = 0;
		break;
	}

	return(cls);
}

inline int
sched_cost(register int pos)
{
	/* Compute the approximate cost of placing sched[pos] in slot pos.

	   A simulated op (simcost() > 0) costs just that.  Otherwise
	   the cost is the sum of:
	     - a penalty when this op and the previous one share a
	       resource class (only with CPU_2PIPE),
	     - 4 for MAX when optcpu has CPU_XMMX,
	     - the largest operand stall over arg[0..2], where a stall
	       is the producer's latency divided by the distance in
	       slots to the producer,
	     - a penalty for the register move implied by antidep.

	   The stall of every operand takes part in the maximum.  The
	   earlier code stored the arg[2] and arg[1] estimates in the
	   same variable, so the third operand of a trinary op was
	   silently dropped.
	*/
	register int val = 0;
	register int worst = 0;		/* largest operand stall */
	register int s = sched[pos];
	int a;

	/* Is this an expensive simulated op? */
	if ((val = simcost(s)) > 0) {
		/* Simulated instructions are S L O W...
		   other delays are negligible
		*/
		return(val);
	}

	/* Two same-class ops in a row? */
	if ((pos > 0) &&
	    ((sched_class(tup[s].op) &
	      sched_class(tup[ sched[pos - 1] ].op)))) {
		/* Expected cost of 0-1/2 clock */
		val += ((optcpu & CPU_2PIPE) ? 1 : 0);
	}

	/* Cyrix MAX instruction takes 2 clocks */
	if ((optcpu & CPU_XMMX) &&
	    (tup[s].op == MAX)) {
		val += 4;
	}

	/* Where are the operands coming from? */
	for (a = 2; a >= 0; --a) {
		int src;	/* slot of the producing tuple */
		int dist;	/* latency of the producing op */
		int stall;

		if (tup[s].arg[a] == -1) continue;

		src = sched_pos(tup[s].arg[a]);

		/* sched_pos() has already reported a missing producer;
		   a producer that is not strictly earlier would index
		   sched[] out of range or divide by zero below */
		if ((src < 0) || (src >= pos)) continue;

		switch (tup[ sched[src] ].op) {
		case LOAD:
		case LOADR:
		case LOADRR:
		case LVSL:
		case NUM:
			dist = 4;
			break;
		case MUL:
			dist = ((optcpu == K6_2) ? 4 : 6);
			break;
		default:
			dist = 2;
			break;
		}

		stall = (dist / (pos - src));
		if (((optcpu & CPU_2PIPE) == 0) && (stall & 1)) ++stall;

		if (stall > worst) worst = stall;
	}
	val += worst;

	/* Also have to give a penalty for inserted movq */
	if ((tup[s].antidep != -1) &&
	    (tup[s].op != STORE) &&
	    (tup[s].op != STORER) &&
	    (tup[s].op != STORERR)) {
		/* 1/2-1 clock penalty */
		val += ((optcpu & CPU_2PIPE) ? 1 : 2);
	}

	return(val);
}


/* Countdown used by sched_perm() so that the clock is read only once
   every 2*schedsp calls rather than on every call */
static int perm_count = 0;

static int
sched_perm(register int pos)
{
	/* Recursively build a schedule by trying, for slot pos, each
	   tuple still held in slots pos..schedsp-1.

	   Returns 0 if the partial schedule is already no cheaper than
	   the current best.
	   If the schedule is complete (pos >= schedsp), returns the
	   stop flag from sched_eval(): 1 if the search should stop
	   (timed out, or best value 0), 0 to keep searching.
	   If the schedule is not complete:
		return 1 if timed-out
		return 1 if a subschedule asked to stop
		return 0 if every candidate for this slot was tried

	   A return of 1 propagates straight up without undoing the
	   placements made so far.
	*/

	register int i;

	#ifdef DEBUG_SCHED
		fprintf(stderr, "Start sched_perm(%d):\n", pos);
		fprintf(stderr, " sbestval=%d, schedval=%d schedsp=%d, "
				"perm_count=%d, schedtim=%d\n",
				sbestval, schedval, schedsp, perm_count,
				schedtim);
	#endif

	/* If we already have a best schedule,
	   and this one isn't better, return 0. */
	if ((sbestval != NOTFOUND) &&
	    (schedval >= sbestval)) return(0);

	/* If a complete schedule, evaluate it */
	if (pos >= schedsp) {
		/* Have a complete schedule */
		return(sched_eval());
	}

	/* Timeout if needed...  perm_count rations the time(0) calls:
	   the clock is consulted once, then skipped for the next
	   2*schedsp calls. */
	if (--perm_count < 0) {
		perm_count = 2 * schedsp;
		if (time(0) >= schedtim) {
			return(1);
		}
	}

	/* Hack to improve scheduling...
	   if next instruction depends on previous,
	   swap it with next op in perm order
	*/
	if ((pos > 1) &&
	    ((pos + 1) < schedsp) &&
	    ((tup[sched[pos]].arg[0] == sched[pos-1]) ||
	     (tup[sched[pos]].arg[1] == sched[pos-1]) ||
	     (tup[sched[pos]].arg[2] == sched[pos-1]))) {
		i = sched[pos];
		sched[pos] = sched[pos+1];
		sched[pos+1] = i;
	}

	/* Only a partial schedule, continue building */
	for (i=pos; i<schedsp; ++i) {
		if (sched_ok(i, pos)) {
			register int j;
			register int k;

			/* Tentatively move the candidate into slot pos */
			j = sched[i];
			sched[i] = sched[pos];
			sched[pos] = j;

			/* Sometime I should rewrite this to handle trinaries
			   intelligently. */
			if ((tup[ sched[pos] ].arg[1] != -1) &&
			    (!ordered(tup[ sched[pos] ].op)) &&
			    (tup[ tup[ sched[pos] ].arg[0] ].trefs >
			     tup[ tup[ sched[pos] ].arg[1] ].trefs)) {
				/* normalize arg order to make the first
				   arg the one with the lowest reference
				   count, hence least likely to need a
				   new register when this clobbers it
				*/
				register int t;
				t = tup[ sched[pos] ].arg[0];
				tup[ sched[pos] ].arg[0] =
					tup[ sched[pos] ].arg[1];
				tup[ sched[pos] ].arg[1] = t;
			}

			/* if we allocated registers ok, recurse */
			if (!argregs(pos)) {
				k = sched_cost(pos);
				schedval += k;

				/* If we recurse successfully, return success */
				if (sched_perm(pos + 1)) return(1);

				/* Otherwise, backtrack */
				schedval -= k;
			}

			/* If we are here, we must undo this attempt */

			/* undo the register allocation we did; note that
			   this also runs when argregs() reported failure */
			unregs(pos);

			/* Put the candidate back in slot i.  The operand
			   swap above is not undone. */
			sched[pos] = sched[i];
			sched[i] = j;
		}
	}

	/* If we got here, we failed */
	return(0);
}

static char *
symname(sym *s,
int frag,
int off)
{
	/* Format the C expression that addresses one fragment of
	   symbol s: a p<bits>_t pointer built from the symbol's address
	   plus byte offset "off", indexed by the immediate value held
	   in tuple "frag".  The symbol name gets a leading "&" unless
	   its scope is 1.

	   The result lives in a static buffer that the next call
	   overwrites.  A null s or s->text is reported through bug()
	   and yields "".
	*/
	static char symnamebuf[256];
	const char *addrof;

	#ifdef DEBUG_SYMNAME
		fflush(Cout);
		fprintf(stderr,
			"symname(): s=%p s->text=%s fragment=%d offset=%d\n",
			s,
			(s)? ((s->text)? s->text : "(null)") : "-",
			frag,
			off );
		fflush(stderr);
	#endif

	if ( !s ) {
		bug ("s is null in symname()");
		return "";
	}
	if ( !(s->text) ) {
		bug ("s->text is null in symname()");
		return "";
	}

	addrof = ((s->scope == 1) ? "" : "&");

	snprintf(symnamebuf,
		sizeof(symnamebuf),
		"*(((p%d_t *) ((char *)%s%s %+d)) + %llu)",
		bitsperfrag(),
		addrof,
		s->text,
		off,
		tup[frag].immed.uq[0]);
	return(symnamebuf);
}

static char *
show_suf2(int i)
{
	/* Mnemonic suffix for a field width of i bits.  Widths 1, 2, 4
	   and 8 map the same way on every target; 16 and 32 and the
	   fallback depend on whether optcpu has the CPU_AltiVec bit.
	*/
	int altivec = ((optcpu & CPU_AltiVec) != 0);

	switch (i) {
	case 1:		return("1");
	case 2:		return("2");
	case 4:		return("4");
	case 8:		return("b");
	case 16:	return(altivec ? "h" : "w");
	case 32:	return(altivec ? "w" : "d");
	case 64:	return("q");
	default:	return(altivec ? "" : "q");
	}
}

static char *
show_suf(int i)
{
	/* Mnemonic suffix for the field width of tuple i */
	int width = tup[i].type.bits;

	return(show_suf2(width));
}

static char *
ia32regname(int regnum)
{
	/* Format register number regnum as "reg[<regnum>]".

	   Returns a pointer to a static buffer that the next call
	   overwrites, so the caller must consume the text before
	   calling again.

	   The buffer holds "reg[" + any int (11 characters at most,
	   sign included) + "]" + NUL.  The old 8-byte limit cut
	   three-digit register numbers down to "reg[100".
	*/
	static char buf[24];
	int rval = snprintf ( buf, sizeof(buf), "reg[%d]", regnum );

	#ifdef DEBUG_CODEGEN
	    fprintf(Cout,
		"\n/* ia32regname(%d) buf is %s; return value is %d*/\n",
		regnum,
		buf,
		rval);
	#else
	    (void) rval;	/* only reported in debug builds */
	#endif

	return buf;
}

static void
ia32op(register int s)
{
	/* Write to Cout the IA32 macro call(s) that implement tuple s.

	   If the first operand was renamed (antidep set), a movl_r2r
	   copy is written first and the operand tuple's register
	   becomes antidep.  tup[s].reg is set to the register that
	   receives the result.  The second operand comes from memory
	   or an immediate when m2rmode is set, from a register
	   otherwise.

	   Ops this target cannot express are reported through bug().
	*/
	register int r0 = -1;
	register int r1 = -1;
	/* Mnemonic.  Starts empty so that the output code below never
	   prints an indeterminate buffer if bug() returns for an
	   unsupported op.  Much larger than it needs to be.
	*/
	char buf[256] = "";

	/* Get first arg, with possible rename */
	if (tup[s].arg[0] != -1) {
		r0 = tup[ tup[s].arg[0] ].reg;

		if (tup[s].antidep != -1) {
			/* Arg was renamed; output a move */
			r0 = tup[s].reg;
			Ctab();

			/* Note ia32regname returns a pointer to a
			   static buffer, so we need to use its value
			   before calling it a second time.  Thus the
			   separate fprintf()s.
			*/
			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */ ",__LINE__);
			#endif
			fprintf(Cout, "movl_r2r(%s,", ia32regname(r0));
			fprintf(Cout,
				" %s);\n",
				ia32regname(tup[s].antidep));
			tup[ tup[s].arg[0] ].reg = tup[s].antidep;
		}
	}

	/* Get second arg */
	if (tup[s].arg[1] != -1) {
		r1 = tup[ tup[s].arg[1] ].reg;
	}

	/* Where does result go? */
	tup[s].reg = r0;

	Ctab();
	switch (tup[s].op) {
	case F2I:
		bug("float->int cast not available for this target");
		break;
	case I2F:
		bug("int->float cast not available for this target");
		break;
	case RCP:
		bug("float reciprocal not available for this target");
		break;
	case RCP1:
		bug("float reciprocal not available for this target");
		break;
	case RCP2:
		bug("float reciprocal not available for this target");
		break;
	case ADD:
		if (tup[s].type.attr & TYP_FLOAT) {
			bug("float ADD not available for this target");
		} else {
			sprintf(buf, "addl");
		}
		break;
	case SUB:
		if (tup[s].type.attr & TYP_FLOAT) {
			bug("float SUB not available for this target");
		} else {
			sprintf(buf, "subl");
		}
		break;
	case MUL:
		if (tup[s].type.attr & TYP_FLOAT) {
			bug("float MUL not available for this target");
		} else {
			sprintf(buf, "mulll");
		}
		break;
	case MULH:
		if (tup[s].type.attr & TYP_FLOAT) {
			bug("float MULH not available for this target");
		} else if (tup[s].type.attr & TYP_UNSIGN) {
			sprintf(buf, "mulhul");
		} else {
			sprintf(buf, "mulhl");
		}
		break;
	case DIV:
		snprintf(buf,
			sizeof(buf),
			"div%sl",
			((tup[s].type.attr & TYP_UNSIGN) ? "u" : ""));
		break;
	case MOD:
		snprintf(buf,
			sizeof(buf),
			"mod%sl",
			((tup[s].type.attr & TYP_UNSIGN) ? "u" : ""));
		break;
	case AVG:
		bug("AVG instruction not available for this target");
		break;
	case MAX:
		bug("MAX instruction not available on this target");
		break;
	case MIN:
		bug("MIN instruction not available for this target");
		break;
	case AND:
		sprintf(buf, "andl");
		break;
	case ANDN:
		sprintf(buf, "andnl");
		break;
	case OR:
		sprintf(buf, "orl");
		break;
	case XOR:
		sprintf(buf, "xorl");
		break;
	case EQ:
		bug("EQ instruction not available for this target");
		break;
	case EQ_C: /* Compare for EQ, with C-like result */
		sprintf(buf, "cmpeql");
		break;

	case GT:
		bug("GT instruction not available for this target");
		break;
	case GT_C: /* Compare for GT, with C-like result */
		if (tup[s].type.attr & TYP_UNSIGN) {
			sprintf(buf, "cmpgtul");
		} else {
			sprintf(buf, "cmpgtl");
		}
		break;

	case GE:
		bug("GE instruction not available for this target");
		break;

	case SHL:
		sprintf(buf, "slll");
		break;

	case SHLBIT:
		bug("SHLBIT instruction not available for this target");
		break;

	case SHLBYTE:
		bug("SHLBYTE instruction not available for this target");
		break;

	case SHR:
		/* Logical shift for unsigned or untyped values,
		   arithmetic shift otherwise */
		sprintf(buf, "sr%cl",
			(tup[s].type.attr & TYP_UNSIGN)?
			    'l' :
			    (tup[s].type.attr == TYP_NULL)? 'l' : 'a');
		break;

	case SHRBIT:
		bug("SHRBIT instruction not available for this target");
		break;

	case SHRBYTE:
		bug("SHRBYTE instruction not available for this target");
		break;

	case PACKS2U:
		bug("PACKS2U instruction not available for this target");
		break;

	case PACK:
		bug("PACK instruction not available for this target");
		break;

	case UNPACKH:
		bug("UNPACKH instruction not available for this target");
		break;

	case UNPACKL:
		bug("UNPACKL instruction not available for this target");
		break;

	case INTRLVLOW:
		bug("PUNPCKL instruction not available for this target");
		break;
	case INTRLVHIGH:
		bug("PUNPCKH instruction not available for this target");
		break;

	case INTRLVEVEN:
		bug("INTRLVEVEN instruction not available for this target");
		break;

	case INTRLVODD:
		bug("INTRLVODD instruction not available for this target");
		break;

	case PERM:
		bug("PERM instruction not available for this target");
		break;

	case LEA:
		snprintf(buf, sizeof(buf), "addl");
		break;

	case LOADR:
		snprintf(buf, sizeof(buf), "movl");
		break;

	default:
		{
			/* Separate name: must not shadow the mnemonic */
			char msg[64];
			snprintf(msg,
				sizeof(msg),
				"ia32op op=%s (this cannot happen?)",
				opname(tup[s].op));
			bug(msg);
		}
		return;
	}

	/* Output the arguments */
	#ifdef DEBUG_m2rmode
		fprintf(Cout, "/* m2rmode is %d */", tup[s].m2rmode);
	#endif
	if (tup[s].m2rmode != -1) {
		if (tup[ tup[s].m2rmode ].op == LOAD) {
			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */ ",__LINE__);
			#endif
			fprintf(Cout, "%s_m2r(%s, %s);",
				buf,
				symname(tup[ tup[s].m2rmode ].symbol,
					tup[ tup[s].m2rmode ].fragment,
					tup[ tup[s].m2rmode ].offset),
				ia32regname(r0));
		} else {
			/* an immediate from the constant pool */
			switch (tup[s].op) {
			case SHL:
			case SHR:
			case PERM:
				/* do not need constant pool for 8-bit
				   immediates, but this only works for shifts
				   and adds a byte to the opcode, so it is not
				   as clear a win as one might hope...  anyway,
				   use it if we can
				*/
				#ifdef DEBUG_CODEGEN
					fprintf(Cout, "/* %d */ ", __LINE__);
				#endif

				/* Kept as two fprintf()s, matching the other
				   ia32regname() call sites in this function.
				*/
				fprintf(Cout, "%s_i2r(%u, ",
					buf,
					(tup[tup[s].m2rmode].immed.ud[0]
					 & 0xffU));
				fprintf(Cout, "%s);", ia32regname(r0));
				break;
			default:
				/* load frag-sized immediates via the constant
				   pool
				*/
				cpoolenter(tup[ tup[s].m2rmode ].immed);

				#ifdef DEBUG_CODEGEN
					fprintf(Cout,"/* %d */ ",__LINE__);
				#endif

				fprintf(Cout, "%s_m2r(%s, %s);",
					buf,
					cpoolname(tup[ tup[s].m2rmode ].immed),
					ia32regname(r0));
			}
		}
	} else {
		if (tup[s].op == LEA) {
			/* Generate the address */
			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */ ",__LINE__);
			#endif
			fprintf(Cout,
				"%s_m2r(((p32_t)((unsigned int)%s)), %s);",
				buf,
				tup[s].symbol->text,
				ia32regname(r0));
		} else if (tup[s].op == LOADR) {
			/* Generate the load using address in register */
			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */ ",__LINE__);
			#endif
			/* Both ia32regname() calls format the same register,
			   so sharing its static buffer is harmless here. */
			fprintf(Cout,
				"%s_x2r(%s, %s);",
				 buf,
				 ia32regname(r0),
				 ia32regname(r0));
		} else {
			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */ ",__LINE__);
			#endif

			/* Note ia32regname returns a pointer to a
			   static buffer, so we need to use its value
			   before calling it a second time.  Thus the
			   separate fprintf()s.
			*/
			fprintf(Cout, "%s_r2r(%s,", buf, ia32regname(r1));
			fprintf(Cout, " %s);", ia32regname(r0));
		}
	}
	fprintf(Cout, "\n");
}

static void
mmxop(register int s)
{
	register int r0 = -1;
	register int r1 = -1;
	char buf[256];		/* This is much larger than it needs to be */

	/* Get first arg, with possible rename */
	if (tup[s].arg[0] != -1) {
		r0 = tup[ tup[s].arg[0] ].reg;

		if (tup[s].antidep != -1) {
			/* Arg was renamed; output a move */
			r0 = tup[s].reg;
			Ctab();

			if ( (r0 < regsavail()) &&
			     (tup[s].antidep < regsavail()) ) {
				/* Neither spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout, "/* %d */ ", __LINE__);
				#endif
				fprintf(Cout,
					"movq_r2r(mm%d, mm%d);\n",
					r0,
					tup[s].antidep);
			} else if ( (r0 >= regsavail()) &&
			     (tup[s].antidep < regsavail()) ) {
				/* Source spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout, "/* %d */ ", __LINE__);
				#endif
				fprintf(Cout,
					"movq_m2r(%s, mm%d);\n",
					spoolname(r0),
					tup[s].antidep);
			} else if ( (r0 < regsavail()) &&
			     (tup[s].antidep >= regsavail()) ) {
				/* Destination spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout, "/* %d */ ", __LINE__);
				#endif
				fprintf(Cout,
					"movq_r2m(mm%d, %s);\n",
					r0,
					spoolname(tup[s].antidep));
			} else if ( (r0 >= regsavail()) &&
			     (tup[s].antidep >= regsavail()) ) {
				/* Source and destination spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout, "/* %d */ ", __LINE__);
				#endif

				/* Save mm0 */
				fprintf(Cout,
					"movq_r2m(mm0, %s);\n",
					spoolname(regsavail()-1));
				/* Move source into mm0 */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, mm0);\n",
					spoolname(r0));
				/* Move source into destination */
				Ctab();
				fprintf(Cout,
					"movq_r2m(mm0, %s);\n",
					spoolname(tup[s].antidep));
				/* Restore mm0 */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, mm0);\n",
					spoolname(regsavail()-1));
			}
			tup[ tup[s].arg[0] ].reg = tup[s].antidep;
		}
	}

	/* Get second arg */
	if (tup[s].arg[1] != -1) {
		r1 = tup[ tup[s].arg[1] ].reg;
	}

	/* Where does result go? */
	tup[s].reg = r0;

	Ctab();
	switch (tup[s].op) {
	case F2I:
		if ((optcpu & CPU_athlon) || (optcpu & CPU_3DNow)) {
			sprintf(buf, "pf2i%s", show_suf(s));
			r1 = r0;
		} else {
			bug("float->int cast available only for (ext'd)3DNow!");
		}
		break;
	case I2F:
		if ((optcpu & CPU_athlon) || (optcpu & CPU_3DNow)) {
			sprintf(buf, "pi2f%s", show_suf(s));
			r1 = r0;
		} else {
			bug("int->float cast available only for (ext'd)3DNow!");
		}
		break;
	case RCP:
		if (optcpu & CPU_3DNow) {
			sprintf(buf, "pfrcp");
			r1 = r0;
		} else {
			bug("float reciprocal available only for 3DNow!");
		}
		break;
	case RCP1:
		if (optcpu & CPU_3DNow) {
			sprintf(buf, "pfrcpit1");
		} else {
			bug("float reciprocal available only for 3DNow!");
		}
		break;
	case RCP2:
		if (optcpu & CPU_3DNow) {
			sprintf(buf, "pfrcpit2");
		} else {
			bug("float reciprocal available only for 3DNow!");
		}
		break;

	case ADD:
		if (tup[s].type.attr & TYP_FLOAT) {
			if (optcpu & CPU_3DNow) {
				sprintf(buf, "pfadd");
			} else {
				bug("float ADD available only for 3DNow!");
			}
		} else {
			if (optcpu & CPU_MMX) {
				snprintf(buf, 256, "padd%s%s%s",
					 ((tup[s].type.attr & TYP_SAT) &&
					  (tup[s].type.attr & TYP_UNSIGN)
					 )?  "u": "",
					 (tup[s].type.attr & TYP_SAT)? "s": "",
					 show_suf(s));
			} else {
				bug("ADD instruction not available for this "
				    "target");
			}
		}
		break;
	case SUB:
		if (tup[s].type.attr & TYP_FLOAT) {
			if (optcpu & CPU_3DNow) {
				sprintf(buf, "pfsub");
			} else {
				bug("float SUB available only for 3DNow!");
			}
		} else {
			if (optcpu & CPU_MMX) {
				snprintf(buf, 256, "psub%s%s%s",
					 ((tup[s].type.attr & TYP_SAT) &&
					  (tup[s].type.attr & TYP_UNSIGN)
					 )?  "u": "",
					 (tup[s].type.attr & TYP_SAT)? "s": "",
					 show_suf(s));
			} else {
				bug("SUB instruction not available for this "
				    "target");
			}
		}
		break;
	case MUL:
		if (tup[s].type.attr & TYP_FLOAT) {
			if (optcpu & CPU_3DNow) {
				sprintf(buf, "pfmul");
			} else {
				bug("float MUL available only for 3DNow!");
			}
		} else {
			if (optcpu & CPU_MMX)
				snprintf(buf, 256, "pmull%s", show_suf(s));
			else
				bug("MUL instruction not available for this "
				    "target");
		}
		break;
	case MULH:
		if (tup[s].type.attr & TYP_FLOAT) {
			bug("float MULH instruction not available for this "
			    "target");
		} else if (tup[s].type.attr & TYP_UNSIGN) {
			if ((optcpu & CPU_MMX) || (optcpu & CPU_athlon))
				snprintf(buf, 256, "pmulhu%s", show_suf(s));
			else
				bug("MULH instruction not available for this "
				    "target");
		} else {
			if (optcpu & CPU_MMX)
				snprintf(buf, 256, "pmulh%s", show_suf(s));
			else
				bug("MULH instruction not available for this "
				    "target");
		}
		break;
	case DIV:
		if (optcpu & CPU_MMX)
			snprintf(buf,
				256,
				"pdiv%s%s",
				((tup[s].type.attr & TYP_UNSIGN) ? "u" : ""),
				show_suf(s));
		else
			bug("DIV instruction not available for this target");
		break;
	case MOD:
		if (optcpu & CPU_MMX)
			snprintf(buf,
				256,
				"pmod%s%s",
				((tup[s].type.attr & TYP_UNSIGN) ? "u" : ""),
				show_suf(s));
		else
			bug("MOD instruction not available for this target");
		break;
	case AVG:
		if (optcpu & CPU_athlon) {
			sprintf(buf, "pavg%s", show_suf(s));
		} else if (optcpu & CPU_XMMX) {
			sprintf(buf, "pave%s%s%s",
				((tup[s].type.attr & TYP_UNSIGN) ? "u" : ""),
				((tup[s].type.attr & TYP_SAT) ? "s" : ""),
				show_suf(s));
		} else if (optcpu & CPU_3DNow) {
			sprintf(buf, "pavg%s%s%s",
				((tup[s].type.attr & TYP_UNSIGN) ? "us" : ""),
				((tup[s].type.attr & TYP_SAT) ? "s" : ""),
				show_suf(s));
		} else {
			bug("AVG instruction not available for this target");
		}
		break;

	case MAX:
		if (tup[s].type.attr & TYP_FLOAT) {
			if (optcpu & CPU_3DNow) {
				sprintf(buf, "pfmax");
			} else {
				bug("float MAX available only for 3DNow!");
			}
		} else {
			if (optcpu & CPU_XMMX) {
				sprintf(buf, "pmag%s", show_suf(s));
			} else if (optcpu & CPU_athlon) {
				if (!(tup[s].type.attr & TYP_UNSIGN) &&
				     (bitsperfield(tup[s].type.bits)==16)) {
					sprintf(buf, "pmaxsw");
				} else if ((tup[s].type.attr & TYP_UNSIGN) &&
					  (bitsperfield(tup[s].type.bits)==8)) {
						sprintf(buf, "pmaxub");
				}
			} else {
				bug("MAX instruction not available on this "
				    "target");
			}
		}
		break;
	case MIN:
		if (tup[s].type.attr & TYP_FLOAT) {
			if (optcpu & CPU_3DNow) {
				sprintf(buf, "pfmin");
			} else {
				bug("float MIN available only for 3DNow!");
			}
		} else {
			if (optcpu & CPU_athlon) {
				if (!(tup[s].type.attr & TYP_UNSIGN) &&
				     (bitsperfield(tup[s].type.bits)==16)) {
					sprintf(buf, "pminsw");
				} else if ((tup[s].type.attr & TYP_UNSIGN) &&
					  (bitsperfield(tup[s].type.bits)==8)) {
						sprintf(buf, "pminub");
				}
			} else {
				bug("MIN instruction not available for this "
				    "target");
			}
		}
		break;
	case AND:
		sprintf(buf, "pand");
		break;
	case ANDN:
		sprintf(buf, "pandn");
		break;
	case OR:
		sprintf(buf, "por");
		break;
	case XOR:
		sprintf(buf, "pxor");
		break;

	case EQ:
		if (tup[s].type.attr & TYP_FLOAT) {
			if (optcpu & CPU_3DNow) {
				sprintf(buf, "pfcmpeq");
			} else {
				bug("float CMPEQ available only for 3DNow!");
			}
		} else {
			snprintf(buf, 256, "pcmpeq%s", show_suf(s));
		}
		break;

	case EQ_C: /* Compare for EQ, with C-like result */
		bug("EQ_C instruction not available for this target");
		break;

	case GT:
		if (tup[s].type.attr & TYP_FLOAT) {
			if (optcpu & CPU_3DNow) {
				sprintf(buf, "pfcmpgt");
			} else {
				bug("float CMPGT available only for 3DNow!");
			}
		} else {
			snprintf(buf, 256, "pcmpgt%s", show_suf(s));
		}
		break;

	case GT_C: /* Compare for GT, with C-like result */
		bug("GT_C instruction not available for this target");
		break;

	case GE:
		if (tup[s].type.attr & TYP_FLOAT) {
			if (optcpu & CPU_3DNow) {
				sprintf(buf, "pfcmpge");
			} else {
				bug("float CMPGE available only for 3DNow!");
			}
		} else {
			bug("GE instruction not available for this target");
		}
		break;

	case SHL:
		snprintf(buf, 256, "psll%s", show_suf(s));
		break;

	case SHLBIT:
		bug("SHLBIT instruction not available for this target");
		break;

	case SHLBYTE:
		bug("SHLBYTE instruction not available for this target");
		break;

	case SHR:
		snprintf(buf, 256, "psr%c%s",
			(tup[s].type.attr & TYP_UNSIGN)?
			    'l' :
			    (tup[s].type.attr == TYP_NULL)? 'l' : 'a',
			show_suf(s));
		break;

	case SHRBIT:
		bug("SHRBIT instruction not available for this target");
		break;

	case SHRBYTE:
		bug("SHRBYTE instruction not available for this target");
		break;

	case PACKS2U:
		if (tup[s].type.attr & TYP_SAT) {
			snprintf(buf,
				256,
				"packus%s%s",
				show_suf2(
				  bitsperfield(tup[s].type.bits) * 2),
				show_suf(s));
		} else {
			bug("Generated PACKS2U is incorrect for modular types");
		}
		break;

	case PACK:
		if (tup[s].type.attr & TYP_SAT) {
			if (tup[s].type.attr & TYP_UNSIGN) {
				bug("Generated PACK is incorrect for unsigned "
				    "types");
			} else {
				snprintf(buf,
					256,
					"packss%s%s",
					show_suf2(
					  bitsperfield(tup[s].type.bits) * 2),
					show_suf(s));
			}
		} else {
			bug("Generated PACK is incorrect for modular types");
		}
		break;

	case UNPACKH:
		bug("UNPACKH instruction not available for this target");
		break;

	case UNPACKL:
		bug("UNPACKL instruction not available for this target");
		break;

	case INTRLVLOW:
		snprintf(buf,
			256,
			"punpckl%s%s",
			show_suf2(bitsperfield(tup[s].type.bits) / 2),
			show_suf(s));
		break;
	case INTRLVHIGH:
		snprintf(buf,
			256,
			"punpckh%s%s",
			show_suf2(bitsperfield(tup[s].type.bits) / 2),
			show_suf(s));
		break;

	case INTRLVEVEN:
		bug("INTRLVEVEN instruction not available for this target");
		break;

	case INTRLVODD:
		bug("INTRLVODD instruction not available for this target");
		break;

	case PERM:
		if (optcpu & CPU_athlon) {
			if (bitsperfield(tup[s].type.bits)==16) {
				snprintf(buf, 256, "pshufw");
			} else {
				bug("Athlon PSHUFW for 16 bits only");
			}
		} else {
			bug("PERM instruction not available for this target");
		}
		break;

	case LEA:
		snprintf(buf, 256, "paddd");
		break;

	case LOADR:
		snprintf(buf, 256, "movq");
		break;

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"mmxop op=%s (this cannot happen?)",
				opname(tup[s].op));
			bug(buf);
		}
		return;
	}


	/* Output the arguments */
	#ifdef DEBUG_m2rmode
		fprintf(Cout, "/* m2rmode is %d */", tup[s].m2rmode);
	#endif
	if (tup[s].m2rmode != -1) {
		if (tup[ tup[s].m2rmode ].op == LOAD) {
			#ifdef DEBUG_CODEGEN
				fprintf(Cout," /* %d */ ",__LINE__);
			#endif
			if (r0 < regsavail()) {
				fprintf(Cout, "%s_m2r(%s, mm%d);",
					buf,
					symname(tup[ tup[s].m2rmode ].symbol,
						tup[ tup[s].m2rmode ].fragment,
						tup[ tup[s].m2rmode ].offset),
					r0);
			} else {
				/* Destination spilled */
				/* Save mm0 */
				fprintf(Cout,
					"movq_r2m(mm0, %s);\n",
					spoolname(regsavail()-1));
				/* Move destination into mm0 */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, mm0);\n",
					spoolname(r0));

				/* Apply op to source and mm0 */
				Ctab();
				fprintf(Cout, "%s_m2r(%s, mm0);",
					buf,
					symname(tup[ tup[s].m2rmode ].symbol,
						tup[ tup[s].m2rmode ].fragment,
						tup[ tup[s].m2rmode ].offset));

				/* Move mm0 into destination */
				Ctab();
				fprintf(Cout,
					"movq_r2m(mm0, %s);\n",
					spoolname(r0));
				/* Restore mm0 */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, mm0);",
					spoolname(regsavail()-1));
			}
		} else {
			/* an immediate from the constant pool */
			switch (tup[s].op) {
			case SHL:
			case SHR:
			case PERM:
				/* do not need constant pool for 8-bit
				   immediates, but this only works for shifts
				   and adds a byte to the opcode, so it is not
				   as clear a win as one might hope...  anyway,
				   use it if we can
				*/
				if (r0 < regsavail()) {
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					fprintf(Cout, "%s_i2r(%lld, mm%d);",
					buf,
					(tup[tup[s].m2rmode].immed.q[0] &
					 0xffLL),
					r0);
				} else {
					/* Destination spilled */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					/* Save mm0 */
					fprintf(Cout,
						"movq_r2m(mm0, %s);\n",
						spoolname(regsavail()-1));
					/* Move destination into mm0 */
					Ctab();
					fprintf(Cout,
						"movq_m2r(%s, mm0);\n",
						spoolname(r0));

					/* Apply op to source and mm0 */
					Ctab();
					fprintf(Cout, "%s_i2r(%lld, mm0);\n",
					    buf,
					    (tup[tup[s].m2rmode].immed.q[0] &
					     0xffLL));

					/* Move mm0 into destination */
					Ctab();
					fprintf(Cout,
						"movq_r2m(mm0, %s);\n",
						spoolname(r0));
					/* Restore mm0 */
					Ctab();
					fprintf(Cout,
						"movq_m2r(%s, mm0);",
						spoolname(regsavail()-1));
				}
				break;
			default:
				/* load frag-sized immediates via the constant
				   pool
				*/
				cpoolenter(tup[ tup[s].m2rmode ].immed);

				if (r0 < regsavail()) {
					/* Didn't spill */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
				    	fprintf(Cout, "%s_m2r(%s, mm%d);",
					    buf,
					    cpoolname(
						tup[ tup[s].m2rmode ].immed),
					    r0);
				} else {
					/* Destination spilled */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					/* Save mm0 */
					fprintf(Cout,
						"movq_r2m(mm0, %s);\n",
						spoolname(regsavail()-1));
					/* Move destination into mm0 */
					Ctab();
					fprintf(Cout,
						"movq_m2r(%s, mm0);\n",
						spoolname(r0));
					/* Apply op to source and mm0 */
					Ctab();
					fprintf(Cout, "%s_m2r(%s, mm0);\n",
					    buf,
					    cpoolname(
						tup[ tup[s].m2rmode ].immed));
					/* Move mm0 into destination */
					Ctab();
					fprintf(Cout,
						"movq_r2m(mm0, %s);\n",
						spoolname(r0));
					/* Restore mm0 */
					Ctab();
					fprintf(Cout,
						"movq_m2r(%s, mm0);",
						spoolname(regsavail()-1));
				}
			}
		}
	} else {
		if (tup[s].op == LEA) {
			/* Generate the address */
			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */ ",__LINE__);
			#endif
			if (r0 < regsavail()) {
				fprintf(Cout, "%s_m2r((p64_t *) &%s, mm%d);",
					buf, tup[s].symbol->text, r0);
			} else {
				/* Destination spilled */
				/* Save mm0 */
				fprintf(Cout,
					"movq_r2m(mm0, %s);\n",
					spoolname(regsavail()-1));
				/* Move destination into mm0 */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, mm0);\n",
					spoolname(r0));

				/* Apply op to source and mm0 */
				Ctab();
				fprintf(Cout, "%s_m2r((p64_t *) &%s, mm0);\n",
					buf,
					tup[s].symbol->text);

				/* Move mm0 into destination */
				Ctab();
				fprintf(Cout,
					"movq_r2m(mm0, %s);\n",
					spoolname(r0));
				/* Restore mm0 */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, mm0);",
					spoolname(regsavail()-1));
			}
		} else if (tup[s].op == LOADR) {
			/* Generate the load using address in register */
			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */ ",__LINE__);
			#endif
			if (r0 < regsavail()) {
				fprintf(Cout, "%s_x2r(mm%d, mm%d);",
					buf, r0, r0);
			} else {
				/* Spilled */
				/* Save mm0 */
				fprintf(Cout,
					"movq_r2m(mm0, %s);\n",
					spoolname(regsavail()-1));
				/* Move destination into mm0 */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, mm0);\n",
					spoolname(r0));
				/* Apply op to source and mm0 (as dest) */
				Ctab();
				fprintf(Cout,
					"%s_x2r(mm0, mm0);",
					buf);
				/* Move mm0 into destination */
				Ctab();
				fprintf(Cout,
					"movq_r2m(mm0, %s);\n",
					spoolname(r0));
				/* Restore mm0 */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, mm0);",
					spoolname(regsavail()-1));
			}
		} else {
			if ( (r1 < regsavail()) && (r0 < regsavail()) ) {
				/* Neither spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout,"/* %d */ ",__LINE__);
				#endif
				fprintf(Cout, "%s_r2r(mm%d, mm%d);",
					buf, r1, r0);
			} else if ( (r1>=regsavail()) && (r0<regsavail()) ) {
				/* Source spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout,"/* %d */ ",__LINE__);
				#endif
				fprintf(Cout, "%s_m2r(%s, mm%d);",
					buf,
					spoolname(r1),
					r0);
			} else if ( (r1<regsavail()) && (r0>=regsavail()) ) {
				/* Destination spilled */
				int savereg = (r1==0)? 1:0;

				#ifdef DEBUG_CODEGEN
					fprintf(Cout,"/* %d */ ",__LINE__);
				#endif

				/* Save a register */
				fprintf(Cout,
					"movq_r2m(mm%d, %s);\n",
					savereg,
					spoolname(regsavail()-1));
				/* Move destination into savereg */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, mm%d);\n",
					spoolname(r0),
					savereg);
				/* Apply op to source and savereg (as dest) */
				Ctab();
				fprintf(Cout,
					"%s_r2r(mm%d, mm%d);\n",
					buf,
					r1,
					savereg);
				/* Move savereg into destination */
				Ctab();
				fprintf(Cout,
					"movq_r2m(mm%d, %s);\n",
					savereg,
					spoolname(r0));
				/* Restore savereg */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, mm%d);",
					spoolname(regsavail()-1),
					savereg);
			} else if ( (r1>=regsavail()) && (r0>=regsavail()) ) {
				/* Source and destination spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout,"/* %d */ ",__LINE__);
				#endif
				/* Save mm0 */
				fprintf(Cout,
					"movq_r2m(mm0, %s);\n",
					spoolname(regsavail()-1));
				/* Move destination into mm0 */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, mm0);\n",
					spoolname(r0));
				/* Apply op to source and mm0 (as dest) */
				Ctab();
				fprintf(Cout,
					"%s_m2r(%s, mm0);\n",
					buf,
					spoolname(r1));
				/* Move mm0 into destination */
				Ctab();
				fprintf(Cout,
					"movq_r2m(mm0, %s);\n",
					spoolname(r0));
				/* Restore mm0 */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, mm0);",
					spoolname(regsavail()-1));
			}
		}
	}
	fprintf(Cout, "\n");
}

static void
maxop(register int s)
{
	register int r0 = -1;
	register int r1 = -1;
	char buf[256];		/* This is much larger than it needs to be */

	/* Get first arg, with possible rename */
	if (tup[s].arg[0] != -1) {
		r0 = tup[ tup[s].arg[0] ].reg;

		if (tup[s].antidep != -1) {
			/* Arg was renamed; output a move */
			r0 = tup[s].reg;
			Ctab();

			if ( (r0 < regsavail()) &&
			     (tup[s].antidep < regsavail()) ) {
				/* Neither spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout,"/* %d */ ",__LINE__);
				#endif
				fprintf(Cout,
					"movq_r2r(r%d, r%d);\n",
					r0,
					tup[s].antidep);
			} else if ( (r0 >= regsavail()) &&
			     (tup[s].antidep < regsavail()) ) {
				/* Source spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout, "/* %d */ ", __LINE__);
				#endif
				fprintf(Cout,
					"movq_m2r(%s, r%d);\n",
					spoolname(r0),
					tup[s].antidep);
			} else if ( (r0 < regsavail()) &&
			     (tup[s].antidep >= regsavail()) ) {
				/* Destination spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout, "/* %d */ ", __LINE__);
				#endif
				fprintf(Cout,
					"movq_r2m(r%d, %s);\n",
					r0,
					spoolname(tup[s].antidep));
			} else if ( (r0 >= regsavail()) &&
			     (tup[s].antidep >= regsavail()) ) {
				/* Source and destination spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout, "/* %d */ ", __LINE__);
				#endif
				/* Save r0 */
				fprintf(Cout,
					"movq_r2m(r0, %s);\n",
					spoolname(regsavail()-1));
				/* Move source into r0 */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, r0);\n",
					spoolname(r0));
				/* Move source into destination */
				Ctab();
				fprintf(Cout,
					"movq_r2m(r0, %s);\n",
					spoolname(tup[s].antidep));
				/* Restore r0 */
				Ctab();
				fprintf(Cout,
					"movq_m2r(%s, r0);\n",
					spoolname(regsavail()-1));
			}
			tup[ tup[s].arg[0] ].reg = tup[s].antidep;
		}
	}

	/* Get second arg */
	if (tup[s].arg[1] != -1) {
		r1 = tup[ tup[s].arg[1] ].reg;
	}

	/* Where does result go? */
	tup[s].reg = r0;

	Ctab();
	switch (tup[s].op) {
	case ADD:
		if (tup[s].type.attr & TYP_FLOAT) {
			bug("float ADD available only for 3DNow!");
		} else {
			if (strcmp(show_suf(s), "q")) {
				snprintf(buf, 256, "hadd%s%s",
				 (tup[s].type.attr & TYP_SAT)?
				  ((tup[s].type.attr & TYP_UNSIGN)?
				   "us": "ss"):
				  (""),
				 show_suf(s));
			} else {
				snprintf(buf, 256, "add%s",
				 (tup[s].type.attr & TYP_SAT)?
				  ((tup[s].type.attr & TYP_UNSIGN)?
				   "us": "ss"):
				  (""));
			}
		}
		break;
	case SUB:
		if (strcmp(show_suf(s), "q")) {
			snprintf(buf, 256, "hsub%s%s",
			 (tup[s].type.attr & TYP_SAT)?
			  ((tup[s].type.attr & TYP_UNSIGN)?
			   "us": "ss"):
			  (""),
			 show_suf(s));
		} else {
			snprintf(buf, 256, "sub%s",
			 (tup[s].type.attr & TYP_SAT)?
			  ((tup[s].type.attr & TYP_UNSIGN)?
			   "us": "ss"):
			  (""));
		}
		break;
	case MUL:
		bug("MUL instruction not available for this target");
		break;
	case MULH:
		bug("MULH instruction not available for this target");
		break;
	case DIV:
		snprintf(buf,
			256,
			"pdiv%s%s",
			((tup[s].type.attr & TYP_UNSIGN) ? "u" : ""),
			show_suf(s));
		break;
	case MOD:
		snprintf(buf,
			256,
			"pmod%s%s",
			((tup[s].type.attr & TYP_UNSIGN) ? "u" : ""),
			show_suf(s));
		break;
	case AVG:
		snprintf(buf, 256, "havg%s%s",
			 (tup[s].type.attr & TYP_SAT)?
			  ((tup[s].type.attr & TYP_UNSIGN)?
			   "us": "ss"):
			  (""),
			 show_suf(s));
		break;

	case MAX:
		bug("MAX instruction not available on this target");
		break;
	case MIN:
		bug("MIN instruction not available for this target");
		break;
	case AND:
		sprintf(buf, "and");
		break;
	case ANDN:
		sprintf(buf, "andcm");
		break;
	case OR:
		sprintf(buf, "or");
		break;
	case XOR:
		sprintf(buf, "xor");
		break;

	case EQ:
		snprintf(buf,
			256,
			"cmpeq%s%s",
			(tup[s].type.attr & TYP_UNSIGN)? "u":"",
			show_suf(s));
		break;

	case GT:
		snprintf(buf,
			256,
			"cmpgt%s%s",
			(tup[s].type.attr & TYP_UNSIGN)? "u":"",
			show_suf(s));
		break;

	case GT_C: /* Compare for GT, with C-like result */
		bug("GT_C instruction not available for this target");
		break;

	case SHL:
		sprintf(buf, "hshl");
		break;

	case SHLBIT:
		bug("SHLBIT instruction not available for this target");
		break;

	case SHLBYTE:
		bug("SHLBYTE instruction not available for this target");
		break;

	case SHR:
		sprintf(buf, "hshr%s",
			(tup[s].type.attr & TYP_UNSIGN)? "u" : "");
		break;

	case SHRBIT:
		bug("SHRBIT instruction not available for this target");
		break;

	case SHRBYTE:
		bug("SHRBYTE instruction not available for this target");
		break;

	case PACKS2U:
		bug("PACKS2U instruction not available for this target");
		break;

	case PACK:
		bug("PACK instruction not available for this target");
		break;

	case UNPACKH:
		bug("UNPACKH instruction not available for this target");
		break;

	case UNPACKL:
		bug("UNPACKL instruction not available for this target");
		break;

	case INTRLVLOW:
		bug("PUNPCKL instruction not available for this target");
		break;

	case INTRLVHIGH:
		bug("PUNPCKH instruction not available for this target");
		break;

	case INTRLVEVEN:
		snprintf(buf, 256, "mixr%s", show_suf(s));
		break;

	case INTRLVODD:
		snprintf(buf, 256, "mixl%s", show_suf(s));
		break;

	case PERM:
		snprintf(buf, 256, "perm%s", show_suf(s));
		break;

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"maxop op=%s (this cannot happen?)",
				opname(tup[s].op));
			bug(buf);
		}
		return;
	}


	/* Output the arguments */
	if (tup[s].m2rmode != -1) {
		if (tup[ tup[s].m2rmode ].op == LOAD) {
			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */ ",__LINE__);
			#endif
			fprintf(Cout, "%s_m2r(%s, r%d);",
				buf,
				symname(tup[ tup[s].m2rmode ].symbol,
					tup[ tup[s].m2rmode ].fragment,
					tup[ tup[s].m2rmode ].offset),
				r0);
		} else {
			/* an immediate from the constant pool */
			switch (tup[s].op) {
			case SHL:
			case SHR:
			case PERM:
				/* do not need constant pool for 8-bit
				   immediates, but this only works for shifts
				   and adds a byte to the opcode, so it is not
				   as clear a win as one might hope...  anyway,
				   use it if we can
				*/
				if (r0 < regsavail()) {
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					fprintf(Cout, "%s_i2r(%lld, r%d);",
					buf,
					(tup[tup[s].m2rmode].immed.q[0] &
					 0xffLL),
					r0);
				} else {
					/* Destination spilled */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					/* Save r0 */
					fprintf(Cout,
						"movq_r2m(r0, %s);\n",
						spoolname(regsavail()-1));
					/* Move destination into r0 */
					Ctab();
					fprintf(Cout,
						"movq_m2r(%s, r0);\n",
						spoolname(r0));

					/* Apply op to source and r0 */
					Ctab();
					fprintf(Cout, "%s_i2r(%lld, r0);\n",
					    buf,
					    (tup[tup[s].m2rmode].immed.q[0] &
					     0xffLL));

					/* Move r0 into destination */
					Ctab();
					fprintf(Cout,
						"movq_r2m(r0, %s);\n",
						spoolname(r0));
					/* Restore r0 */
					Ctab();
					fprintf(Cout,
						"movq_m2r(%s, r0);",
						spoolname(regsavail()-1));
				}
				break;
			default:
				/* load frag-sized immediates via the constant
				   pool
				*/
				cpoolenter(tup[ tup[s].m2rmode ].immed);

				if (r0 < regsavail()) {
					/* Didn't spill */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
				    	fprintf(Cout, "%s_m2r(%s, r%d);",
					    buf,
					    cpoolname(
						tup[ tup[s].m2rmode ].immed),
					    r0);
				} else {
					/* Destination spilled */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					/* Save r0 */
					fprintf(Cout,
						"movq_r2m(r0, %s);\n",
						spoolname(regsavail()-1));
					/* Move destination into r0 */
					Ctab();
					fprintf(Cout,
						"movq_m2r(%s, r0);\n",
						spoolname(r0));
					/* Apply op to source and r0 */
					Ctab();
					fprintf(Cout, "%s_m2r(%s, r0);\n",
					    buf,
					    cpoolname(
						tup[ tup[s].m2rmode ].immed));
					/* Move r0 into destination */
					Ctab();
					fprintf(Cout,
						"movq_r2m(r0, %s);\n",
						spoolname(r0));
					/* Restore r0 */
					Ctab();
					fprintf(Cout,
						"movq_m2r(%s, r0);",
						spoolname(regsavail()-1));
				}
			}
		}
	} else {
		if ( (r1 < regsavail()) && (r0 < regsavail()) ) {
			/* Neither spilled */
			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */ ",__LINE__);
			#endif
			fprintf(Cout, "%s_r2r(r%d, r%d);",
				buf, r1, r0);
		} else if ( (r1 >= regsavail()) && (r0 < regsavail()) ) {
			/* Source spilled */
			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */ ",__LINE__);
			#endif
			fprintf(Cout, "%s_m2r(%s, r%d);",
				buf,
				spoolname(r1),
				r0);
		} else if ( (r1 < regsavail()) && (r0 >= regsavail()) ) {
			/* Destination spilled */
			int savereg = (r1==0)? 1:0;

			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */ ",__LINE__);
			#endif

			/* Save a register */
			fprintf(Cout,
				"movq_r2m(r%d, %s);\n",
				savereg,
				spoolname(regsavail()-1));
			/* Move destination into savereg */
			Ctab();
			fprintf(Cout,
				"movq_m2r(%s, r%d);\n",
				spoolname(r0),
				savereg);
			/* Apply op to source and savereg (as dest) */
			Ctab();
			fprintf(Cout,
				"%s_r2r(r%d, r%d);\n",
				buf,
				r1,
				savereg);
			/* Move savereg into destination */
			Ctab();
			fprintf(Cout,
				"movq_r2m(r%d, %s);\n",
				savereg,
				spoolname(r0));
			/* Restore savereg */
			Ctab();
			fprintf(Cout,
				"movq_m2r(%s, r%d);",
				spoolname(regsavail()-1),
				savereg);
		} else if ( (r1 >= regsavail()) && (r0 >= regsavail()) ) {
			/* Source and destination spilled */
			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */ ",__LINE__);
			#endif

			/* Save r0 */
			fprintf(Cout,
				"movq_r2m(r0, %s);\n",
				spoolname(regsavail()-1));
			/* Move destination into r0 */
			Ctab();
			fprintf(Cout,
				"movq_m2r(%s, r0);\n",
				spoolname(r0));
			/* Apply op to source and r0 (as dest) */
			Ctab();
			fprintf(Cout,
				"%s_m2r(%s, r0);\n",
				buf,
				spoolname(r1));
			/* Move r0 into destination */
			Ctab();
			fprintf(Cout,
				"movq_r2m(r0, %s);\n",
				spoolname(r0));
			/* Restore r0 */
			Ctab();
			fprintf(Cout,
				"movq_m2r(%s, r0);",
				spoolname(regsavail()-1));
		}
	}
	fprintf(Cout, "\n");
}

static void
altivecop(register int s)
{
	register int r0 = -1;
	register int r1 = -1;
	register int r2 = -1;
	char buf[256];		/* This is much larger than it needs to be */

	/* Get first arg, with possible rename */
	if (tup[s].arg[0] != -1) {
		r0 = tup[ tup[s].arg[0] ].reg;

		if (tup[s].antidep != -1) {
			/* Arg was renamed; output a move */
			r0 = tup[s].reg;
			Ctab();

			if ( (r0 < regsavail()) &&
			     (tup[s].antidep < regsavail()) ) {
				/* Neither spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout, "/* %d */ ", __LINE__);
				#endif
				fprintf(Cout,
					"vmr_r2r(%d, %d);\n",
					r0,
					tup[s].antidep);
			} else if ( (r0 >= regsavail()) &&
			     (tup[s].antidep < regsavail()) ) {
				/* Source spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout, "/* %d */ ", __LINE__);
				#endif
				fprintf(Cout,
					"lvx_m2r(%s, %d);\n",
					spoolname(r0),
					tup[s].antidep);
			} else if ( (r0 < regsavail()) &&
			     (tup[s].antidep >= regsavail()) ) {
				/* Destination spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout, "/* %d */ ", __LINE__);
				#endif
				fprintf(Cout,
					"stvx_r2m(%d, %s);\n",
					r0,
					spoolname(tup[s].antidep));
			} else if ( (r0 >= regsavail()) &&
			     (tup[s].antidep >= regsavail()) ) {
				/* Source and destination spilled */
				#ifdef DEBUG_CODEGEN
					fprintf(Cout, "/* %d */\n", __LINE__);
					Ctab();
				#endif
				/* Save v0 */
				fprintf(Cout,
					"stvx_r2m(0, %s);\n",
					spoolname(regsavail()-1));
				/* Move source into v0 */
				Ctab();
				fprintf(Cout,
					"lvx_m2r(%s, 0);\n",
					spoolname(r0));
				/* Move source into destination */
				Ctab();
				fprintf(Cout,
					"stvx_r2m(0, %s);\n",
					spoolname(tup[s].antidep));
				/* Restore v0 */
				Ctab();
				fprintf(Cout,
					"lvx_m2r(%s, 0);\n",
					spoolname(regsavail()-1));
			}
			tup[ tup[s].arg[0] ].reg = tup[s].antidep;
		}
	}

	/* Get second arg */
	if (tup[s].arg[1] != -1) {
		r1 = tup[ tup[s].arg[1] ].reg;
	}

	/* Get third arg */
	if (tup[s].arg[2] != -1) {
		r2 = tup[ tup[s].arg[2] ].reg;
	}

	/* Where does result go? */
	tup[s].reg = r0;

	Ctab();
	switch (tup[s].op) {
	case F2I:
		sprintf(buf, "vrfin");
		r1 = r0;
		break;
	case I2F:
		if (tup[s].type.attr & TYP_UNSIGN) {
			sprintf(buf, "vcfux");
		} else {
			sprintf(buf, "vcfsx");
		}
		r1 = r0;
		break;
	case RCP:
		sprintf(buf, "vrefp");
		r1 = r0;
		break;
	case RCP1:
	case RCP2:
		bug("extra iterations should not be used for float reciprocal"
		    " only AltiVec");
		break;

	case ADD:
		if (tup[s].type.attr & TYP_FLOAT) {
			sprintf(buf, "vaddfp");
		} else {
			snprintf(buf, 256, "vadd%s%s%s",
				 (tup[s].type.attr & TYP_UNSIGN)?  "u": "s",
				 show_suf(s),
				 (tup[s].type.attr & TYP_SAT)? "s": "");
		}
		break;
	case ADDH:
		if (tup[s].type.attr & TYP_FLOAT) {
			bug("Floating-point add high not supported");
		} else {
			snprintf(buf, 256, "vaddc%s%s%s",
				 (tup[s].type.attr & TYP_UNSIGN)?  "u": "s",
				 show_suf(s),
				 (tup[s].type.attr & TYP_SAT)? "s": "");
		}
		break;
	case SUB:
		if (tup[s].type.attr & TYP_FLOAT) {
			sprintf(buf, "vsubfp");
		} else {
			snprintf(buf, 256, "vsub%s%s%s",
				 (tup[s].type.attr & TYP_UNSIGN)?  "u": "s",
				 show_suf(s),
				 (tup[s].type.attr & TYP_SAT)? "s": "");
		}
		break;
	case MUL:
		if (tup[s].type.attr & TYP_FLOAT) {
			bug("float MUL not implemented for AltiVec yet");
		} else {
			snprintf(buf, 256, "vmull%s", show_suf(s));
		}
		break;
	case MULH:
		if (tup[s].type.attr & TYP_FLOAT) {
			bug("float MULH not implemented for AltiVec yet");
		} else if (tup[s].type.attr & TYP_UNSIGN) {
			snprintf(buf, 256, "vmulhu%s", show_suf(s));
		} else {
			snprintf(buf, 256, "vmulh%s", show_suf(s));
		}
		break;
	case MULEVEN:
		if (tup[s].type.attr & TYP_FLOAT) {
			bug("float MULEVEN not implemented for AltiVec yet");
		} else {
			snprintf(buf,
				 256,
				 "vmulo%s%s",
				 (tup[s].type.attr & TYP_UNSIGN)? "u" : "s",
				 show_suf(s));
		}
		break;
	case MULODD:
		if (tup[s].type.attr & TYP_FLOAT) {
			bug("float MULODD not implemented for AltiVec yet");
		} else {
			snprintf(buf,
				 256,
				 "vmule%s%s",
				 (tup[s].type.attr & TYP_UNSIGN)? "u" : "s",
				 show_suf(s));
		}
		break;
	case DIV:
		if (tup[s].type.attr & TYP_FLOAT) {
			bug("float DIV not implemented for AltiVec yet");
		} else if (tup[s].type.attr & TYP_UNSIGN) {
			snprintf(buf, 256, "vdivu%s", show_suf(s));
		} else {
			snprintf(buf, 256, "vdivs%s", show_suf(s));
		}
		break;
	case MOD:
		if (tup[s].type.attr & TYP_FLOAT) {
			bug("float DIV not implemented for AltiVec yet");
		} else if (tup[s].type.attr & TYP_UNSIGN) {
			snprintf(buf, 256, "vmodu%s", show_suf(s));
		} else {
			snprintf(buf, 256, "vmods%s", show_suf(s));
		}
		break;
	case AVG:
		sprintf(buf, "vavg%s%s",
			((tup[s].type.attr & TYP_UNSIGN) ? "u" : "s"),
			show_suf(s));
		break;

	case MAX:
		if (tup[s].type.attr & TYP_FLOAT) {
			sprintf(buf, "vmaxfp");
		} else {
			sprintf(buf, "vmax%s%s",
				((tup[s].type.attr & TYP_UNSIGN) ? "u" : "s"),
				show_suf(s));
		}
		break;
	case MIN:
		if (tup[s].type.attr & TYP_FLOAT) {
			sprintf(buf, "vminfp");
		} else {
			sprintf(buf, "vmin%s%s",
				((tup[s].type.attr & TYP_UNSIGN) ? "u" : "s"),
				show_suf(s));
		}
		break;

	case AND:
		sprintf(buf, "vand");
		break;
	case ANDN:
		sprintf(buf, "vandn");
		break;
	case OR:
		sprintf(buf, "vor");
		break;
	case NOR:
		sprintf(buf, "vnor");
		break;
	case XOR:
		sprintf(buf, "vxor");
		break;

	case EQ:
		if (tup[s].type.attr & TYP_FLOAT) {
			sprintf(buf, "vcmpeqfp");
		} else {
			snprintf(buf, 256, "vcmpequ%s", show_suf(s));
		}
		break;
	case EQ_C: /* Compare for EQ, with C-like result */
		bug("EQ_C instruction not available for this target");
		break;

	case GT:
		if (tup[s].type.attr & TYP_FLOAT) {
			sprintf(buf, "vcmpgtfp");
		} else {
			snprintf(buf,
				 256,
				 "vcmpgt%s%s",
				 (tup[s].type.attr & TYP_UNSIGN)? "u":"s",
				 show_suf(s));
		}
		break;
	case GT_C: /* Compare for GT, with C-like result */
		bug("GT_C instruction not available for this target");
		break;

	case GE:
		if (tup[s].type.attr & TYP_FLOAT) {
			sprintf(buf, "vcmpgeqfp");
		} else {
			bug("Non-float GE instruction not available for this "
			    "target");
		}
		break;

	case SHL:	/* parallel shift */
	case SHLBIT:	/* single shift */
		snprintf(buf, 256, "vsl%s", show_suf(s));
		break;

	case SHLBYTE:
		snprintf(buf, 256, "vslo");
		break;

	case SHR:	/* parallel shift */
	case SHRBIT:	/* single shift */
		snprintf(buf, 256, "vsr%s%s",
			(tup[s].type.attr & TYP_UNSIGN)?
			    "" :
			    (tup[s].type.attr == TYP_NULL)? "" : "a",
			show_suf(s));
		break;

	case SHRBYTE:
		snprintf(buf, 256, "vsro");
		break;

	case PACKS2U:
		if (tup[s].type.attr & TYP_SAT) {
			snprintf(buf,
				 256,
				 "vpks%sus",
				 show_suf2(tup[s].type.bits*2));
		} else {
			bug("Generated PACKS2U is incorrect for modular types");
		}
		break;

	case PACK:
		if (tup[s].type.attr & TYP_SAT) {
			if (tup[s].type.attr & TYP_UNSIGN) {
				snprintf(buf,
					 256,
					 "vpku%sus",
					 show_suf2(tup[s].type.bits*2));
			} else {
				snprintf(buf,
					 256,
					 "vpks%sss",
					 show_suf2(tup[s].type.bits*2));
			}
		} else {
			snprintf(buf,
				 256,
				 "vpku%sum",
				 show_suf2(tup[s].type.bits*2));
		}
		break;

	case UNPACKH:
		snprintf(buf,
			 256,
			 "vupkhs%s",
			 show_suf2(bitsperfield(tup[s].type.bits) / 2));
		r1 = r0;
		break;
	case UNPACKL:
		snprintf(buf,
			 256,
			 "vupkls%s",
			 show_suf2(bitsperfield(tup[s].type.bits) / 2));
		r1 = r0;
		break;

	case INTRLVLOW:
		snprintf(buf,
			 256,
			 "vmrgl%s",
			 show_suf2(bitsperfield(tup[s].type.bits) / 2));
		break;
	case INTRLVHIGH:
		snprintf(buf,
			 256,
			 "vmrgh%s",
			 show_suf2(bitsperfield(tup[s].type.bits) / 2));
		break;

	case INTRLVEVEN:
		snprintf(buf, 256, "vperm");
		break;
	case INTRLVODD:
		snprintf(buf, 256, "vperm");
		break;

	case TPERM:
	case PERM:
		snprintf(buf, 256, "vperm");
		break;

	case REPL:
		snprintf(buf, 256, "vsplt%s", show_suf(s));
		break;

	case LVSL:
		snprintf(buf, 256, "lvsl");
		break;

	case LOADR:
		snprintf(buf, 256, "lvx");
		break;

	case LOADRR:
		snprintf(buf, 256, "loadrr");
		break;

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"altivecop op=%s (this cannot happen?)",
				opname(tup[s].op));
			bug(buf);
		}
		return;
	}


	/* Output the arguments */
	#ifdef DEBUG_m2rmode
		fprintf(Cout, "/* m2rmode is %d */", tup[s].m2rmode);
	#endif
	if (tup[s].m2rmode != -1) {
		if (tup[ tup[s].m2rmode ].op == LOAD) {
			#ifdef DEBUG_CODEGEN
				fprintf(Cout," /* %d */ ",__LINE__);
			#endif
			if (r0 < regsavail()) {
				fprintf(Cout, "%s_m2r(%s, %d);",
					buf,
					symname(tup[ tup[s].m2rmode ].symbol,
						tup[ tup[s].m2rmode ].fragment,
						tup[ tup[s].m2rmode ].offset),
					r0);
			} else {
				/* Destination spilled */
				/* Save v0 */
				fprintf(Cout,
					"stvx_r2m(0, %s);\n",
					spoolname(regsavail()-1));
				/* Move destination into v0 */
				Ctab();
				fprintf(Cout,
					"lvx_m2r(%s, 0);\n",
					spoolname(r0));

				/* Apply op to source and v0 */
				Ctab();
				fprintf(Cout, "%s_m2r(%s, 0);",
					buf,
					symname(tup[ tup[s].m2rmode ].symbol,
						tup[ tup[s].m2rmode ].fragment,
						tup[ tup[s].m2rmode ].offset));

				/* Move v0 into destination */
				Ctab();
				fprintf(Cout,
					"stvx_r2m(0, %s);\n",
					spoolname(r0));
				/* Restore v0 */
				Ctab();
				fprintf(Cout,
					"lvx_m2r(%s, 0);",
					spoolname(regsavail()-1));
			}
		} else {
			/* an immediate from the constant pool */
			switch (tup[s].op) {
			case SHL:
			case SHR:
			case PERM:
			case REPL:
				/* do not need constant pool for 8-bit
				   immediates, but this only works for shifts
				   and adds a byte to the opcode, so it is not
				   as clear a win as one might hope...  anyway,
				   use it if we can
				*/
				if (r0 < regsavail()) {
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					fprintf(Cout, "%s_i2r(%lld, %d);",
					buf,
					(tup[tup[s].m2rmode].immed.q[0] &
					 0xffLL),
					r0);
				} else {
					/* Destination spilled */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					/* Save v0 */
					fprintf(Cout,
						"stvx_r2m(0, %s);\n",
						spoolname(regsavail()-1));
					/* Move destination into v0 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 0);\n",
						spoolname(r0));

					/* Apply op to source and v0 */
					Ctab();
					fprintf(Cout, "%s_i2r(%lld, 0);\n",
					    buf,
					    (tup[tup[s].m2rmode].immed.q[0] &
					     0xffLL));

					/* Move v0 into destination */
					Ctab();
					fprintf(Cout,
						"stvx_r2m(0, %s);\n",
						spoolname(r0));
					/* Restore v0 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 0);",
						spoolname(regsavail()-1));
				}
				break;
			default:
				/* load frag-sized immediates via the constant
				   pool
				*/
				cpoolenter(tup[ tup[s].m2rmode ].immed);
				if (r0 < regsavail()) {
					/* Didn't spill */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
				    	fprintf(Cout, "%s_m2r(%s, %d);",
					    buf,
					    cpoolname(
						tup[ tup[s].m2rmode ].immed),
					    r0);
				} else {
					/* Destination spilled */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					/* Save v0 */
					fprintf(Cout,
						"stvx_r2m(0, %s);\n",
						spoolname(regsavail()-1));
					/* Move destination into v0 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 0);\n",
						spoolname(r0));
					/* Apply op to source and v0 */
					Ctab();
/* HEREHERE - Possible problem? */
					fprintf(Cout, "%s_m2r(%s, 0);\n",
					    buf,
					    cpoolname(
						tup[ tup[s].m2rmode ].immed));
					/* Move v0 into destination */
					Ctab();
					fprintf(Cout,
						"stvx_r2m(0, %s);\n",
						spoolname(r0));
					/* Restore v0 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 0);",
						spoolname(regsavail()-1));
				}
			}
		}
	} else {
		if (tup[s].op == LEA) {
			/* Generate the address */
			#ifdef DEBUG_CODEGEN
				fprintf(Cout," /* %d */ ",__LINE__);
			#endif

			if (r0 < regsavail()) {
				fprintf(Cout, "%s_m2r((p128_t *) &%s, %d);",
					buf, tup[s].symbol->text, r0);
			} else {
				/* Destination spilled */
				/* Save v0 */
				fprintf(Cout,
					"stvx_r2m(0, %s);\n",
					spoolname(regsavail()-1));
				/* Move destination into v0 */
				Ctab();
				fprintf(Cout,
					"lvx_m2r(%s, 0);\n",
					spoolname(r0));

				/* Apply op to source and v0 */
				Ctab();
				fprintf(Cout, "%s_m2r((p64_t *) &%s, 0);\n",
					buf,
					tup[s].symbol->text);

				/* Move v0 into destination */
				Ctab();
				fprintf(Cout,
					"stvx_r2m(0, %s);\n",
					spoolname(r0));
				/* Restore v0 */
				Ctab();
				fprintf(Cout,
					"lvx_m2r(%s, 0);",
					spoolname(regsavail()-1));
			}
		} else if (tup[s].op == LOADR) {
			/* Generate the load using address in register */
			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */\n",__LINE__);
			#endif
			if (r0 < regsavail()) {
				fprintf(Cout, "%s_x2r(%d, %d);",
					buf, r0, r0);
			} else {
				/* Spilled */
				/* Save v0 */
				fprintf(Cout,
					"stvx_r2m(0, %s);\n",
					spoolname(regsavail()-1));
				/* Move destination into v0 */
				Ctab();
				fprintf(Cout,
					"lvx_m2r(%s, 0);\n",
					spoolname(r0));
				/* Apply op to source and v0 (as dest) */
				Ctab();
				fprintf(Cout,
					"%s_x2r(0, 0);",
					buf);
				/* Move v0 into destination */
				Ctab();
				fprintf(Cout,
					"stvx_r2m(0, %s);\n",
					spoolname(r0));
				/* Restore v0 */
				Ctab();
				fprintf(Cout,
					"lvx_m2r(%s, 0);",
					spoolname(regsavail()-1));
			}
		} else if (tup[s].op == LOADRR) {
			#ifdef DEBUG_CODEGEN
				fprintf(Cout,"/* %d */ ",__LINE__);
			#endif
			/* Generate the address */
			if (r0 < regsavail()) {
				fprintf(Cout,
					"%s_m2r(*((p128_t *)%s), %d, %d, %d);",
					buf,
					tup[s].symbol->text,
					r0,
					tup[s].offset,
					r0);
			} else {
				bug ("LOADRR Doesn't handle spills yet" );
			}
		} else {
			if ( r2 != -1 ) {
				/* Trinary operation - this is a hack... */
				/* Operand order is: src0, src1, index, dest
				   At this point, we always use src0 as the
				   destination.
				*/
				if ((r0<regsavail()) &&
				    (r1<regsavail()) &&
				    (r2<regsavail())) {
					/* Nothing spilled */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					fprintf(Cout,
						"%s_r2r(%d, %d, %d, %d);",
						buf, r0, r1, r2, r0);

				} else if ( (r0>=regsavail()) &&
					    (r1< regsavail()) &&
					    (r2< regsavail()) ) {
					/* Destination spilled */
					int savereg;

					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */\n",
							__LINE__);
					#endif

					/* Pick a save register */
					for (savereg=0;
					     (savereg==r1 || savereg==r2);
					     ++savereg);

					/* Save it */
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						savereg,
						spoolname(regsavail()-1));

					/* Move destination into savereg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(r0),
						savereg);

					/* Apply op */
					Ctab();
					fprintf(Cout,
						"%s_r2r(%d, %d, %d, %d);\n",
						buf,
						savereg,
						r1,
						r2,
						savereg);

					/* Move savereg into destination */
					Ctab();
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						savereg,
						spoolname(r0));

					/* Restore savereg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);",
						spoolname(regsavail()-1),
						savereg);

				} else if ( (r0< regsavail()) &&
					    (r1>=regsavail()) &&
					    (r2< regsavail()) ) {

					/* Source spilled */
					int savereg;

					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */\n",
							__LINE__);
						Ctab();
					#endif

					/* Pick a save register */
					for (savereg=0;
					     (savereg==r0 || savereg==r2);
					     ++savereg);

					/* Save it */
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						savereg,
						spoolname(regsavail()-1));

					/* Move source into savereg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(r1),
						savereg);

					/* Perform operation */
					Ctab();
					fprintf(Cout,
						"%s_r2r(%d, %d, %d, %d);\n",
						 buf,
						 r0,
						 savereg,
						 r2,
						 r0);

					/* Restore savereg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(regsavail()-1),
						savereg);

				} else if ( (r0< regsavail()) &&
					    (r1< regsavail()) &&
					    (r2>=regsavail()) ) {

					/* Index spilled */
					int savereg;

					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */\n",
							__LINE__);
					#endif

					/* Pick a save register */
					for (savereg=0;
					     (savereg==r0 || savereg==r1);
					     ++savereg);

					/* Save it */
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						savereg,
						spoolname(regsavail()-1));

					/* Move index into savereg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(r2),
						savereg);

					/* Perform operation */
					Ctab();
					fprintf(Cout,
						"%s_r2r(%d, %d, %d, %d);",
						 buf,
						 r0,
						 r1,
						 savereg,
						 r0);

					/* Restore savereg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(regsavail()-1),
						savereg);

				} else if ( (r0>=regsavail()) &&
					    (r1>=regsavail()) &&
					    (r2< regsavail()) ) {

					/* Destination and source spilled */
					int destreg, srcreg;

					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */\n",
							__LINE__);
					#endif

					/* Pick a destination register */
					for (destreg=0;
					     (destreg==r2);
					     ++destreg);
					/* Save it */
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						destreg,
						spoolname(regsavail()-1));
					/* Move destination into it */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(r0),
						destreg);

					/* Pick a source register */
					for (srcreg=0;
					     (srcreg==destreg || srcreg==r2);
					     ++srcreg);
					Ctab();
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						srcreg,
						spoolname(regsavail()-2));
					/* Move source into it */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(r1),
						srcreg);

					/* Apply operation */
					Ctab();
					fprintf(Cout,
						"%s_r2r(%d, %d, %d, %d);\n",
						buf,
						destreg,
						srcreg,
						r2,
						destreg);

					/* Move destreg into destination */
					Ctab();
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						destreg,
						spoolname(r0));

					/* Restore destreg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(regsavail()-1),
						destreg);

					/* Restore srcreg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);",
						spoolname(regsavail()-2),
						srcreg);

				} else if ( (r0>=regsavail()) &&
					    (r1< regsavail()) &&
					    (r2>=regsavail()) ) {

					/* Destination and index spilled */
					int destreg, indreg;

					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */\n",
							__LINE__);
					#endif

					/* Pick a destination register */
					for (destreg=0;
					     (destreg==r1);
					     ++destreg);
					/* Save it */
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						destreg,
						spoolname(regsavail()-1));
					/* Move destination into it */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(r0),
						destreg);

					/* Pick an index register */
					for (indreg=0;
					     (indreg==destreg || indreg==r1);
					     ++indreg);
					Ctab();
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						indreg,
						spoolname(regsavail()-2));
					/* Move index into it */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(r2),
						indreg);

					/* Apply operation */
					Ctab();
					fprintf(Cout,
						"%s_r2r(%d, %d, %d, %d);\n",
						buf,
						destreg,
						r1,
						indreg,
						destreg);

					/* Move destreg into destination */
					Ctab();
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						destreg,
						spoolname(r0));

					/* Restore destreg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(regsavail()-1),
						destreg);

					/* Restore indreg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);",
						spoolname(regsavail()-2),
						indreg);

				} else if ( (r0< regsavail()) &&
					    (r1>=regsavail()) &&
					    (r2>=regsavail()) ) {

					/* Source and index spilled */
					int srcreg, indreg;

					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */\n",
							__LINE__);
					#endif

					/* Pick a source register */
					for (srcreg=0;
					     (srcreg==r0);
					     ++srcreg);
					/* Save it */
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						srcreg,
						spoolname(regsavail()-1));
					/* Move source into it */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(r1),
						srcreg);

					/* Pick an index register */
					for (indreg=0;
					     (indreg==r0 || indreg==srcreg);
					     ++indreg);
					Ctab();
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						indreg,
						spoolname(regsavail()-2));
					/* Move index into it */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(r2),
						indreg);

					/* Apply operation */
					Ctab();
					fprintf(Cout,
						"%s_r2r(%d, %d, %d, %d);\n",
						buf,
						r0,
						srcreg,
						indreg,
						r0);

					/* Restore srcreg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(regsavail()-1),
						srcreg);

					/* Restore indreg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);",
						spoolname(regsavail()-2),
						indreg);

				} else if ( (r0>=regsavail()) &&
					    (r1>=regsavail()) &&
					    (r2>=regsavail()) ) {

					/* Everything spilled */

					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */\n",
							__LINE__);
					#endif

					/* Save v0 */
					fprintf(Cout,
						"stvx_r2m(0, %s);\n",
						spoolname(regsavail()-1));
					/* Move destination into v0 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 0);\n",
						spoolname(r0));

					/* Save v1 */
					Ctab();
					fprintf(Cout,
						"stvx_r2m(1, %s);\n",
						spoolname(regsavail()-2));
					/* Move source into v1 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 1);\n",
						spoolname(r1));

					/* Save v2 */
					Ctab();
					fprintf(Cout,
						"stvx_r2m(2, %s);\n",
						spoolname(regsavail()-3));
					/* Move index into v2 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 2);\n",
						spoolname(r2));


					/* Apply operation */
					Ctab();
					fprintf(Cout,
						"%s_r2r(0, 1, 2, 0);\n",
						buf);


					/* Move v0 into destination */
					Ctab();
					fprintf(Cout,
						"stvx_r2m(0, %s);\n",
						spoolname(r0));

					/* Restore v0 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 0);\n",
						spoolname(regsavail()-1));

					/* Restore v1 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 1);",
						spoolname(regsavail()-2));

					/* Restore v2 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 2);",
						spoolname(regsavail()-3));
				}
			} else {
				if ( (r1<regsavail()) && (r0<regsavail()) ) {
					/* Neither spilled */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					fprintf(Cout, "%s_r2r(%d, %d);",
						buf, r1, r0);
				} else if ( (r1>=regsavail()) &&
					    (r0<regsavail()) ) {
					/* Source spilled */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */\n",
							__LINE__);
						Ctab();
					#endif

					/* Save v0 */
					fprintf(Cout,
						"stvx_r2m(0, %s);\n",
						spoolname(regsavail()-1));

					/* Move source into v0 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 0);\n",
						spoolname(r1));

					/* Perform operation on v0 and
					   destination */
					Ctab();
					fprintf(Cout,
						"%s_r2r(0, %d);\n",
						 buf,
						 r0);

					/* Restore v0 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 0);",
						spoolname(regsavail()-1));

				} else if ( (r1<regsavail()) &&
					    (r0>=regsavail()) ) {
					/* Destination spilled */
					int savereg = (r1==0)? 1:0;

					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */\n",
							__LINE__);
						Ctab();
					#endif

					/* Save a register */
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						savereg,
						spoolname(regsavail()-1));
					/* Move destination into savereg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						spoolname(r0),
						savereg);
					/* Apply op to source and savereg
					   (as dest) */
					Ctab();
					fprintf(Cout,
						"%s_r2r(%d, %d);\n",
						buf,
						r1,
						savereg);
					/* Move savereg into destination */
					Ctab();
					fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						savereg,
						spoolname(r0));
					/* Restore savereg */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, %d);",
						spoolname(regsavail()-1),
						savereg);
				} else if ( (r1>=regsavail()) &&
					    (r0>=regsavail()) ) {
					/* Source and destination spilled */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */\n",
							__LINE__);
						Ctab();
					#endif

					/* Save v0 */
					fprintf(Cout,
						"stvx_r2m(0, %s);\n",
						spoolname(regsavail()-1));
					/* Move destination into v0 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 0);\n",
						spoolname(r0));

					/* Save v1 */
					Ctab();
					fprintf(Cout,
						"stvx_r2m(1, %s);\n",
						spoolname(regsavail()-2));
					/* Move source into v1 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 1);\n",
						spoolname(r1));

					/* Apply op to v1 (src) and v0 (dest) */
					Ctab();
					fprintf(Cout, "%s_r2r(1, 0);\n", buf);

					/* Move v0 into destination */
					Ctab();
					fprintf(Cout,
						"stvx_r2m(0, %s);\n",
						spoolname(r0));

					/* Restore v0 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 0);\n",
						spoolname(regsavail()-1));

					/* Restore v1 */
					Ctab();
					fprintf(Cout,
						"lvx_m2r(%s, 1);",
						spoolname(regsavail()-2));
				}
			}
		}
	}
	fprintf(Cout, "\n");
}

/* Place "root" and, ahead of it, everything it depends on into sbest[]
   starting at index "pos".

   A root of -1 (no tuple) or one already present in sbest[0..pos-1] adds
   nothing.  Returns the index of the next free entry of sbest[].
*/
static int
storeord1(register int root,
register int pos)
{
	register int slot;
	register int a;

	/* Nothing to place */
	if (root == -1) {
		return(pos);
	}

	/* Already placed by an earlier walk? */
	slot = 0;
	while (slot < pos) {
		if (sbest[slot] == root) {
			return(pos);
		}
		++slot;
	}

	/* Arguments go first, in argument order */
	for (a = 0; a < 3; ++a) {
		pos = storeord1(tup[root].arg[a], pos);
	}

	/* Then the tuple itself */
	sbest[pos] = root;
	return(pos + 1);
}

/* Preschedule the basic block by tracing back from its stores.

   sched[0..schedsp-1] is rearranged in three steps: the store tuples are
   gathered at the front, the stores are ordered by their "before" value
   wherever possible_conflict() permits, and finally each store's
   dependence tree is laid out ahead of it (storeord1).  Registers are NOT
   allocated here.  sbest[] is used as scratch space for the new order.
*/
static void
storeord()
{
	register int i, j, k;
	register int nstores = 0;

	#ifdef DEBUG_SCHED
		fprintf(stderr, "storeord() schedsp=%d\n", schedsp);
	#endif

	/* Gather the stores at the front, keeping their relative order */
	for (i = 0; i < schedsp; ++i) {
		switch (tup[ sched[i] ].op) {
		case STORE:
		case STORER:
		case STORERR:
			k = sched[nstores];
			sched[nstores] = sched[i];
			sched[i] = k;
			++nstores;
			break;
		default:
			break;
		}
	}

	/* Order the stores by increasing "before" value without reordering
	   stores that may be to the same fragment.
	*/
	/* Known limitation: if moving is refused because of a possible
	   conflict between i and some k, but there is no conflict between i
	   and j or between any k and j, then j should still be floated up
	   before i when j has the smaller "before" value.
	*/
	for (i = 0; i < nstores; ++i) {
		for (j = i + 1; j < nstores; ++j) {
			int blocked;		/* j may not pass i at all */
			int no_plain_swap;	/* i may not drop down to j */

			if (tup[ sched[i] ].before <= tup[ sched[j] ].before) {
				continue;
			}

			blocked = possible_conflict(sched[i], sched[j]);
			no_plain_swap = 0;

			/* Check everything lying between i and j */
			k = i + 1;
			while ((!blocked) && (k < j)) {
				blocked = possible_conflict(sched[j], sched[k]);
				if (!no_plain_swap) {
					no_plain_swap =
					    possible_conflict(sched[i],
							      sched[k]) ||
					    blocked;
				}
				++k;
			}

			if (blocked) {
				continue;
			}

			if (no_plain_swap) {
				/* Lift j up to position i and slide i..j-1
				   down by one, keeping their order
					(e.g. 4,5,6,7 -> 7,4,5,6)
				*/
				int lifted = sched[j];

				k = j;
				while (k > i) {
					sched[k] = sched[k-1];
					--k;
				}
				sched[i] = lifted;
			} else {
				/* A plain exchange of i and j is enough */
				k = sched[j];
				sched[j] = sched[i];
				sched[i] = k;
			}
		}
	}

	/* Lay out the tree under each store, in store order */
	j = 0;
	i = 0;
	while (i < nstores) {
		j = storeord1(sched[i], j);
		++i;
	}

	/* Every scheduled tuple must have been reached from some store */
	if (j != schedsp) {
		bug("basic block prescheduler failed");
	}

	/* Adopt the new order */
	i = 0;
	while (i < schedsp) {
		sched[i] = sbest[i];
		++i;
	}
}

#if (defined DEBUG_SEARCH || defined DEBUG_FSCHED || defined DEBUG_TUPLIST)
/* Debugging aid: print every tuple tup[0..tupsp-1] to "strm", three
   fprintf() calls per tuple, under a heading that includes "msg".
*/
static void
dump_tuples(FILE *strm,
char *msg)
{
	int i;
	tuple *t;

	fprintf (strm, "Dumping tuple list %s:\n", msg);

	for (i = 0, t = tup; i < tupsp; ++i, ++t) {
		/* The output is deliberately produced by several calls:
		   the gcc on the original author's laptop mishandled 64-bit
		   function arguments and corrupted whatever followed
		   t->immed.  Later versions of gcc have probably fixed it.
		*/

		/* Identity, arguments, symbol and immediate */
		fprintf(strm,
			" %4d:%s args(%d,%d,%d) \"%s\" [%llu]%+d (%llu)\n",
			i, opname(t->op),
			t->arg[0], t->arg[1], t->arg[2],
			((t->symbol)? t->symbol->text : "(null)"),
			tup[t->fragment].immed.uq[0],
			t->offset,
			t->immed.uq[0]);

		/* Scheduling information */
		fprintf(strm,
			"      ad=%d rfs=%d,%d dpth=%d bfr/aft=%d,%d",
			t->antidep,
			t->refs, t->trefs,
			t->depth,
			t->before, t->after);

		/* Register and spill information */
		fprintf(strm,
			" reg=%d,%d m2r=%d spld=%d splrg=%d\n",
			t->reg, t->oreg,
			t->m2rmode,
			t->spilled, t->spillreg);
	}
}
#endif

/* Debugging aid: print the current schedule sched[0..schedsp-1] to "strm"
   under a heading that includes "msg".  Each entry shows its position in
   the schedule, the tuple number it refers to, and that tuple's fields.
*/
void dump_sched(FILE *strm, char *msg)
{
	int pos = 0;

	fprintf (strm, "Dumping schedule %s:\n", msg);

	while (pos < schedsp) {
		tuple *t = &(tup[sched[pos]]);

		/* Position, tuple number, arguments, symbol and immediate */
		fprintf(strm,
			" %4d[%d]:%s args(%d,%d,%d) \"%s\" [%llu]%+d (%llu)\n",
			pos, sched[pos], opname(t->op),
			t->arg[0], t->arg[1], t->arg[2],
			((t->symbol)? t->symbol->text : "(null)"),
			tup[t->fragment].immed.uq[0],
			t->offset,
			t->immed.uq[0]);

		/* Scheduling information */
		fprintf(strm,
			"     ad=%d rfs/t=%d,%d dpth=%d bfr/aft=%d,%d",
			t->antidep,
			t->refs, t->trefs,
			t->depth,
			t->before, t->after);

		/* Register and spill information */
		fprintf(strm,
			" reg/o=%d,%d m2r=%d spld=%d splrg=%d\n",
			t->reg, t->oreg,
			t->m2rmode,
			t->spilled, t->spillreg);

		++pos;
	}
}

#ifdef NEWSTUFF
/* Next register number to hand out; reset to 0 by sched_simple(). */
static int tos;

/* Assign a register number to tuple "i", visiting its arguments first.

   Tuples with refs < 1, or whose reg is already nonzero, are left alone.
   A result reuses the first argument's register when that argument has
   exactly one reference; otherwise it takes the next number from tos.
   Ops not listed below are reported through error().

   NOTE(review): tos starts at 0 and reg != 0 is what marks a tuple as
   done, so a tuple that was given register 0 would be processed again if
   it is visited a second time - confirm this is intended.
*/
void sched_node(int i)
{
	#ifdef DEBUG_SEARCH
		fprintf(stderr, "Start sched_node(%d)\n", i);
		fprintf(stderr, "tup[%d].refs=%d, tup[%d].reg=%d\n",
				i, tup[i].refs, i, tup[i].reg);
		fflush(stderr);
	#endif

	/* Only live tuples that have no register yet are of interest */
	if (!((tup[i].refs >= 1) && (tup[i].reg == 0))) {
		return;
	}

	switch (tup[i].op) {

	/* Ops without arguments whose result needs a register */
	case NUM:
	case LOAD:
	case LVSL:
		tup[i].reg = tos;
		++tos;
		break;

	/* One argument, no register needed for a result */
	case STORE:
		sched_node(tup[i].arg[0]);
		break;

	/* One argument, register needed for the result */
	case I2F:
	case F2I:
	case LNOT:
	case NEG:
	case NOT:
	case RCP:
	case UNPACKH:
	case UNPACKL:
		{
			int a0 = tup[i].arg[0];

			sched_node(a0);
			if (tup[a0].refs == 1) {
				tup[i].reg = tup[a0].reg;
			} else {
				tup[i].reg = tos;
				++tos;
			}
		}
		break;

	/* Two arguments */
	case ADD:
	case ADDH:
	case AND:
	case ANDN:
	case AVG:
	case DIV:
	case EQ:
	case EQ_C:
	case GE:
	case GT:
	case GT_C:
	case MOD:
	case MUL:
	case MULH:
	case MULEVEN:
	case MULODD:
	case OR:
	case SHL:
	case SHLBIT:
	case SHLBYTE:
	case SHR:
	case SHRBIT:
	case SHRBYTE:
	case SUB:
	case XOR:
	case PACK:
	case PACKS2U:
	case INTRLVLOW:
	case INTRLVHIGH:
	case INTRLVEVEN:
	case INTRLVODD:
	case PERM:
	case MAX:
	case MIN:
	case RCP1:
	case RCP2:
		{
			int a0 = tup[i].arg[0];
			int a1 = tup[i].arg[1];

			/* The second argument is visited first when the
			   first argument has no second argument of its own.
			*/
			if (tup[a0].arg[1] == -1) {
				sched_node(a1);
				sched_node(a0);
			} else {
				sched_node(a0);
				sched_node(a1);
			}

			if (tup[a0].refs == 1) {
				tup[i].reg = tup[a0].reg;
			} else {
				tup[i].reg = tos;
				++tos;
			}
		}
		break;

	default:
		{
			char buf[256];
			sprintf(buf,
				"sched_node(%s) defaulted",
				opname(tup[i].op));
			error(buf);
		}
	}

	#ifdef DEBUG_SEARCH
		fprintf(stderr, "End sched_node(%d)\n", i);
		fflush(stderr);
	#endif
}

/*	sched_simple()

	One-shot scheduler: indexes the live tuples, orders them with
	storeord(), numbers registers with sched_node(), sums the
	per-slot costs into schedval and hands the result to sched_eval().

	Returns sbestval as left by sched_eval().
*/
static int
sched_simple(void)
{
	int t;		/* tuple index */
	int slot;	/* schedule position */

	#ifdef DEBUG_SEARCH
		fprintf(stderr, "Start sched_simple()\n");
		fprintf(stderr, "schedsp=%d, tupsp=%d\n", schedsp, tupsp);
		fflush(stderr);
	#endif

	/* Permit as many spills as the target allows */
	numspills = maxspills();

	#ifdef DEBUG_SEARCH
		dump_tuples(stderr, "before storeord(simple)");
		fprintf(stderr, "schedsp=%d, tupsp=%d\n", schedsp, tupsp);
		fflush(stderr);
	#endif

	/* Every referenced tuple starts out unspilled and gets a slot
	   in the schedule, in tuple order */
	schedsp = 0;
	for (t = 0; t < tupsp; ++t) {
		if (tup[t].refs <= 0) continue;

		tup[t].spilled = -1;
		sched[schedsp] = t;
		++schedsp;
	}

	storeord();

	#ifdef DEBUG_SEARCH
		dump_sched(stderr, "after storeord(simple)");
		fprintf(stderr, "schedsp=%d, tupsp=%d\n", schedsp, tupsp);
		fflush(stderr);
	#endif

	/* Nothing found yet; start costing from zero */
	sbestval = NOTFOUND;
	schedval = 0;

	/* Number the registers, starting from register 0 */
	tos = 0;
	t = 0;
	while (t < tupsp) {
		sched_node(t);
		++t;
	}

	/* Total up the cost of the schedule and evaluate it */
	for (slot = 0; slot < schedsp; ++slot) {
		schedval += sched_cost(slot);
	}
	sched_eval();

	#ifdef DEBUG_SEARCH
		dump_sched(stderr, "after sched_simple()");
		fprintf(stderr, "sbestval = %d\n", sbestval);
		fflush(stderr);
	#endif

	return sbestval;
}
#endif

/*	refcounts()

	Recompute reference counts for the tuples of the current block.
	A store tuple (STORE, STORER, STORERR) whose count is already
	positive passes a reference to its operand(s) through incref();
	every other tuple has its count cleared to zero.
*/
static void
refcounts(void)
{
	int t;

	for (t = 0; t < tupsp; ++t) {
		int operands;	/* operands a live store refers to */

		switch (tup[t].op) {
		case STORE:
			operands = 1;
			break;
		case STORER:
		case STORERR:
			operands = 2;
			break;
		default:
			operands = 0;
			break;
		}

		/* Non-stores, and stores that are not live, count as zero */
		if ((operands == 0) || (tup[t].refs <= 0)) {
			tup[t].refs = 0;
			continue;
		}

		incref(tup[t].arg[0]);
		if (operands == 2) {
			incref(tup[t].arg[1]);
		}
	}
}


/* Report a progress message: level 0 when DEBUG_SEARCH is defined,
   level 2 otherwise. */
static void
sched_search_note(char *msg)
{
	#ifdef DEBUG_SEARCH
		info(0, msg);
	#else
		info(2, msg);
	#endif
}

/* Mark every live tuple as unspilled and rebuild sched[] so that it
   lists the live tuples in tuple order. */
static void
sched_search_index(void)
{
	register int i;

	schedsp = 0;
	for (i=0; i<tupsp; ++i) {
		if (tup[i].refs > 0) {
			tup[i].spilled = -1;
			sched[schedsp++] = i;
		}
	}
}

/* Reset the before_bb() cache and recount, for every live tuple, how
   many other tuples must come before it and how many after it. */
static void
sched_search_order(void)
{
	register int i, j;

	before_bb(0,0,1);	/* Just inits the static cache */
	for (i=0; i<tupsp; ++i) {
		if (tup[i].refs > 0) {
			tup[i].before = 0;
			tup[i].after = 0;
			for (j=0; j<tupsp; ++j) if (i != j) {
				if (before_bb(j, i, 0)) {
					++(tup[i].before);
				} else if (before_bb(i, j, 0)) {
					++(tup[i].after);
				}
			}
		}
	}
}

/* Run the permutation search from scratch with a fresh time limit. */
static void
sched_search_run(void)
{
	sbestval = NOTFOUND;
	schedtim = (time(0) + opttime);
	schedval = 0;
	sched_perm(0);
}

/* After operands have been absorbed into instructions: recompute the
   reference counts, the before/after fields and the schedule index,
   then search again. */
static void
sched_search_retry(void)
{
	#ifdef DEBUG_SEARCH
		info(0, "out of first loop");
	#endif

	/* Set reference counts */
	refcounts();

	/* Now do before and after fields */
	sched_search_order();
	#ifdef DEBUG_SEARCH
		info(0, "done setting before and after fields");
	#endif

	/* Make schedule index */
	sched_search_index();
	#ifdef DEBUG_SEARCH
		info(0, "done making schedule index");
		dump_sched(stderr, "before sched_perm()");
		fprintf(stderr, "sbestval = %d\n", NOTFOUND);
		info(0, "calling sched_perm...");
	#endif

	/* Search for a good schedule */
	sched_search_run();
	#ifdef DEBUG_SEARCH
		info(0, "...back from sched_perm");
		dump_sched(stderr, "after sched_perm()");
		fprintf(stderr, "sbestval = %d\n", sbestval);
	#endif
}

/* Nonzero if tuple i is live and is not one of the store ops. */
static int
sched_search_live_nonstore(int i)
{
	return ((tup[i].refs > 0) &&
		(tup[i].op != STORE) &&
		(tup[i].op != STORER) &&
		(tup[i].op != STORERR));
}

/* Take constant (NUM) second operands of SHL, SHR and PERM out of the
   operand list and record them in m2rmode instead. */
static void
sched_search_i2r_shifts(void)
{
	register int i;

	for (i=0; i<tupsp; ++i) {
		register int j;

		if (!sched_search_live_nonstore(i)) continue;

		j = tup[i].arg[1];
		if (j == -1) continue;

		if (((tup[i].op == SHL) ||
		     (tup[i].op == SHR) ||
		     (tup[i].op == PERM)) &&
		    (tup[j].op == NUM)) {
			tup[i].m2rmode = j;
			--(tup[j].refs);
			tup[i].arg[1] = -1;
		}
	}
}

/* Make constant (NUM) first operands of live REPL tuples immediates:
   the constant moves to m2rmode and the second operand becomes the
   first. */
static void
sched_search_i2r_repl(void)
{
	register int i;

	for (i=0; i<tupsp; ++i) {
		if ((tup[i].refs > 0) &&
		    (tup[i].op == REPL)) {
			register int j = tup[i].arg[0];

			if (tup[j].op == NUM) {
				tup[i].m2rmode = j;
				--(tup[j].refs);
				tup[i].arg[0] = tup[i].arg[1];
				tup[i].arg[1] = -1;
			}
		}
	}
}

/* Nonzero if tuple j is an operand kind selected for absorption. */
static int
sched_search_absorbable(int j,
int absorb_num,
int absorb_load)
{
	return ((absorb_num && (tup[j].op == NUM)) ||
		(absorb_load && (tup[j].op == LOAD)));
}

/* Absorb NUM and/or LOAD operands of live non-store tuples into the
   instruction (m2rmode).  The second operand is tried first; failing
   that, the first operand is absorbed when the op is not ordered(),
   and the remaining operand is moved into arg[0].

   absorb_num	nonzero to absorb NUM operands
   absorb_load	nonzero to absorb LOAD operands
   single_only	nonzero to absorb only operands with exactly one ref
*/
static void
sched_search_absorb(int absorb_num,
int absorb_load,
int single_only)
{
	register int i;

	for (i=0; i<tupsp; ++i) {
		register int j;

		if (!sched_search_live_nonstore(i)) continue;

		/* Tuples without a second operand are left alone */
		j = tup[i].arg[1];
		if (j == -1) continue;

		/* Second operand */
		if ((!single_only || (tup[j].refs == 1)) &&
		    sched_search_absorbable(j, absorb_num, absorb_load)) {
			tup[i].m2rmode = j;
			--(tup[j].refs);
			tup[i].arg[1] = -1;
		}

		/* First operand: only if nothing has been absorbed yet
		   and the operands may be swapped */
		j = tup[i].arg[0];
		if ((j != -1) &&
		    (tup[i].m2rmode == -1) &&
		    (!single_only || (tup[j].refs == 1)) &&
		    (!ordered(tup[i].op)) &&
		    sched_search_absorbable(j, absorb_num, absorb_load)) {
			/* swap args and absorb constant */
			tup[i].m2rmode = j;
			tup[j].refs = 0;
			tup[i].arg[0] = tup[i].arg[1];
			tup[i].arg[1] = -1;
		}
	}
}

/*	sched_search(allowed)

	Search for a schedule and register allocation for the current
	block, trying progressively more aggressive operand forms.

	allowed		number of spills the search may use (stored in
			numspills)

	Stages, in order:
	  1. storeord() initial schedule, then sched_perm().
	  2. Non-RISC targets: always redo the search with constant
	     SHL/SHR/PERM operands absorbed (i2r).
	  3. AltiVec targets: always redo the search with constant REPL
	     operands absorbed.
	  4-6. Non-RISC targets, only while nothing has been found:
	     absorb singly-referenced NUM/LOAD operands, then all NUM
	     operands, then all LOAD operands (m2r).

	The absorption stages modify the tuples in place (m2rmode, arg[]
	and refs) and these changes are not undone on return.

	Returns sbestval: NOTFOUND if no stage produced a schedule.

	With DEBUG_SEARCH defined, every retry emits the same trace
	messages.
*/
static int
sched_search(int allowed)
{
	#ifdef DEBUG_SEARCH
	{
		char buf[256];
		snprintf(buf, sizeof buf, "Starting sched_search(%d)", allowed);
		info(0, buf);
	}
	#endif

	/* Set the number of allowed spills */
	numspills = allowed;

	/* Mark tuples as unspilled, and make schedule index */
	sched_search_index();
	#ifdef DEBUG_SEARCH
		dump_tuples(stderr, "before storeord()");
	#endif

	/* Create "initial schedule" that will help
	   the search find better schedules quicker...
	   This schedule will not have register allocation.
	*/
	sched_search_note("creating initial schedule");
	storeord();
	#ifdef DEBUG_SEARCH
		dump_sched(stderr, "after storeord()");
	#endif

	/* Search for a good schedule and register allocation */
	sched_search_run();
	#ifdef DEBUG_SEARCH
		dump_sched(stderr, "after sched_perm()");
		fprintf(stderr, "sbestval = %d\n", sbestval);
	#endif

	/* Force use of non-RISC addressing forms (i.e. operands can be in
	   memory) for non-RISC CPUs: the result of the first search is
	   discarded and the search is repeated using i2r shifts and
	   permutations
	*/
	if (!(optcpu & CPU_RISC)) {
		#ifdef DEBUG_SEARCH
			fprintf(stderr,
				"Forcing non-RISC addressing: sbestval=%d\n",
				sbestval);
		#endif
		sbestval = NOTFOUND;

		sched_search_note("using i2r for shifts and permutes to free "
				  "registers");
		sched_search_i2r_shifts();
		sched_search_retry();
	}

	/* Force use of i2r replicates for AltiVec: again the previous
	   result is discarded and the search is repeated */
	if (optcpu & CPU_AltiVec) {
		#ifdef DEBUG_SEARCH
			fprintf(stderr,
				"Forcing i2r replicates: sbestval=%d\n",
				sbestval);
		#endif
		sbestval = NOTFOUND;

		sched_search_note("using i2r for replicates to free registers");
		sched_search_i2r_repl();
		sched_search_retry();
	}

	/* Note: Do NOT use m2r modes for RISC CPUs */
	if ((sbestval == NOTFOUND) && !(optcpu & CPU_RISC)) {
		/* Try again, this time using m2r for single-refs */
		sched_search_note("using m2r for single refs to free registers");
		sched_search_absorb(1, 1, 1);
		sched_search_retry();
	}

	if ((sbestval == NOTFOUND) && !(optcpu & CPU_RISC)) {
		/* Try again, this time using m2r constants */
		sched_search_note("using m2r constants to free registers");
		sched_search_absorb(1, 0, 0);
		sched_search_retry();
	}

	if ((sbestval == NOTFOUND) && !(optcpu & CPU_RISC)) {
		/* Try again, this time using m2r loads */
		sched_search_note("using m2r loads to free registers");
		sched_search_absorb(0, 1, 0);
		sched_search_retry();
	}

	return sbestval;
}

/*	output_schedule()

	Write target code for the final schedule to Cout.

	Walks sched[0..schedsp-1] in order.  LOAD, LVSL, STORE, STORER,
	STORERR and NUM tuples are emitted here, with one branch per
	target selected by optcpu; every other opcode is handed to
	mmxop(), maxop(), altivecop() or ia32op().

	A register number >= regsavail() is treated as a spill-pool slot
	and is named with spoolname().  In that case a real register is
	first saved to spool slot regsavail()-1, used as scratch, and
	restored afterwards.

	Side effect: tup[s].reg is set from tup[s].oreg for LOAD, LVSL,
	NUM and default-case tuples as they are emitted.
*/
void
output_schedule(void)
{
	register int i;

		for (i=0; i<schedsp; ++i) {
			register int s = sched[i];

			/* argregs() is given the schedule position i (not
			   the tuple index s); a nonzero result is reported
			   as an internal error */
			if (argregs(i)) {
				bug("scheduled register allocation was not "
				    "valid (this cannot happen?)");
			}

			/* write final tuple order to file */
			switch (tup[s].op) {
			case LOAD:
				/* Load the tuple's symbol into its register */
				Ctab();
				if (optcpu & CPU_MMX) {
					if (tup[s].oreg < regsavail()) {
						/* Didn't spill */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						fprintf(Cout,
						"movq_m2r(%s, mm%d);\n",
						symname(tup[s].symbol,
							tup[s].fragment,
							tup[s].offset),
						(tup[s].reg = tup[s].oreg));
					} else {
						/* Destination spilled */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						/* Save mm0 */
						fprintf(Cout,
						    "movq_r2m(mm0, %s);\n",
						    spoolname(regsavail()-1));
						/* Move source into r0 */
						Ctab();
						fprintf(Cout,
						"movq_m2r(%s, mm0);\n",
						symname(tup[s].symbol,
							tup[s].fragment,
							tup[s].offset));

						/* Move mm0 into destination */
						Ctab();
						fprintf(Cout,
							"movq_r2m(mm0, %s);\n",
							spoolname (tup[s].reg =
							   tup[s].oreg));

						/* Restore mm0 */
						Ctab();
						fprintf(Cout,
						    "movq_m2r(%s, mm0);\n",
						    spoolname(regsavail()-1));
					}
				} else if (optcpu & CPU_MAX) {
					if (tup[s].oreg < regsavail()) {
						/* Didn't spill */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						fprintf(Cout,
						"movq_m2r(%s, r%d);\n",
						symname(tup[s].symbol,
							tup[s].fragment,
							tup[s].offset),
						(tup[s].reg = tup[s].oreg));
					} else {
						/* Destination spilled */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						/* Save r0 */
						fprintf(Cout,
						    "movq_r2m(r0, %s);\n",
						    spoolname(regsavail()-1));
						/* Move source into r0 */
						Ctab();
						fprintf(Cout,
							"movq_m2r(%s, r0);\n",
							symname(tup[s].symbol,
								tup[s].fragment,
								tup[s].offset));

						/* Move r0 into destination */
						Ctab();
						fprintf(Cout,
							"movq_r2m(r0, %s);\n",
							spoolname (tup[s].reg =
							   tup[s].oreg));

						/* Restore r0 */
						Ctab();
						fprintf(Cout,
						    "movq_m2r(%s, r0);\n",
						    spoolname(regsavail()-1));
					}
				} else if (optcpu == GenericIA32) {
					/* No spill handling in this branch */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					fprintf(Cout,
						"movl_m2r(%s, %s);\n",
						symname(tup[s].symbol,
							tup[s].fragment,
							tup[s].offset),
						ia32regname(tup[s].reg =
								tup[s].oreg));
				} else if (optcpu & CPU_AltiVec) {
					if (tup[s].oreg < regsavail()) {
						/* Didn't spill */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
/* NOELLOADS is defined right here, so the plain lvx form below is the
   one that is compiled; the element-typed form is kept in the #else. */
#define NOELLOADS
#ifdef NOELLOADS
						fprintf(Cout,
							"lvx_m2r(%s, %d);\n",
							symname(tup[s].symbol,
								tup[s].fragment,
								tup[s].offset),
							(tup[s].reg =
								tup[s].oreg));
#else
						fprintf(Cout,
						"lv%sx_m2r(%s, %d);\n",
						((tup[s].symbol->type.attr &
						  TYP_LLONG)? "ed" :

						  ((tup[s].symbol->type.attr &
						    TYP_LONG)? "ew" :
						    ((tup[s].symbol->type.attr &
						      TYP_INT)? "ew" :
						      ((tup[s].symbol->type.attr &
						        TYP_SHORT)? "eh" :
						        ((tup[s].symbol->type.attr &
						          TYP_CHAR)? "eb" : ""
						        )
						      )
						    )
						  )
						),
						symname(tup[s].symbol,
							tup[s].fragment,
							tup[s].offset),
						(tup[s].reg = tup[s].oreg));
#endif
					} else {
						/* Destination spilled */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						/* Save v0 */
						fprintf(Cout,
						    "stvx_r2m(0, %s);\n",
						    spoolname(regsavail()-1));
						/* Move source into v0 */
						Ctab();
						fprintf(Cout,
							"lvx_m2r(%s, 0);\n",
							symname(tup[s].symbol,
								tup[s].fragment,
								tup[s].offset));

						/* Move v0 into destination */
						Ctab();
						fprintf(Cout,
							"stvx_r2m(0, %s);\n",
							spoolname (tup[s].reg =
							   tup[s].oreg));

						/* Restore v0 */
						Ctab();
						fprintf(Cout,
						    "lvx_m2r(%s, 0);\n",
						    spoolname(regsavail()-1));
					}
				} else {
					bug("Need to support target in LOAD"
					    " case of output_schedule()");
				}
				break;
			case LVSL:
				/* Only the AltiVec target is handled */
				Ctab();
				if (optcpu & CPU_AltiVec) {
					if (tup[s].oreg < regsavail()) {
						/* Didn't spill */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						fprintf(Cout,
							"lvsl_m2r(%s, %d);\n",
							symname(tup[s].symbol,
								tup[s].fragment,
								tup[s].offset),
							(tup[s].reg =
								tup[s].oreg));
					} else {
						/* Destination spilled */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						/* Save v0 */
						fprintf(Cout,
						    "stvx_r2m(0, %s);\n",
						    spoolname(regsavail()-1));

						/* Move source into v0 */
						Ctab();
						fprintf(Cout,
							"lvsl_m2r(%s, 0);\n",
							symname(tup[s].symbol,
								tup[s].fragment,
								tup[s].offset));

						/* Move v0 into destination */
						Ctab();
						fprintf(Cout,
							"stvx_r2m(0, %s);\n",
							spoolname (tup[s].reg =
							   tup[s].oreg));

						/* Restore v0 */
						Ctab();
						fprintf(Cout,
						    "lvx_m2r(%s, 0);\n",
						    spoolname(regsavail()-1));
					}
				} else {
					bug("Need to support target in LVSL"
					    " case of output_schedule()");
				}
				break;
			case STORE:
				/* Store the register of arg[0] to the
				   tuple's symbol */
				Ctab();
				if (optcpu & CPU_MMX) {
					if (tup[tup[s].arg[0]].reg<regsavail())
					{
						/* Didn't spill */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						fprintf(Cout,
							"movq_r2m(mm%d, %s);\n",
							tup[tup[s].arg[0]].reg,
							symname(tup[s].symbol,
								tup[s].fragment,
								tup[s].offset));
					} else {
						/* Source spilled */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						/* Save mm0 */
						fprintf(Cout,
						    "movq_r2m(mm0, %s);\n",
						    spoolname(regsavail()-1));
						/* Move source into mm0 */
						Ctab();
						fprintf(Cout,
						    "movq_m2r(%s, mm0);\n",
						    spoolname(
						       tup[tup[s].arg[0]].reg));

						/* Move mm0 into destination */
						Ctab();
						fprintf(Cout,
							"movq_r2m(mm0, %s);\n",
							symname(tup[s].symbol,
								tup[s].fragment,
								tup[s].offset));

						/* Restore mm0 */
						Ctab();
						fprintf(Cout,
						    "movq_m2r(%s, mm0);\n",
						    spoolname(regsavail()-1));
					}
				} else if (optcpu & CPU_MAX) {
					if (tup[tup[s].arg[0]].reg<regsavail())
					{
						/* Didn't spill */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						fprintf(Cout,
							"movq_r2m(r%d, %s);\n",
							tup[tup[s].arg[0]].reg,
							symname(tup[s].symbol,
								tup[s].fragment,
								tup[s].offset));
					} else {
						/* Source spilled */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						/* Save r0 */
						fprintf(Cout,
						    "movq_r2m(r0, %s);\n",
						    spoolname(regsavail()-1));
						/* Move source into r0 */
						Ctab();
						fprintf(Cout,
						    "movq_m2r(%s, r0);\n",
						    spoolname(
						       tup[tup[s].arg[0]].reg));

						/* Move r0 into destination */
						Ctab();
						fprintf(Cout,
							"movq_r2m(r0, %s);\n",
							symname(tup[s].symbol,
								tup[s].fragment,
								tup[s].offset));

						/* Restore r0 */
						Ctab();
						fprintf(Cout,
						    "movq_m2r(%s, r0);\n",
						    spoolname(regsavail()-1));
					}
				} else if (optcpu == GenericIA32) {
					/* No spill handling in this branch */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					fprintf(Cout,
						"movl_r2m(%s, %s);\n",
						ia32regname(
							tup[tup[s].arg[0]].reg),
						symname(tup[s].symbol,
							tup[s].fragment,
							tup[s].offset));
				} else if (optcpu & CPU_AltiVec) {
					if (tup[tup[s].arg[0]].reg<regsavail())
					{
						/* Didn't spill */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
/* ELSTORES_CALLTYPE is defined right here, so (unless NOELSTORES or
   ELSTORES_SYMTYPE is defined elsewhere) the last variant below, which
   picks the element suffix from tup[s].type.bits, is the one compiled. */
#define ELSTORES_CALLTYPE
#ifdef NOELSTORES
						fprintf(Cout,
						"stvx_r2m(%d, %s);\n",
						tup[ tup[s].arg[0] ].reg,
						symname(tup[s].symbol,
							tup[s].fragment,
							tup[s].offset));
#elif defined ELSTORES_SYMTYPE
						fprintf(Cout,
						"stv%sx_r2m(%d, %s);\n",
						((tup[s].symbol->type.attr &
						  TYP_LLONG)? "ed" :

						  ((tup[s].symbol->type.attr &
						    TYP_LONG)? "ew" :
						    ((tup[s].symbol->type.attr &
						      TYP_INT)? "ew" :
						      ((tup[s].symbol->type.attr &
						        TYP_SHORT)? "eh" :
						        ((tup[s].symbol->type.attr &
						          TYP_CHAR)? "eb" : ""
						        )
						      )
						    )
						  )
						),
						tup[ tup[s].arg[0] ].reg,
						symname(tup[s].symbol,
							tup[s].fragment,
							tup[s].offset));
#elif defined ELSTORES_CALLTYPE
						fprintf(Cout,
						"stv%sx_r2m(%d, %s);\n",
						((tup[s].type.bits == 64)?
							"ed" : 
						  ((tup[s].type.bits == 32)?
							"ew" :
						    ((tup[s].type.bits == 16)?
							"eh" :
						      ((tup[s].type.bits == 8)?
							"eb" : ""
						      )
						    )
						  )
						),
						tup[ tup[s].arg[0] ].reg,
						symname(tup[s].symbol,
							tup[s].fragment,
							tup[s].offset));
#endif
					} else {
						/* Source spilled */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						/* Save v0 */
						fprintf(Cout,
						    "stvx_r2m(0, %s);\n",
						    spoolname(regsavail()-1));
						/* Move source into v0 */
						Ctab();
						fprintf(Cout,
						    "lvx_m2r(%s, 0);\n",
						    spoolname(
						       tup[tup[s].arg[0]].reg));

						/* Store v0 into destination */
						Ctab();
						fprintf(Cout,
							"stvx_r2m(0, %s);\n",
							symname(tup[s].symbol,
								tup[s].fragment,
								tup[s].offset));

						/* Restore v0 */
						Ctab();
						fprintf(Cout,
						    "lvx_m2r(%s, 0);\n",
						    spoolname(regsavail()-1));
					}
				} else {
					bug("Need to support target in STORE"
					    " case of output_schedule()");
				}
				break;
			case STORER:
				/* Store through a register: arg[0] is the
				   address (EA), arg[1] is the data.  The
				   scratch register used when the EA is
				   spilled is 1 if the data sits in
				   register 0, otherwise 0. */
				Ctab();
				if (optcpu & CPU_MMX) {
					if ((tup[tup[s].arg[0]].reg<regsavail())
					&& (tup[tup[s].arg[1]].reg<regsavail()))
					{
						/* Didn't spill */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						fprintf(Cout,
						"movq_r2x(mm%d, mm%d);\n",
						tup[ tup[s].arg[1] ].reg,
						tup[ tup[s].arg[0] ].reg);
					} else if
					  ((tup[tup[s].arg[0]].reg>=regsavail())
					&& (tup[tup[s].arg[1]].reg<regsavail()))
					{
						/* EA spilled */
						int savereg =
						  (tup[tup[s].arg[1]].reg==0)?
						  1:0;

						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif

						/* Save savereg */
						fprintf(Cout,
						    "movq_r2m(mm%d, %s);\n",
						    savereg,
						    spoolname(regsavail()-1));

						/* Move EA into savereg */
						Ctab();
						fprintf(Cout,
						    "movq_m2r(%s, mm%d);\n",
						    spoolname(
						       tup[tup[s].arg[0]].reg),
						    savereg);

						/* Move data (arg1) to
						   (savereg) */
						Ctab();
						fprintf(Cout,
						    "movq_r2x(mm%d, mm%d);\n",
						    tup[tup[s].arg[1]].reg,
						    savereg);

						/* Restore savereg */
						Ctab();
						fprintf(Cout,
						    "movq_m2r(%s, mm%d);\n",
						    spoolname(regsavail()-1),
						    savereg);
					} else if
					  ((tup[tup[s].arg[0]].reg<regsavail())
					&&(tup[tup[s].arg[1]].reg>=regsavail()))
					{
						/* Data spilled */
						/* Use the m2x form */
						fprintf(Cout,
						    "movq_m2x(%s, mm%d);\n",
						    spoolname(
							tup[tup[s].arg[1]].reg),
						    tup[ tup[s].arg[0] ].reg);
					} else if
					  ((tup[tup[s].arg[0]].reg>=regsavail())
					&&(tup[tup[s].arg[1]].reg>=regsavail()))
					{
						/* Both spilled */
						/* Spill EA then use m2x form */

						int savereg =
						  (tup[tup[s].arg[1]].reg==0)?
						  1:0;

						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif

						/* Save savereg */
						fprintf(Cout,
						    "movq_r2m(mm%d, %s);\n",
						    savereg,
						    spoolname(regsavail()-1));

						/* Move EA into savereg */
						Ctab();
						fprintf(Cout,
						    "movq_m2r(%s, mm%d);\n",
						    spoolname(
						       tup[tup[s].arg[0]].reg),
						    savereg);

						/* Move data (arg1) to
						   (savereg) */
						Ctab();
						fprintf(Cout,
						    "movq_m2x(%s, mm%d);\n",
						    spoolname(
							tup[tup[s].arg[1]].reg),
						    savereg);

						/* Restore savereg */
						Ctab();
						fprintf(Cout,
						    "movq_m2r(%s, mm%d);\n",
						    spoolname(regsavail()-1),
						    savereg);
					}
				} else if (optcpu & CPU_MAX) {
					/* NOTE(review): unlike the other
					   targets, this branch never looks at
					   arg[1]; it stores arg[0] to the
					   tuple's symbol with movqr_* macros,
					   mirroring the STORE case -- confirm
					   this is intended. */
					if (tup[tup[s].arg[0]].reg<regsavail())
					{
						/* Didn't spill */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif

						fprintf(Cout,
							"movqr_r2m(r%d, %s);\n",
							tup[tup[s].arg[0]].reg,
							symname(tup[s].symbol,
								tup[s].fragment,
								tup[s].offset));
					} else {
						/* Source spilled */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						/* Save r0 */
						fprintf(Cout,
						    "movqr_r2m(r0, %s);\n",
						    spoolname(regsavail()-1));
						/* Move source into r0 */
						Ctab();
						fprintf(Cout,
						    "movqr_m2r(%s, r0);\n",
						    spoolname(
						       tup[tup[s].arg[0]].reg));

						/* Move r0 into destination */
						Ctab();
						fprintf(Cout,
							"movqr_r2m(r0, %s);\n",
							symname(tup[s].symbol,
								tup[s].fragment,
								tup[s].offset));

						/* Restore r0 */
						Ctab();
						fprintf(Cout,
						    "movqr_m2r(%s, r0);\n",
						    spoolname(regsavail()-1));
					}
				} else if (optcpu == GenericIA32) {
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					/* The two register names are printed
					   by separate fprintf() calls;
					   presumably ia32regname() reuses one
					   result buffer -- TODO confirm before
					   merging them. */
					fprintf(Cout,
					  "movl_r2x(%s, ",
					  ia32regname(tup[tup[s].arg[1]].reg));
					fprintf(Cout,
					  "%s);\n",
					  ia32regname(tup[tup[s].arg[0]].reg));
				} else if (optcpu & CPU_AltiVec) {
					/* This path is not expected to run
					   (see the message); code is emitted
					   anyway */
					info(0, "Shouldn't be here: STORER");
					if ((tup[tup[s].arg[0]].reg<regsavail())
					&& (tup[tup[s].arg[1]].reg<regsavail()))
					{
						/* Didn't spill */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						fprintf(Cout,
						"stvx_r2x(%d, %d);\n",
						tup[ tup[s].arg[1] ].reg,
						tup[ tup[s].arg[0] ].reg);
					} else if
					  ((tup[tup[s].arg[0]].reg>=regsavail())
					&& (tup[tup[s].arg[1]].reg<regsavail()))
					{
						/* EA spilled */
						int savereg =
						  (tup[tup[s].arg[1]].reg==0)?
						  1:0;

						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						/* Save savereg */
						fprintf(Cout,
						    "stvx_r2m(%d, %s);\n",
						    savereg,
						    spoolname(regsavail()-1));

						/* Move EA into savereg */
						Ctab();
						fprintf(Cout,
						    "lvx_m2r(%s, %d);\n",
						    spoolname(
						       tup[tup[s].arg[0]].reg),
						    savereg);

						/* Move data (arg1) to
						   (savereg) */
						Ctab();
						fprintf(Cout,
						    "stvx_r2x(%d, %d);\n",
						    tup[tup[s].arg[1]].reg,
						    savereg);

						/* Restore savereg */
						Ctab();
						fprintf(Cout,
						    "lvx_m2r(%s, %d);\n",
						    spoolname(regsavail()-1),
						    savereg);
					} else if
					  ((tup[tup[s].arg[0]].reg<regsavail())
					&&(tup[tup[s].arg[1]].reg>=regsavail()))
					{
						/* Data spilled */
						/* Use the m2x form */
						/* NOTE(review): the non-spilled
						   store uses stvx_r2x but this
						   one emits lvx_m2x -- confirm
						   the macro name. */
						fprintf(Cout,
						    "lvx_m2x(%s, %d);\n",
						    spoolname(
							tup[tup[s].arg[1]].reg),
						    tup[ tup[s].arg[0] ].reg);
					} else if
					  ((tup[tup[s].arg[0]].reg>=regsavail())
					&&(tup[tup[s].arg[1]].reg>=regsavail()))
					{
						/* Both spilled */
						/* Spill EA then use m2x form */

						int savereg =
						  (tup[tup[s].arg[1]].reg==0)?
						  1:0;

						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						/* Save savereg */
						fprintf(Cout,
						    "stvx_r2m(%d, %s);\n",
						    savereg,
						    spoolname(regsavail()-1));

						/* Move EA into savereg */
						Ctab();
						fprintf(Cout,
						    "lvx_m2r(%s, %d);\n",
						    spoolname(
						       tup[tup[s].arg[0]].reg),
						    savereg);

						/* Move data (arg1) to
						   (savereg) */
						Ctab();
						fprintf(Cout,
						    "lvx_m2x(%s, %d);\n",
						    spoolname(
							tup[tup[s].arg[1]].reg),
						    savereg);

						/* Restore savereg */
						Ctab();
						fprintf(Cout,
						    "lvx_m2r(%s, %d);\n",
						    spoolname(regsavail()-1),
						    savereg);
					}
				} else {
					bug("Need to support target in STORER"
					    " case of output_schedule()");
				}
				break;
			case STORERR:
				/* Only AltiVec without spills is handled */
				Ctab();
				if (optcpu & CPU_AltiVec) {
					if ((tup[tup[s].arg[0]].reg<regsavail())
					&& (tup[tup[s].arg[1]].reg<regsavail()))
					{
						/* Didn't spill */
						/* arg0 = fragment */
						/* arg1 = data */

						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif

						fprintf(Cout,
						  "storerr_r2m(%d, "
						  "*((p128_t *)%s), %d, %d);\n",
						  tup[ tup[s].arg[1] ].reg,
						  tup[s].symbol->text,
						  tup[ tup[s].arg[0] ].reg,
						  tup[s].offset);
					} else {
						bug ("STORERR Doesn't handle "
						     "spills yet" );
					}
				} else {
					bug("Need to support target in STORERR"
					    " case of output_schedule()");
				}
				break;
			case NUM:
				/* Enter the constant in the constant pool,
				   then load it from there by name */
				cpoolenter(tup[s].immed);
				Ctab();
				if (optcpu & CPU_MMX) {
					if (tup[s].oreg < regsavail()) {
						/* Didn't spill */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						fprintf(Cout,
						"movq_m2r(%s, mm%d);\n",
						cpoolname(tup[s].immed),
						(tup[s].reg = tup[s].oreg));
					} else {
						/* Destination spilled */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						/* Save mm0 */
						fprintf(Cout,
						    "movq_r2m(mm0, %s);\n",
						    spoolname(regsavail()-1));
						/* Move source into mm0 */
						Ctab();
						fprintf(Cout,
						"movq_m2r(%s, mm0);\n",
						cpoolname(tup[s].immed));

						/* Move mm0 into destination */
						Ctab();
						fprintf(Cout,
							"movq_r2m(mm0, %s);\n",
							spoolname (tup[s].reg =
							   tup[s].oreg));
						/* Restore mm0 */
						Ctab();
						fprintf(Cout,
						    "movq_m2r(%s, mm0);\n",
						    spoolname(regsavail()-1));
					}
				} else if (optcpu & CPU_MAX) {
					if (tup[s].oreg < regsavail()) {
						/* Didn't spill */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						fprintf(Cout,
						    "movq_m2r(%s, r%d);\n",
						    cpoolname(tup[s].immed),
						    (tup[s].reg = tup[s].oreg));
					} else {
						/* Destination spilled */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						/* Save r0 */
						fprintf(Cout,
						    "movq_r2m(r0, %s);\n",
						    spoolname(regsavail()-1));
						/* Move source into r0 */
						Ctab();
						fprintf(Cout,
						"movq_m2r(%s, r0);\n",
						cpoolname(tup[s].immed));

						/* Move r0 into destination */
						Ctab();
						fprintf(Cout,
							"movq_r2m(r0, %s);\n",
							spoolname (tup[s].reg =
							   tup[s].oreg));
						/* Restore r0 */
						Ctab();
						fprintf(Cout,
						    "movq_m2r(%s, r0);\n",
						    spoolname(regsavail()-1));
					}
				} else if (optcpu == GenericIA32) {
					/* No spill handling in this branch */
					#ifdef DEBUG_CODEGEN
						fprintf(Cout,
							"/* %d */ ",
							__LINE__);
					#endif
					fprintf(Cout,
					"movl_m2r(%s, %s);\n",
					cpoolname(tup[s].immed),
					ia32regname(tup[s].reg = tup[s].oreg));
				} else if (optcpu & CPU_AltiVec) {
					if (tup[s].oreg < regsavail()) {
						/* Didn't spill */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */ ",
								__LINE__);
						#endif
						fprintf(Cout,
						"lvx_m2r(%s, %d);\n",
						cpoolname(tup[s].immed),
						(tup[s].reg = tup[s].oreg));
					} else {
						/* Destination spilled */
						#ifdef DEBUG_CODEGEN
							fprintf(Cout,
								"/* %d */\n",
								__LINE__);
							Ctab();
						#endif
						/* Save v0 */
						fprintf(Cout,
						    "stvx_r2m(0, %s);\n",
						    spoolname(regsavail()-1));
						/* Move source into v0 */
						Ctab();
						fprintf(Cout,
						"lvx_m2r(%s, 0);\n",
						cpoolname(tup[s].immed));

						/* Move v0 into destination */
						Ctab();
						fprintf(Cout,
							"stvx_r2m(0, %s);\n",
							spoolname (tup[s].reg =
							   tup[s].oreg));
						/* Restore v0 */
						Ctab();
						fprintf(Cout,
						    "lvx_m2r(%s, 0);\n",
						    spoolname(regsavail()-1));
					}
				} else {
					bug("Need to support target in NUM"
					    " case of output_schedule()");
				}
				break;
			default:
				/* Every other opcode is emitted by the
				   per-target routine */
				/* restore original register */
				tup[s].reg = tup[s].oreg;

				if (optcpu & CPU_MMX) {
					mmxop(s);
				} else if (optcpu & CPU_MAX) {
					maxop(s);
				} else if (optcpu & CPU_AltiVec) {
					altivecop(s);
				} else if (optcpu == GenericIA32) {
					ia32op(s);
				} else {
					/* Unknown target: report it, then
					   fall back to the MMX emitter */
					bug ("Unknown target in "
					     "output_schedule()");
					mmxop(s);
				}
			}
		}
}

/*	Run the scheduler/register allocator over the current basic block,
	copy the best schedule found from sbest[] into sched[], optionally
	report a cost estimate, and emit the code via output_schedule().

	If no schedule was found (sbestval is still NOTFOUND) this reports
	a bug and emits nothing.  A schedule of length zero is silently
	skipped.
*/
static void
sched_bb()
{
	register int slot;

#ifdef NEWSTUFF
	if (sched_simple() == NOTFOUND) {
		info(0, "basic block register allocator/scheduler failed");
		fflush(stderr);
	} else if (opttime > 0) {
#endif
		/* Try for a schedule that needs no spills first */
		if (sched_search(0) == NOTFOUND) {
			if (maxspills()) {
				/* None found: the reference counts must be
				   reset before searching again, this time
				   with spills allowed */
				refcounts();
				sched_search(maxspills());
			}
		}
#ifdef NEWSTUFF
	}
#endif

	/* Nothing usable came out of the search */
	if (sbestval == NOTFOUND) {
		bug("basic block register allocator/scheduler failed");
		#ifdef DEBUG_FSCHED
			dump_tuples(stderr, "for failed schedule");
		#endif
		return;
	}

	/* Empty schedule: nothing to copy, report, or emit */
	if (schedsp <= 0)
		return;

	/* Make the best schedule the current one */
	for (slot = 0; slot < schedsp; ++slot)
		sched[slot] = sbest[slot];

	/* Performance stats (schedsp is known to be positive here) */
	if (optperf) {
		/* The printed estimate is half of this sum; the odd bit
		   selects the ".5" suffix */
		register int halves = schedsp + sbestval;
		register double speedup = (fieldcount / (0.5 * halves));

		Ctab();
		fprintf(Cout,
			"/* block cost estimate is %d%s clocks, "
			"%1.1fx speedup */\n",
			(halves / 2),
			((halves & 1) ? ".5" : ""),
			speedup);

		/* Running totals are kept in the un-halved units */
		clocktotal += halves;
		fieldtotal += fieldcount;
	}

	/* Optionally dump list of scheduled tuples */
	#ifdef DEBUG_FSCHED
		dump_sched(stderr, "- final version");
	#endif

	output_schedule();
}

/*	Finish the current basic block: recompute tuple reference counts,
	clear each tuple's m2rmode, fill in the before/after counts of every
	referenced tuple, and hand the block to sched_bb().
*/
void
end_bb(void)
{
	register int cur, other;

	#ifdef DEBUG_ENDBB
		info(0, "Start end_bb()");
	#endif

	#ifdef DEBUG_TUPLIST
		dump_tuples(stderr, "at start of end_bb()");
	#endif

	/* Don't schedule empty blocks (this should be checked elsewhere).
	   NOTE(review): begin_bb() resets tupsp to 0, not -1 -- confirm
	   that -1 is really the value that marks an empty block here.
	*/
	if (tupsp == -1) {
		return;
	}

	/* First set reference counts */
	refcounts();

	/* No tuple has a memory operand folded into it yet */
	cur = 0;
	while (cur < tupsp) {
		tup[cur].m2rmode = -1;
		++cur;
	}

#ifdef	NEVER
	/* Pull immediate values into the instructions...
	   Intel recommends that single-use constants be
	   pulled into the instructions that use them;
	   this loop effectively does that.
	*/
	for (cur = 0; cur < tupsp; ++cur) {
		register int src;

		/* Only live, non-store tuples are candidates */
		if (!(tup[cur].refs > 0) ||
		    (tup[cur].op == STORE) ||
		    (tup[cur].op == STORER) ||
		    (tup[cur].op == STORERR)) {
			continue;
		}

		/* Nothing is folded unless there is a second argument */
		src = tup[cur].arg[1];
		if (src == -1) {
			continue;
		}

		/* Absorb a single-use NUM/LOAD second argument */
		if (((tup[src].op == NUM) ||
		     (tup[src].op == LOAD)) &&
		    (tup[src].refs == 1)) {
			tup[cur].m2rmode = src;
			tup[src].refs = 0;
			tup[cur].arg[1] = -1;
		}

		/* Otherwise try the first argument, which is only
		   allowed when the operation is not ordered */
		src = tup[cur].arg[0];
		if ((src != -1) &&
		    (tup[cur].m2rmode == -1) &&
		    (!ordered(tup[cur].op)) &&
		    ((tup[src].op == NUM) ||
		     (tup[src].op == LOAD)) &&
		    (tup[src].refs == 1)) {
			/* swap args and absorb constant */
			tup[cur].m2rmode = src;
			tup[src].refs = 0;
			tup[cur].arg[0] = tup[cur].arg[1];
			tup[cur].arg[1] = -1;
		}
	}

	/* Set reference counts */
	refcounts();
#endif

	#ifdef DEBUG_PERF
		fprintf(stderr, "%s: tupsp=%d\n", funcname, tupsp);
		fflush(stderr);
	#endif

	/* Now do before and after fields */
	before_bb(0,0,1);	/* This just inits the static cache */
	for (cur = 0; cur < tupsp; ++cur) {
		#ifdef DEBUG_ENDBB
			fprintf(stderr, "\ni=%d\n  j=", cur);
			fflush(stderr);
		#endif

		/* Unreferenced tuples keep whatever counts they had */
		if (!(tup[cur].refs > 0)) {
			continue;
		}

		tup[cur].before = 0;
		tup[cur].after = 0;
		for (other = 0; other < tupsp; ++other) {
			if (other == cur) {
				continue;
			}

			#ifdef DEBUG_ENDBB
				fprintf(stderr, "%d ", other);
				fflush(stderr);
			#endif

			/* A tuple is counted at most once: as a
			   prerequisite of this one, or else as something
			   this one is a prerequisite for */
			if (before_bb(other, cur, 0)) {
				++(tup[cur].before);
			} else if (before_bb(cur, other, 0)) {
				++(tup[cur].after);
			}
		}
	}

	#ifdef DEBUG_ENDBB
		info(0, "end_bb(): calling sched_bb()");
	#endif

	/* Schedule the basic block */
	sched_bb();

	#ifdef DEBUG_ENDBB
		info(0, "End of end_bb()");
	#endif
}

