/*	tuplegen.c

	binary and unary operations on fragment tuples
*/

/*
	May want to use the SUB*us method for ADD*us rather than doing gen/prop
	Note: (SHL*, n, 1) may be better as (ADD32u n, n)
*/

#undef DEBUG
#undef DEBUG_MMXPACK
#undef DEBUG_IA32PACK
#undef DEBUG_COFOLD
#undef DEBUG_BPEEP
#undef DEBUG_UPEEP
#undef NOTDEFD

#include "swartypes.h"
#include "tuple.h"
#include "tokens.h"
#include "scheduler.h"
#include "tuplegen.h"
#include "tuple_immed.h"
#include "tuple_binop.h"
#include "tuple_trinop.h"
#include "oputils.h"
#include "be_optimizer.h"
#include "messages.h"
#include "Libstdswar/stdswar.h"
#include "showir.h"


/*	samefragtyp()

	Return 1 when the two types agree on both their bitsperfield()
	value and their TYP_UNSIGN attribute bit, 0 otherwise.  The tuple
	generators in this file use it to decide whether the result of an
	existing tuple may be reused for a new request.
*/
static int
samefragtyp(typ t0,
typ t1)
{
	if (bitsperfield(t0.bits) != bitsperfield(t1.bits)) {
		return(0);
	}

	/* Compare t0 against t1.  The earlier code compared t0.attr with
	   itself, which made the signedness test succeed unconditionally. */
	return((t0.attr & TYP_UNSIGN) == (t1.attr & TYP_UNSIGN));
}

/*	trinop()

	Generate (or reuse) the tuple for a three-operand operation.

	op		operation code
	arg0..arg2	operands (indices into tup[])
	t		type of the operation

	The request is first handed to the width/signedness/saturation
	specific trinop<N>[u|us|ss|f]() helper selected from t; a helper
	result other than -1 is returned directly.  Otherwise the type is
	normalized, an existing identical tuple is reused if one is found,
	and failing that a new tuple is appended to tup[].

	Returns the index of the resulting tuple.
*/
int
trinop(int op,
int arg0,
int arg1,
int arg2,
typ t)
{
	register int i;

	#ifdef DEBUG
		char buf[256];
	#endif

	#ifdef DEBUG
		snprintf ( buf,
			   256,
			   "Start trinop(%s) with %d bits/field",
			   opname(op),
			   bitsperfield(t.bits) );
		info(0, buf );
	#endif

	/* Break constructed ops down to the point where they are now trees of
	   typeless ops */
	switch (bitsperfield(t.bits)) {
	case 8:
		if (t.attr & TYP_UNSIGN) {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "trinop() calling trinop8us(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = trinop8us(op, arg0, arg1, arg2, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"trinop() calling trinop8u(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = trinop8u(op, arg0, arg1, arg2);
			}
		} else {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "trinop() calling trinop8ss(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = trinop8ss(op, arg0, arg1, arg2, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"trinop() calling trinop8(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = trinop8(op, arg0, arg1, arg2);
			}
		}
		if (i != -1) return(i);
		break;
	case 16:
		if (t.attr & TYP_UNSIGN) {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "trinop() calling trinop16us(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = trinop16us(op, arg0, arg1, arg2, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"trinop() calling trinop16u(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = trinop16u(op, arg0, arg1, arg2);
			}
		} else {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "trinop() calling trinop16ss(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = trinop16ss(op, arg0, arg1, arg2, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"trinop() calling trinop16(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = trinop16(op, arg0, arg1, arg2);
			}
		}
		if (i != -1) return(i);
		break;
	case 32:
		if (t.attr & TYP_FLOAT) {
			#ifdef DEBUG
				snprintf(buf,
					256,
					"trinop() calling trinop32f(%s)",
					opname(op));
				info(0, buf );
			#endif
			i = trinop32f(op, arg0, arg1, arg2);
		} else if (t.attr & TYP_UNSIGN) {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "trinop() calling trinop32us(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = trinop32us(op, arg0, arg1, arg2, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"trinop() calling trinop32u(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = trinop32u(op, arg0, arg1, arg2);
			}
		} else {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "trinop() calling trinop32ss(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = trinop32ss(op, arg0, arg1, arg2, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"trinop() calling trinop32(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = trinop32(op, arg0, arg1, arg2);
			}
		}
		if (i != -1) return(i);
		break;
	}

	/* If we are here, the operation is not implemented as a tree of
	   other operations, or has been stripped of type */

	#ifdef DEBUG
		snprintf(buf,
			256,
			"trinop() before normalizing types: case %s",
			opname(op) );
		info(0, buf );
	#endif

	/* Normalize types */
	switch (op) {
	case TPERM:
		/* typeless = 0u */
		t.attr &= ~TYP_UNSIGN;
		t.bits = 0;
		break;
	}

	/* The peephole pass and operand-order normalization below are
	   compiled out: NOTDEFD is #undef'd at the top of this file.  Note
	   that the disabled code uses rval, which is not declared here. */
#ifdef NOTDEFD
	if ( !optnobepeep ) {
		#ifdef DEBUG_BPEEP
			info(0, "trinop() before 2nd peephole optimization" );
		#endif
		/* Perform peephole optimizations */
		if ( (rval=peephole(op, arg0, arg1, arg2, t)) != -1 )
			return rval;
		#ifdef DEBUG_BPEEP
			info(0, "trinop() after 2nd peephole optimization" );
		#endif
	}

	/* Normalize operand order */
	if ((arg0 > arg1) && !ordered(op)) {
		i = arg0;
		arg0 = arg1;
		arg1 = i;
	}
#endif

	/* Reuse an available computation's result if possible.  All three
	   operands must match; the third operand lives in arg[2] (the
	   earlier code tested arg[1] against arg2, so arg[2] was never
	   compared and unrelated tuples could be reused). */
	for (i=tupsp-1; i>=0; --i) {
		if ((tup[i].op == op) &&
		    (tup[i].arg[0] == arg0) &&
		    (tup[i].arg[1] == arg1) &&
		    (tup[i].arg[2] == arg2) &&
		    samefragtyp(tup[i].type, t)) {
			return(i);
		}
	}

	/* Ops which could not be optimized, are not constructed of other ops,
	   and could not reuse an available computation's result, need to be
	   handled here.
	*/
	tup[tupsp].op = op;
	tup[tupsp].arg[0] = arg0;
	tup[tupsp].arg[1] = arg1;
	tup[tupsp].arg[2] = arg2;
	tup[tupsp].type = t;
	tup[tupsp].refs = 0;
	return(tupsp++);
} /* trinop() */

/*	binop()

	Generate (or reuse) the tuple for a two-operand operation.

	op		operation code
	arg0, arg1	operands (indices into tup[])
	t		type of the operation

	Processing order:
	  1. Dispatch on bitsperfield(t.bits) and the TYP_UNSIGN / TYP_SAT /
	     TYP_FLOAT attribute bits to a binop<N>[u|us|ss|f]() helper; a
	     helper result other than -1 is returned directly.
	  2. Normalize the type for ops listed in the second switch.
	  3. Try constant folding (unless optnobecf) and the peephole
	     optimizer (unless optnobepeep).
	  4. Put the operands of unordered ops into ascending index order.
	  5. Reuse an identical existing tuple, or append a new one.

	Returns the index of the resulting tuple (or whatever a helper,
	be_cofold() or peephole() returned in place of one).
*/
int
binop(int op,
int arg0,
int arg1,
typ t)
{
	register int i, rval;

	#ifdef DEBUG
		char buf[256];
	#endif

	#ifdef DEBUG
		snprintf ( buf,
			   256,
			   "Start binop(%s) with %d bits/field",
			   opname(op),
			   bitsperfield(t.bits) );
		info(0, buf );
	#endif
/* NEWOPTS is forced off here, so the early fold/peephole pass below is
   never compiled; the same two passes run after type normalization. */
#undef NEWOPTS
#ifdef NEWOPTS
	if ( !optnobecf ) {
		/* Fold this if possible */
		#ifdef DEBUG_COFOLD
			info(0, "binop() before 1st constant folding" );
		#endif
		if ( (rval=be_cofold(op, arg0, arg1, -1, t)) != -1 )
			return rval;
	}

	if ( !optnobepeep ) {
		#ifdef DEBUG_BPEEP
			info(0, "binop() before 1st peephole optimization" );
		#endif
		/* Perform peephole optimizations */
		if ( (rval=peephole(op, arg0, arg1, -1, t)) != -1 )
			return rval;
		#ifdef DEBUG_BPEEP
			info(0, "binop() after 1st peephole optimization" );
		#endif
	}
#endif

	/* Break constructed ops down to the point where they are now trees of
	   typeless ops */
	/* Each case picks the helper for that field width; a helper return
	   of -1 means "not handled", and control falls out of the switch. */
	switch (bitsperfield(t.bits)) {
	case 1:
		if (t.attr & TYP_UNSIGN) {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop1us(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop1us(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop1u(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop1u(op, arg0, arg1);
			}
		} else {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop1ss(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop1ss(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop1(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop1(op, arg0, arg1);
			}
		}
		if (i != -1) return(i);
		break;
	case 2:
		if (t.attr & TYP_UNSIGN) {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop2us(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop2us(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop2u(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop2u(op, arg0, arg1);
			}
		} else {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop2ss(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop2ss(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop2(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop2(op, arg0, arg1);
			}
		}
		if (i != -1) return(i);
		break;
	case 4:
		if (t.attr & TYP_UNSIGN) {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop4us(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop4us(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop4u(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop4u(op, arg0, arg1);
			}
		} else {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop4ss(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop4ss(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop4(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop4(op, arg0, arg1);
			}
		}
		if (i != -1) return(i);
		break;
	case 8:
		if (t.attr & TYP_UNSIGN) {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop8us(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop8us(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop8u(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop8u(op, arg0, arg1);
			}
		} else {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop8ss(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop8ss(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop8(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop8(op, arg0, arg1);
			}
		}
		if (i != -1) return(i);
		break;
	case 16:
		if (t.attr & TYP_UNSIGN) {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop16us(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop16us(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop16u(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop16u(op, arg0, arg1);
			}
		} else {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop16ss(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop16ss(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop16(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop16(op, arg0, arg1);
			}
		}
		if (i != -1) return(i);
		break;
	case 32:
		/* 32-bit fields are the only width with a float variant;
		   TYP_FLOAT is tested before signedness. */
		if (t.attr & TYP_FLOAT) {
			#ifdef DEBUG
				snprintf(buf,
					256,
					"binop() calling binop32f(%s)",
					opname(op));
				info(0, buf );
			#endif
			i = binop32f(op, arg0, arg1);
		} else if (t.attr & TYP_UNSIGN) {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop32us(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop32us(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop32u(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop32u(op, arg0, arg1);
			}
		} else {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop32ss(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop32ss(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop32(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop32(op, arg0, arg1);
			}
		}
		if (i != -1) return(i);
		break;
	case 64:
		if (t.attr & TYP_UNSIGN) {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop64us(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop64us(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop64u(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop64u(op, arg0, arg1);
			}
		} else if (t.attr == TYP_NULL) {
			/* Treat non-typed as unsigned */
			#ifdef DEBUG
				snprintf(buf,
					256,
					"binop() calling binop64u(%s)",
					opname(op));
				info(0, buf );
			#endif
			i = binop64u(op, arg0, arg1);
		} else {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop64ss(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop64ss(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop64(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop64(op, arg0, arg1);
			}
		}
		if (i != -1) return(i);
		break;
	case 128:
		if (t.attr & TYP_UNSIGN) {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop128us(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop128us(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop128u(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop128u(op, arg0, arg1);
			}
		} else if (t.attr == TYP_NULL) {
			/* Treat non-typed as unsigned */
			#ifdef DEBUG
				snprintf(buf,
					256,
					"binop() calling binop128u(%s)",
					opname(op));
				info(0, buf );
			#endif
			i = binop128u(op, arg0, arg1);
		} else {
			if (t.attr & TYP_SAT) {
				#ifdef DEBUG
					snprintf(buf,
					    256,
					    "binop() calling binop128ss(%s,%d)",
					    opname(op),
					    t.bits);
					info(0, buf );
				#endif
				i = binop128ss(op, arg0, arg1, t.bits);
			} else {
				#ifdef DEBUG
					snprintf(buf,
						256,
						"binop() calling binop128(%s)",
						opname(op));
					info(0, buf );
				#endif
				i = binop128(op, arg0, arg1);
			}
		}
		if (i != -1) return(i);
		break;
	}

	/* If we are here, the operation is not implemented as a tree of
	   other operations, or has been stripped of type */

	#ifdef DEBUG
		snprintf(buf,
			256,
			"binop() before normalizing types: case %s",
			opname(op) );
		info(0, buf );
	#endif

	/* Normalize types */
	/* Ops not listed in this switch keep their type unchanged. */
	switch (op) {
	case XOR:
	case OR:
	case ANDN:
	case AND:
		/* typeless = 0u */
		t.attr &= ~TYP_UNSIGN;
		t.bits = 0;
		break;

	case EQ:
	case MUL:
	case NE:
	case SHL:
	case SHLBIT:
	case SHLBYTE:
	case LAND:
	case LOR:
		/* the same thing signed or unsigned, so make it unsigned */
		/* NOTE(review): the statement below CLEARS the TYP_UNSIGN
		   bit, which does not obviously match "make it unsigned";
		   confirm which canonical form is intended. */
		t.attr &= ~TYP_UNSIGN;
		break;

	case ADD:
	case SUB:
	case DIV:
	case MOD:
	case MIN:
	case GT:
	case LT:
	case LE:
	case MAX:
	case GE:
	case SHR:
	case SHRBIT:
	case SHRBYTE:
	case PACK:
	case INTRLVLOW:
	case INTRLVHIGH:
		/* Different thing signed vs. unsigned, so no changes */
		break;
	}

	if ( !optnobecf ) {
		/* Fold this if possible */
		#ifdef DEBUG_COFOLD
			info(0, "binop() before 2nd constant folding" );
		#endif
		/* -1 fills the unused third-operand slot; a -1 result means
		   nothing was folded. */
		if ( (rval=be_cofold(op, arg0, arg1, -1, t)) != -1 )
			return rval;
	}

	if ( !optnobepeep ) {
		#ifdef DEBUG_BPEEP
			info(0, "binop() before 2nd peephole optimization" );
		#endif
		/* Perform peephole optimizations */
		if ( (rval=peephole(op, arg0, arg1, -1, t)) != -1 )
			return rval;
		#ifdef DEBUG_BPEEP
			info(0, "binop() after 2nd peephole optimization" );
		#endif
	}

	/* Normalize operand order */
	/* For ops where ordered(op) is false the smaller tuple index goes
	   first, so that (a,b) and (b,a) match in the reuse scan below. */
	if ((arg0 > arg1) && !ordered(op)) {
		i = arg0;
		arg0 = arg1;
		arg1 = i;
	}

	/* Reuse an available computation's result if possible */
	/* Scans from the most recently created tuple backwards. */
	for (i=tupsp-1; i>=0; --i) {
		if ((tup[i].op == op) &&
		    (tup[i].arg[0] == arg0) &&
		    (tup[i].arg[1] == arg1) &&
		    samefragtyp(tup[i].type, t)) {
			return(i);
		}
	}

	/* Ops which could not be optimized, are not constructed of other ops,
	   and could not reuse an available computation's result, need to be
	   handled here.
	*/
	tup[tupsp].op = op;
	tup[tupsp].arg[0] = arg0;
	tup[tupsp].arg[1] = arg1;
	tup[tupsp].arg[2] = -1;	/* no third operand */
	tup[tupsp].type = t;
	tup[tupsp].refs = 0;
	return(tupsp++);
} /* binop() */


/*	unop()

	Generate (or reuse) the tuple for a one-operand operation and
	return its index.

	op	operation code
	arg	operand (index into tup[])
	t	type of the operation

	An identical existing tuple is reused when found.  Otherwise
	constant folding and the peephole optimizer get a chance (unless
	disabled by optnobecf / optnobepeep), then ANY, ALL, LNOT, NEG and
	NOT are rewritten in terms of binop(); anything left becomes a new
	tuple.
*/
int
unop(int op,
int arg,
typ t)
{
	int scan;
	int replaced;
	int slot;

	/* Newest-first search for an identical computation */
	scan = tupsp;
	while (--scan >= 0) {
		if (tup[scan].op != op) {
			continue;
		}
		if (tup[scan].arg[0] != arg) {
			continue;
		}
		if (samefragtyp(tup[scan].type, t)) {
			return(scan);
		}
	}

	/* Constant folding */
	if ( !optnobecf ) {
		#ifdef DEBUG_COFOLD
			info(0, "unop() before constant folding" );
		#endif
		replaced = be_cofold(op, arg, -1, -1, t);
		if ( replaced != -1 ) {
			return replaced;
		}
	}

	/* Peephole optimization */
	if ( !optnobepeep ) {
		#ifdef DEBUG_UPEEP
			info(0, "unop() before peephole optimization" );
		#endif
		replaced = peephole(op, arg, -1, -1, t);
		if ( replaced != -1 ) {
			return replaced;
		}
		#ifdef DEBUG_UPEEP
			info(0, "unop() after peephole optimization" );
		#endif
	}

	/* Basic ops that are built out of binary ops */
	if (op == ANY) {
		/* arg OR (arg SHR half-a-fragment), both typeless.
		   HEREHERE - the original author doubted this is right. */
		p128_t half;
		int shifted;

		half.q[1] = 0ULL;
		half.q[0] = (unsigned long long) bitsperfrag() / 2ULL;

		shifted = binop(SHR, arg, immed128(half), typnull);
		return(binop(OR, arg, shifted, typnull));
	} else if (op == ALL) {
		/* Built as ANY applied to (arg EQ 0).
		   HEREHERE - the original author doubted this is right. */
		int zero;
		int iszero;

		zero = immed64u((p64_t)0ULL);
		iszero = binop(EQ, arg, zero, t);
		return(unop(ANY, iszero, t));
	} else if (op == LNOT) {
		/* Logical not: arg EQ 0 */
		int zero;

		zero = immed64u((p64_t) 0ULL);
		return(binop(EQ, arg, zero, t));
	} else if (op == NEG) {
		/* Negation: 0 SUB arg */
		int zero;

		zero = immed64u((p64_t) 0ULL);
		return(binop(SUB, zero, arg, t));
	} else if (op == NOT) {
		/* One's complement: XOR with an all-ones immediate whose
		   form depends on the fragment size.  Any other fragment
		   size drops through to the generic tuple below. */
		int ones;

		switch(bitsperfrag()) {
		case 32:
		    ones = immed32u((p32_t) -1U);
		    return(binop(XOR, ones, arg, typnull));
		case 64:
		    ones = immed64u((p64_t) -1ULL);
		    return(binop(XOR, ones, arg, typnull));
		case 128:
		    ones = immed64s((p64_t) -1ULL);
		    return(binop(XOR, ones, arg, typnull));
		}
	}

	/* Nothing above applied: record the op as a fresh tuple with the
	   two unused operand slots set to -1. */
	slot = tupsp++;
	tup[slot].op = op;
	tup[slot].type = t;
	tup[slot].refs = 0;
	tup[slot].arg[0] = arg;
	tup[slot].arg[1] = -1;
	tup[slot].arg[2] = -1;
	return(slot);
} /* unop() */



/*	leaop()

	Append a LEA tuple for symbol s whose single operand is the tuple
	index frag, and return the new tuple's index.  No reuse search is
	done; the tuple's fragment field is -1 and its offset is 0.
*/
int
leaop(sym *s,
int frag)
{
	const int slot = tupsp++;

	tup[slot].op = LEA;
	tup[slot].refs = 0;

	/* Memory reference part */
	tup[slot].symbol = s;
	tup[slot].fragment = -1;
	tup[slot].offset = 0;

	/* Operands: only the first slot is used */
	tup[slot].arg[0] = frag;
	tup[slot].arg[1] = -1;
	tup[slot].arg[2] = -1;

	return(slot);
}

/*	loadrop()

	Append a LOADR tuple (a load through the address held in tuple
	"address") and return its index.  No reuse search is done.
	Should gen: movq_x2r(address, address);
*/
int
loadrop(int address)
{
	const int slot = tupsp++;

	tup[slot].op = LOADR;
	tup[slot].refs = 0;

	/* Only the first operand slot is used */
	tup[slot].arg[0] = address;
	tup[slot].arg[1] = -1;
	tup[slot].arg[2] = -1;

	return (slot);
}

/*	loadrrop()

	Append a LOADRR tuple for symbol s with operand tuple frag and
	offset off, and return its index.  No reuse search is done; the
	tuple's fragment field is -1.
*/
int
loadrrop(sym *s,
int frag,
int off)
{
	const int slot = tupsp++;

	tup[slot].op = LOADRR;
	tup[slot].refs = 0;

	/* Memory reference part */
	tup[slot].symbol = s;
	tup[slot].fragment = -1;
	tup[slot].offset = off;

	/* Operands: only the first slot is used */
	tup[slot].arg[0] = frag;
	tup[slot].arg[1] = -1;
	tup[slot].arg[2] = -1;

	return(slot);
}

/*	lvslop()

	Return the index of a load-vector-for-shift-left (LVSL) tuple for
	symbol s.  On AltiVec this produces the index (alignment) vector
	that the permute instruction uses to combine two aligned loads
	into one unaligned value.

	An existing LVSL tuple for the same symbol with offset 0 is reused
	whatever its fragment, because accesses that differ by a whole
	number of fragments share the same index vector.
*/
int
lvslop(sym *s,
int frag)
{
	int scan;
	int slot;

	#ifdef DEBUG
		info(0, "Start lvslop()" );
	#endif

	/* Newest-first search for a reusable LVSL */
	scan = tupsp;
	while (--scan >= 0) {
		if (tup[scan].op != LVSL) {
			continue;
		}
		if (tup[scan].symbol != s) {
			continue;
		}
		if (tup[scan].offset == 0) {
			return(scan);
		}
	}

	/* None available: create one with no operands */
	slot = tupsp++;
	tup[slot].op = LVSL;
	tup[slot].refs = 0;
	tup[slot].symbol = s;
	tup[slot].fragment = frag;
	tup[slot].offset = 0;
	tup[slot].arg[0] = -1;
	tup[slot].arg[1] = -1;
	tup[slot].arg[2] = -1;
	return(slot);
}

/*	loadxop()

	Create the tuple tree for an indexed load of one field of s.

	s	symbol being indexed; s->type.bits is used as the number of
		bits per field
	index	tuple holding the field index

	The index is split into a fragment number (frag) and the bit
	position of the field inside that fragment (position).  The
	fragment is then loaded, shifted right by position and masked down
	to one field.

	Returns the index of the tuple holding the extracted field, or of
	a reused tuple when the TESTING scan below finds a matching STORER.
*/
int
loadxop(sym *s,
int index)
{
	/* Creates a tuple tree for an indexed load operation */

	register int i, j, k;
	register int position, frag;
	register unsigned long long bpf, fpf, mask;

	#ifdef DEBUG
		info(0, "Start loadxop()" );
	#endif

#ifdef NOTYET

	/* Do we already know what's being loaded? */
	for (i=tupsp-1; i>=0; --i) {
		if ((tup[i].op == STORE) &&
		    (tup[i].symbol == s) &&
		    (tup[i].fragment == frag) &&
		    (tup[i].offset == off) &&
		    (tup[i].refs == 1)) {
			/* Store to same place, reuse value */
			i = tup[i].arg[0];
			++(tup[i].refs);
			return(i);
		}

		if ((tup[i].op == LOAD) &&
		    (tup[i].symbol == s) &&
		    (tup[i].fragment == frag)) {
			/* Identical load, reuse it */
			return(i);
		}
	}
#endif

	bpf = (unsigned long long)s->type.bits;
	fpf = fieldsperfrag (s->type.bits);

	if (fpf%2 == 0) {
		/* Even field count: use mask/shift instead of MOD/DIV.
		   off is the shift that turns a field index into a fragment
		   number, pos the mask that keeps the field-in-fragment
		   part, scount the shift that turns that into a bit
		   position. */
		register unsigned long long pos, off, scount;

		switch(fpf) {
		/* NOTE(review): case 1 cannot be reached from here because
		   1%2 != 0 sends fpf==1 to the MOD/DIV path below. */
		case 1: off=(unsigned long long)index; pos=0ULL; break;
		case 2: off=1ULL; pos=1ULL; break;
		case 4: off=2ULL; pos=3ULL; break;
		case 8: off=3ULL; pos=7ULL; break;
		case 16: off=4ULL; pos=15ULL; break;
		case 32: off=5ULL; pos=31ULL; break;
		case 64: off=6ULL; pos=63ULL; break;
		default:
			{
				char buf[256];
				snprintf(buf,
					 256,
					 "loadxop(s=\"%s\",i=%d): Unsupported "
					 "fpf=%llu, bits=%d",
					 s->text,
					 index,
					 fpf,
					 s->type.bits);
				bug(buf);
				off=0ULL; pos=0ULL;
			}
		}

		switch(bpf) {
		case 1: scount=0ULL; break;
		case 2: scount=1ULL; break;
		case 4: scount=2ULL; break;
		case 8: scount=3ULL; break;
		case 16: scount=4ULL; break;
		case 32: scount=5ULL; break;
		case 64: scount=6ULL; break;
		default:
			{
				char buf[256];
				snprintf(buf,
					 256,
					 "loadxop():Unsupported bpf=%llu",
					 bpf);
				bug(buf);
				scount=0ULL;
			}
		}

		if (!pos) {
			position = immed64u((p64_t)0ULL);
			frag = immed64u((p64_t)off);
		} else {
			position = binop(SHL,
					 binop(AND,
					       index,
					       immed64u((p64_t)pos),
					       typnull),
					 immed64u((p64_t)scount),
					 typnull);
			frag = binop(SHR,
				       index,
				       immed64u((p64_t)off),
				       typnull);
		}
	} else {
		position = binop(MUL,
			 	binop(MOD, index, immed64u((p64_t)fpf),typnull),
			 	immed64u((p64_t)bpf),
			 	typnull);
		frag = binop(DIV, index, immed64u((p64_t)fpf), typnull);
	}

	/* At this point, frag is the tuple containing the word
	   number, and position is the lsb of the field in the word. */



#define TESTING
#ifdef TESTING
	/* Do we already know what's being loaded? */
	for (i=tupsp-1; i>=0; --i) {
		if ((tup[i].op == STORER) &&
		    (tup[ tup[i].arg[0] ].symbol == s) &&
		    (tup[ tup[i].arg[0] ].arg[0] == frag) &&
		    (tup[i].refs == 1)) {
			/* Store to same place, reuse value */
			i = tup[i].arg[0];
			++(tup[i].refs);
			return(i);
		}
		else if (tup[i].op == STORER) {
				fprintf(Cout,
					"frag=(%d,%d) refs=%d\n",
					tup[ tup[i].arg[0] ].arg[0],
					frag,
					tup[i].refs);
		}
#ifdef NOTYET
		if ((tup[i].op == LOADR) &&
		    (tup[i].symbol == s) &&
		    (tup[i].fragment == frag)) {
			/* Identical load, reuse it */
			return(i);
		}
#endif
	}
#endif

	if (optcpu & CPU_AltiVec) {
		/* Do this differently:
			1) move fragment from vreg to regb.
				a) store fragment from vreg to a known location
				b) load it into regb from that location
			2) load address of s into rega.
				Use gcc magic
			3) lvx_m2r (vd, rega, regb).
		*/
		i = loadrrop(s, frag, 0);
		j = loadrrop(s, frag, 8);
		/* This should generate the correct vector because a fragment
		   offset would need the same index */
		k = lvslop(s, frag);
		i = trinop(TPERM, i, j, k, typ8u);
		i = trinop(TPERM,
			i,
			i,
			immedu((p128_t)
				{{0x01020304050607ULL, 0x08090a0b0c0d0e0fULL}}),
			typ8u);
	} else {
		/* This should load the effective address into i */
		i = leaop(s, frag);

		/* Load the fragment at the effective address */
		i = loadrop(i);
	}

	/* Mask and align at LSb */
	i = binop(SHR, i, position, typnull);

	/* Build the one-field mask.  Shifting a 64-bit value by 64 or more
	   is undefined in C, so fields that fill (or exceed) the 64-bit
	   immediate get an explicit all-ones mask instead of
	   (1ULL<<bpf)-1ULL. */
	mask = (bpf >= 64ULL) ? ~0ULL : ((1ULL << bpf) - 1ULL);
	return (binop(AND, i, immed64u((p64_t)mask), typnull));
}

/*	load_align()

	Perform data alignment on the loaded value in tuple i for symbol s
	and return the index of the aligned result.

	SWAR objects:	swap the 64-bit halves of the fragment.
	Other objects:	place the object in virtual field 0 and, for
			signed types, sign-extend it.  128-bit non-SWAR
			objects are reported through bug(); widths not
			listed below are returned unchanged.
*/
int
load_align(sym *s,
int i)
{
	const int is_unsigned = ((s->type.attr & TYP_UNSIGN) != 0);

	/* SWAR fragment: a single byte permute does the whole job */
	if (s->type.attr & TYP_SWAR) {
		return(trinop(TPERM,
			      i,
			      i,
			      immedu((p128_t)
				{{0x01020304050607ULL,
				  0x08090a0b0c0d0e0fULL}}),
			      typ8u));
	}

	/* Non-SWAR object: treatment depends on its width */
	switch (s->type.bits) {
	case 128:
		bug("uload of 128 bit non-SWAR type not supported");
		break;

	case 64:
		if (!is_unsigned) {
			/* Signed: replicate, then shift right per field */
			i = binop(REPL,
				  immed64u((p64_t)((unsigned long long)
						   target_field(0,64))),
				  i,
				  typ32u);
			i = binop(SHR,
				  i,
				  immedu((p128_t)
					{{0x0000003f00000000ULL,
					  0x0000003f0000003fULL}}),
				  typ32);
		} else {
			/* Unsigned: permute against a zero vector */
			i = trinop(TPERM,
				   immed64u((p64_t)0x0ULL),
				   i,
				   immed64u((p64_t)0x11121314151617ULL),
				   typ8u);
		}
		break;

	case 32:
		i = binop(REPL, immed64u((p64_t)0ULL), i, typ32u);
		if (!is_unsigned) {
			i = binop(SHR,
				  i,
				  immedu((p128_t)
					{{0x0000001f00000000ULL,
					  0x0000001f0000001fULL}}),
				  typ32);
		} else {
			i = binop(AND,
				  i,
				  immedu((p128_t)
					{{0x00000000ffffffffULL,
					  0x0000000000000000ULL}}),
				  typ32u);
		}
		break;

	case 16:
		i = binop(REPL, immed64u((p64_t)0ULL), i, typ16u);
		if (!is_unsigned) {
			i = binop(SHR,
				  i,
				  immedu((p128_t)
					{{0x000f000f000f0000ULL,
					  0x000f000f000f000fULL}}),
				  typ16);
		} else {
			i = binop(AND,
				  i,
				  immedu((p128_t)
					{{0x000000000000ffffULL,
					  0x0000000000000000ULL}}),
				  typnull);
		}
		break;

	case 8:
		i = binop(REPL, immed64u((p64_t)0ULL), i, typ8u);
		if (!is_unsigned) {
			i = binop(SHR,
				  i,
				  immedu((p128_t)
					{{0x0707070707070700ULL,
					  0x0707070707070707ULL}}),
				  typ8);
		} else {
			i = binop(AND,
				  i,
				  immedu((p128_t)
					{{0x00000000000000ffULL,
					  0x0000000000000000ULL}}),
				  typnull);
		}
		break;
	}

	return i;
}

/*	uloadop()

	Create the tuple tree for an unaligned load of fragment frag of
	symbol s and return the index of the aligned result.

	Before building anything, the existing tuples are scanned (newest
	first) for a STORE to the same symbol/fragment whose stored value
	has the shape an unaligned store would have produced; if one is
	found the value that was stored is reused and its reference count
	is incremented.  Otherwise two aligned loads (offsets 0 and 8) are
	combined through TPERM with an LVSL alignment vector and the result
	is passed through load_align().
*/
int
uloadop(sym *s,
int frag)
{
	/* Creates a tuple for an unaligned load operation */
	register int i, j, k, l;

	#ifdef DEBUG
		info(0, "Start uloadop()" );
	#endif

	/* If possible, we want to eliminate this before breaking
	   it up into aligned loads.
	*/
	for (i=tupsp-1; i>=0; --i) {
		if ((s->type.attr & TYP_SWAR) || (s->type.bits == 128)) {
			if ((tup[i].op == STORE) &&
			    (tup[i].symbol == s) &&
			    (tup[i].fragment == frag) &&
			    (tup[i].offset == 12) &&
			    (tup[i].refs == 1)) {
				j = tup[i].arg[0];
				if ((tup[j].op == TPERM) &&
				    (tup[j].arg[0] == tup[j].arg[1]) &&
				    (tup[tup[j].arg[2]].op == NUM)) {
					/* NOTE(review): j is reloaded with
					   the same value, so the LVSL test
					   below contradicts the NUM test
					   above and this reuse can never
					   trigger.  Possibly
					   j = tup[j].arg[0] was intended;
					   left unchanged pending
					   confirmation. */
					j = tup[i].arg[0];
					if ((tup[j].arg[0] == tup[j].arg[1]) &&
					    (tup[tup[j].arg[2]].op == LVSL)) {
						/* UStore to same place:
						   reuse value */
/*XXX*/	fprintf(Cout, "%d-reusing %s:%d\n", __LINE__, s->text, frag);

						i = tup[j].arg[0];
						++(tup[i].refs);
						return(i);
					}
				}
			}
		} else {
			switch (s->type.bits) {
			case 64:
				if ((tup[i].op == STORE) &&
				    (tup[i].symbol == s) &&
				    (tup[i].fragment == frag) &&
				    (tup[i].offset == 4) &&
				    (tup[i].refs == 1)) {
					j = tup[i].arg[0];
					if ((tup[j].op == TPERM) &&
					    (tup[j].arg[0] == tup[j].arg[1]) &&
					    (tup[tup[j].arg[2]].op == NUM) &&
					    (tup[tup[j].arg[2]].immed.uq[1] == 
						0x08090a0b0c0d0e0fULL) &&
					    (tup[tup[j].arg[2]].immed.uq[0] == 
						0x08090a0b0c0d0e0fULL)) {
							/* UStore to same place:
							   reuse value */
/*XXX*/	fprintf(Cout, "%d-reusing %s:%d\n", __LINE__, s->text, frag);
							i = tup[j].arg[0];
							++(tup[i].refs);
							return(i);
					}
				}
				break;
			case 32:
				if ((tup[i].op == STORE) &&
				    (tup[i].symbol == s) &&
				    (tup[i].fragment == frag) &&
				    (tup[i].offset == 0) &&
				    (tup[i].refs == 1)) {
					j = tup[i].arg[0];
					/* Both halves of the immediate are
					   read from operand 0, the one just
					   checked to be a NUM.  REPL tuples
					   made by binop() have arg[2] == -1,
					   so the earlier tup[tup[j].arg[2]]
					   read was out of bounds. */
					if ((tup[j].op == REPL) &&
					    (tup[j].type.bits == 32) &&
					    (tup[tup[j].arg[0]].op == NUM) &&
					    (tup[tup[j].arg[0]].immed.uq[1] == 
						(unsigned long long)
						target_field(0,32)) &&
					    (tup[tup[j].arg[0]].immed.uq[0] == 
						0x0ULL)) {
							/* UStore to same place:
							   reuse value */
/*XXX*/	fprintf(Cout, "%d-reusing %s:%d\n", __LINE__, s->text, frag);
							i = tup[j].arg[1];
							++(tup[i].refs);
							return(i);
					}
				}
				break;
			case 16:
			case 8:
			default:
				info(0, "case not implemented in uloadop()");
				break;
			}
		}

#ifdef NOTYET
		/* The first part of an identical uload looks like this:
			tup[i]:	op == TPERM
				arg[0]: op == LOAD
					symbol == s
					fragment == frag
					offset == 0
				arg[1]: op == LOAD
					symbol == s
					fragment == frag
					offset == 8
				arg[2]: op == LVSL
					symbol == s
					fragment == frag
		*/

		if (tup[i].op == TPERM) {
			j = tup[i].arg[0];
			k = tup[i].arg[1];
			l = tup[i].arg[2];

			if ((tup[j].op == LOAD) &&
			    (tup[j].symbol == s) &&
			    (tup[j].fragment == frag) &&
			    (tup[j].offset == 0) &&

			    (tup[k].op == LOAD) &&
			    (tup[k].symbol == s) &&
			    (tup[k].fragment == frag) &&
			    (tup[k].offset == 8) &&

			    (tup[l].op == LVSL) &&
			    (tup[l].symbol == s) &&
			    (tup[l].fragment == frag)) {
				/* Identical load, reuse it */
/*XXX*/	fprintf(Cout, "%d-reusing %s:%d\n", __LINE__, s->text, frag);
				return(load_align(s,i));
			}
		}
#endif /* NOTYET */
	}

	/* On AltiVec, this will place the data object in hardware field 0 */
	j = loadop(s, frag, 0);
	k = loadop(s, frag, 8);
	l = lvslop(s, frag);
	i = trinop(TPERM, j, k, l, typ8u);
	return(load_align(s,i));
}

/*	uloadpadop()

	Create the tuple tree for an unaligned load that is used to read
	the padding around a non-SWAR value, and return the index of the
	aligned result.

	A regular load would pick up the value most recently stored, which
	is wrong as far as the padding is concerned.  The valid field
	values obtained this way may be wrong, but they are masked out
	afterwards anyway.
*/
int
uloadpadop(sym *s,
int frag)
{
	int scan;
	int lo, hi, align;
	int perm;

	#ifdef DEBUG
		info(0, "Start uloadpadop()" );
	#endif

	/* Try to find an existing uload whose first part (everything
	   before the data alignment) is identical.  It has the shape

		TPERM( LOAD(s, frag, offset 0),
		       LOAD(s, frag, offset 8),
		       LVSL(s, frag) )
	*/
	for (scan = tupsp; scan-- > 0; ) {
		if (tup[scan].op != TPERM) {
			continue;
		}

		lo = tup[scan].arg[0];
		hi = tup[scan].arg[1];
		align = tup[scan].arg[2];

		/* Low half: aligned load at offset 0 */
		if ((tup[lo].op != LOAD) ||
		    (tup[lo].symbol != s) ||
		    (tup[lo].fragment != frag) ||
		    (tup[lo].offset != 0)) {
			continue;
		}

		/* High half: aligned load at offset 8 */
		if ((tup[hi].op != LOAD) ||
		    (tup[hi].symbol != s) ||
		    (tup[hi].fragment != frag) ||
		    (tup[hi].offset != 8)) {
			continue;
		}

		/* Alignment vector for the same symbol and fragment */
		if ((tup[align].op != LVSL) ||
		    (tup[align].symbol != s) ||
		    (tup[align].fragment != frag)) {
			continue;
		}

		/* Identical load, reuse it */
		return(load_align(s, scan));
	}

	/* On AltiVec, this will place the data object in hardware field 0 */
	lo = loadpadop(s, frag, 0);
	hi = loadpadop(s, frag, 8);
	align = lvslop(s, frag);
	perm = trinop(TPERM, lo, hi, align, typ8u);
	return(load_align(s, perm));
}


/*	loadop()

	Return the index of a tuple holding the value at (s, frag, off).

	The tuples are scanned newest first.  A STORE to the same location
	with a reference count of 1 yields the stored value itself (its
	reference count is incremented); an identical LOAD is returned
	as-is.  When neither is found a new LOAD tuple is appended.
*/
int
loadop(sym *s,
int frag,
int off)
{
	int scan;
	int slot;

	#ifdef DEBUG
		info(0, "Start loadop()" );
	#endif

	/* Do we already know what's being loaded? */
	for (scan = tupsp; scan-- > 0; ) {
		if (tup[scan].op == STORE) {
			if ((tup[scan].symbol == s) &&
			    (tup[scan].fragment == frag) &&
			    (tup[scan].offset == off) &&
			    (tup[scan].refs == 1)) {
				/* Last store to same place: hand back the
				   value that was stored */
				const int value = tup[scan].arg[0];

				++(tup[value].refs);
				return(value);
			}
		} else if (tup[scan].op == LOAD) {
			if ((tup[scan].symbol == s) &&
			    (tup[scan].fragment == frag) &&
			    (tup[scan].offset == off)) {
				/* Identical load, reuse it */
				return(scan);
			}
		}
	}

	/* Nothing to reuse: emit a new operand-less LOAD */
	slot = tupsp++;
	tup[slot].op = LOAD;
	tup[slot].refs = 0;
	tup[slot].symbol = s;
	tup[slot].fragment = frag;
	tup[slot].offset = off;
	tup[slot].arg[0] = -1;
	tup[slot].arg[1] = -1;
	tup[slot].arg[2] = -1;
	return(slot);
}

int
loadpadop(sym *s,
int frag,
int off)
{
	/* Load used when a non-swar value is read to get at its padding.
	   Unlike loadop(), stores are ignored here: a regular load would
	   return the value stored last, and that is wrong in terms of the
	   padding.  The valid field values may come out wrong this way,
	   but they are masked out anyway.

	   Returns an existing LOAD tuple for the same symbol, fragment and
	   offset when there is one, otherwise a newly appended one.
	*/
	int n, slot;

	#ifdef DEBUG
		info(0, "Start loadpadop()" );
	#endif

	for (n = tupsp; n-- > 0; ) {
		if (tup[n].op != LOAD) continue;
		if (tup[n].symbol != s) continue;
		if (tup[n].fragment != frag) continue;
		if (tup[n].offset != off) continue;

		/* Identical load, reuse it */
		return(n);
	}

	/* Nope: append a fresh LOAD tuple */
	slot = tupsp++;
	tup[slot].op = LOAD;
	tup[slot].symbol = s;
	tup[slot].fragment = frag;
	tup[slot].offset = off;
	tup[slot].arg[0] = -1;
	tup[slot].arg[1] = -1;
	tup[slot].arg[2] = -1;
	tup[slot].refs = 0;
	return(slot);
}

void
storerrop(int data,
sym *s,
int frag,
int off,
typ t)
{
	/* Appends a STORERR tuple.  The fragment is passed as a tuple
	   (arg[0]) rather than as a constant, so the fragment field of the
	   new tuple is set to -1; data is recorded in arg[1].
	*/
	int slot;

	#ifdef DEBUG
		info(0, "Start storerrop()" );
	#endif

	slot = tupsp++;

	/* What and where */
	tup[slot].op = STORERR;
	tup[slot].type = t;
	tup[slot].symbol = s;
	tup[slot].offset = off;
	tup[slot].fragment = -1;

	/* Operands */
	tup[slot].arg[0] = frag;
	tup[slot].arg[1] = data;
	tup[slot].arg[2] = -1;

	tup[slot].refs = 1;
}

void
storerop(int data, int address)
{
	/* Appends a STORER tuple: a store of tuple data through the
	   address held in tuple address.
	   Should gen: movq_r2x(data, address);
	*/
	int slot;

	#ifdef DEBUG
		info(0, "Start storerop()" );
	#endif

	slot = tupsp++;
	tup[slot].op = STORER;
	tup[slot].refs = 1;
	tup[slot].arg[0] = address;
	tup[slot].arg[1] = data;
	tup[slot].arg[2] = -1;
}

/* Creates the tuples for a store of one field into an indexed object.

	data	tuple holding the value to be stored
	s	symbol being stored into
	index	handed to binop() as the element-index operand

   The element index is split into a fragment (word) number and the bit
   position of the field within that fragment.  The fragment is then
   read, the field is replaced under a mask, and the fragment is written
   back (STORERR tuples on AltiVec, a STORER tuple otherwise).
*/
void
storexop(int data,
sym *s,
int index)
{
	/* Store, and incrementally kill dead code and check dependencies. */
	register int i, j, k, l;
	register int position, frag;
	register unsigned long long bpf, fpf;
	unsigned long long fieldmask;

	#ifdef DEBUG
		info(0, "Start storexop()" );
	#endif

	/* No load known to precede this */
	/* NOTE(review): this marks the slot at the current tupsp, but the
	   binop()/immed calls below presumably append tuples before the
	   store itself is emitted -- confirm which tuple ends up carrying
	   this antidep. */
	tup[tupsp].antidep = -1;

	for (i=tupsp-1; i>=0; --i) {
		#ifdef NOTNOW
			/* Zero reference (i.e. mark for removal) dead stores */
			if ((tup[i].op == STORE) &&
			    (tup[i].symbol == s) &&
			    (tup[i].fragment == frag) &&
			    (tup[i].refs == 1)) {
				/* Store to same place is now dead */
				tup[ tup[i].arg[0] ].refs = 0;
				tup[i].refs = 0;
			}
		#endif

		/* Find and note anti-dependencies */
		/* For now, assume any hits to the same symbol will affect this
		   fragment.  Horribly non-optimal and possibly incorrect. */
		if ((tup[i].op == LOAD) &&
		    (tup[i].symbol == s)) {
			tup[tupsp].antidep = i;
		}
		if ((tup[i].op == LOADR) &&
		    (tup[ tup[i].arg[0] ].symbol == s)) {
			/* Assume LOADR must precede this */
			tup[tupsp].antidep = i;
		}
	}

	bpf = (unsigned long long)s->type.bits;
	fpf = fieldsperfrag (s->type.bits);

	/* Mask covering one field at bit 0.  A shift by the full width of
	   the type is undefined in C (and yields an empty mask on x86), so
	   the 64-bit field is handled explicitly. */
	fieldmask = (bpf >= 64ULL) ? 0xffffffffffffffffULL
				   : ((1ULL << bpf) - 1ULL);

	if (fpf%2 == 0) {
		/* Even number of fields per fragment: split the index with
		   shifts and masks.  (fpf == 1 is odd, so "case 1" below is
		   not reached from here.) */
		register unsigned long long pos, off, scount;

		switch(fpf) {
		case 1: off=(unsigned long long)index; pos=0ULL; break;
		case 2: off=1ULL; pos=1ULL; break;
		case 4: off=2ULL; pos=3ULL; break;
		case 8: off=3ULL; pos=7ULL; break;
		case 16: off=4ULL; pos=15ULL; break;
		case 32: off=5ULL; pos=31ULL; break;
		case 64: off=6ULL; pos=63ULL; break;
		default:
			{
				char buf[256];
				snprintf(buf,
					 256,
					 "storexop(s=\"%s\",i=%d): Unsupported "
					 "fpf=%llu, bits=%d",
					 s->text,
					 index,
					 fpf,
					 s->type.bits);
				bug(buf);
				off=0ULL; pos=0ULL;
			}
		}

		/* scount is log2 of the field width */
		switch(bpf) {
		case 1: scount=0ULL; break;
		case 2: scount=1ULL; break;
		case 4: scount=2ULL; break;
		case 8: scount=3ULL; break;
		case 16: scount=4ULL; break;
		case 32: scount=5ULL; break;
		case 64: scount=6ULL; break;
		default:
			{
				char buf[256];
				snprintf(buf,
					 256,
					 "storexop():Unsupported bpf=%llu",
					 bpf);
				bug(buf);
				scount=0ULL;
			}
		}

		if (!pos) {
			position = immed64u((p64_t)0ULL);
			frag = immed64u((p64_t)off);
		} else {
			/* position = (index & pos) << scount */
			position = binop(SHL,
					 binop(AND,
					       index,
					       immed64u((p64_t)pos),
					       typnull),
					 immed64u((p64_t)scount),
					 typnull);
			/* frag = index >> off */
			frag = binop(SHR,
				     index,
				     immed64u((p64_t)off),
				     typnull);
		}
	} else {
		/* Odd number of fields per fragment: use MOD/MUL and DIV */
		position = binop(MUL,
			 binop(MOD, index, immed64u((p64_t)fpf),typnull),
			 immed64u((p64_t)bpf),
			 typnull);
		frag = binop(DIV, index, immed64u((p64_t)fpf), typnull);
	}

	/* At this point, frag is the tuple containing the word
	   number, and position is the lsb of the field in the word. */

	if (optcpu & CPU_AltiVec) {
		i = loadrrop(s, frag, 0);
		j = loadrrop(s, frag, 8);
		/* This should generate the correct vector because a fragment
		   offset would need the same index */
		k = lvslop(s, frag);
		i = trinop(TPERM, i, j, k, typ8u);
		i = trinop(TPERM,
			   i,
			   i,
			   immedu((p128_t)
				{{0x01020304050607ULL, 0x08090a0b0c0d0e0fULL}}),
			   typ8u);

		/* Now we need to insert the field, then store the fragment */
		/* Get field mask */
		j = immed64u((p64_t)fieldmask);
		j = binop(SHL, j, position, typnull);

		/* Mask out old field */
		i = binop(AND, i, unop(NOT, j, typnull), typnull);

		/* Align data to be stored at LSb of field and insert */
		l = binop(SHL, data, position, typnull);
		l = binop(AND, j, l, typnull);
		i = binop(OR, i, l, typnull);

		/* Permute the updated fragment i back and store it as four
		   32-bit pieces at offsets 0, 4, 8 and 12 */
		i = trinop(TPERM,
			   i,
			   i,
			   immedu((p128_t)
				{{0x01020304050607ULL, 0x08090a0b0c0d0e0fULL}}),
			   typ8u);
		i = trinop(TPERM, i, i, k, typ8u);
		storerrop(i, s, frag, 0, typ32);
		storerrop(i, s, frag, 4, typ32);
		storerrop(i, s, frag, 8, typ32);
		storerrop(i, s, frag, 12, typ32);

	} else {
		/* This should load the effective address into i */
		i = leaop(s, frag);

		/* Load the fragment at the effective address */
		j = loadrop(i);

		/* Now we need to insert the field, then store the fragment */
		/* Get field mask */
		k = immed64u((p64_t)fieldmask);
		k = binop(SHL, k, position, typnull);

		/* Mask out old field */
		j = binop(AND, j, unop(NOT, k, typnull), typnull);

		/* Align data to be stored at LSb of field and insert */
		l = binop(SHL, data, position, typnull);
		l = binop(AND, k, l, typnull);
		j = binop(OR, j, l, typnull);

		/* Store the updated fragment j at the effective address i */
		storerop(j, i);
	}

	return;
}

void
ustoreop(int data,
sym *s,
int frag,
typ t)
{
	/* Creates the tuples for an unaligned store of data into fragment
	   frag of symbol s.  The value is permuted into place and then
	   written with 32-bit storeop() pieces.  The t argument is not
	   consulted here.
	*/
	int v;

	#ifdef DEBUG
		info(0, "Start ustoreop()" );
	#endif

	if ((s->type.attr & TYP_SWAR) || (s->type.bits == 128)) {
		/* Full fragment: rotate by the LVSL vector, permute, and
		   write all four 32-bit pieces */
		v = lvslop(s, frag);
		v = trinop(TPERM, data, data, v, typ8u);
		v = trinop(TPERM,
			   v,
			   v,
			   immedu((p128_t)
				{{0x01020304050607ULL, 0x08090a0b0c0d0e0fULL}}),
			   typ8u);
		storeop(v, s, frag, 0, typ32);
		storeop(v, s, frag, 4, typ32);
		storeop(v, s, frag, 8, typ32);
		storeop(v, s, frag, 12, typ32);
		return;
	}

	if (s->type.bits == 64) {
		/* Two 32-bit pieces */
		v = trinop(TPERM,
			   data,
			   data,
			   immedu((p128_t){{0x08090a0b0c0d0e0fULL,
					    0x08090a0b0c0d0e0fULL}}),
			   typ8u);
		storeop(v, s, frag, 0, typ32);
		storeop(v, s, frag, 4, typ32);
		return;
	}

	if (s->type.bits == 32) {
		/* One 32-bit piece, replicated from target field 0 */
		v = binop(REPL,
			  immed64u((p64_t)((unsigned long long)
					   target_field(0,32))),
			  data,
			  typ32u);
		storeop(v, s, frag, 0, typ32);
		return;
	}

	/* 16, 8 and everything else */
	info(0, "case not implemented in ustoreop()");
}


void
storeop(int data,
sym *s,
int frag,
int off,
typ t)
{
	/* Creates a tuple for a store of data to symbol s, fragment frag,
	   offset off, with type t.

	   While doing so, earlier live stores to the same place are marked
	   dead, and anti-dependencies on earlier loads are noted.  Nothing
	   is appended when the value stored is the very tuple that loaded
	   this place.
	*/
	int n;

	#ifdef DEBUG
		info(0, "Start storeop()" );
	#endif

	/* No load known to precede this */
	tup[tupsp].antidep = -1;

	n = tupsp;
	while (n-- > 0) {
		int same_place = ((tup[n].symbol == s) &&
				  (tup[n].fragment == frag) &&
				  (tup[n].offset == off));

		/* Zero reference (i.e. mark for removal) dead stores */
		if (same_place &&
		    (tup[n].op == STORE) &&
		    (tup[n].refs == 1)) {
			/* Store to same place is now dead, and so is the
			   value it was storing */
			tup[ tup[n].arg[0] ].refs = 0;
			tup[n].refs = 0;
		}

		/* Find and note anti-dependencies */
		if (same_place && (tup[n].op == LOAD)) {
			/* This load must precede the store -- unless we
			   are storing back the same value that we loaded,
			   in which case the store is a nop! */
			if (data == n) {
				return;
			}
			tup[tupsp].antidep = n;
		}

		if ((tup[n].op == LOADR) &&
		    (tup[ tup[n].arg[0] ].symbol == s)) {
			/* Assume LOADR must precede this */
			tup[tupsp].antidep = n;
		}
	}

	/* Nope: fill in the STORE tuple */
	tup[tupsp].op = STORE;
	tup[tupsp].type = t;
	tup[tupsp].symbol = s;
	tup[tupsp].fragment = frag;
	tup[tupsp].offset = off;
	tup[tupsp].arg[0] = data;
	tup[tupsp].arg[1] = -1;
	tup[tupsp].arg[2] = -1;
	tup[tupsp].refs = 1;
	++tupsp;
}

void
deadscope(int serial)
{
	/* Every still-referenced STORE whose symbol carries this scope
	   serial number is dead: zero its reference count. */
	int n = 0;

	while (n < tupsp) {
		if ((tup[n].op == STORE) && (tup[n].refs > 0)) {
			if (tup[n].symbol->serial == serial) {
				tup[n].refs = 0;
			}
		}
		++n;
	}
}


int
const_replicate(int i,
int bits)
{
	/* Returns the index to a immediate tuple which is the constant
	   value equal to the low field value replicated to a partitioned
	   value with fields of "bits" bits.
	*/
	p128_t mask;
	p128_t rval;

	#ifdef DEBUG
		info(0, "Start const_replicate()" );
	#endif

	/* Make a mask */
	switch (bitsperfrag()) {
	case 128:
		{
		if (bits>63) {
			mask.uq[1] = (1ULL << ((unsigned long long)bits-64ULL))
				    - 1ULL;
			mask.uq[0] = 0xffffffffffffffffULL;
		} else {
			mask.uq[1] = 0ULL;
			mask.uq[0] = (1ULL << (unsigned long long) bits) - 1ULL;
		}
		break;
		}
	case 64:
		mask.uq[1] = 0ULL;
		mask.uq[0] = (1ULL << (unsigned long long)bits) - 1ULL;
		break;
	case 32:
		mask.uq[1] = mask.ud[1] = 0U;
		mask.ud[0] = (1U << (unsigned int) bits) - 1U;
		break;
	}

	rval.uq[1] = mask.uq[1] & tup[i].immed.uq[1];
	rval.uq[0] = mask.uq[0] & tup[i].immed.uq[0];

	switch (bits) {
	case 1:
		rval.uq[0] = rval.uq[0] | (rval.uq[0] << 1);
	case 2:
		rval.uq[0] = rval.uq[0] | (rval.uq[0] << 2);
	case 4:
		rval.uq[0] = rval.uq[0] | (rval.uq[0] << 4);
	case 8:
		rval.uq[0] = rval.uq[0] | (rval.uq[0] << 8);
	case 16:
		rval.uq[0] = rval.uq[0] | (rval.uq[0] << 16);
	case 32:
		if (bitsperfrag() <= 32) break;
		rval.uq[0] = rval.uq[0] | (rval.uq[0] << 32);
	case 64:
		if (bitsperfrag() <= 64) break;
		rval.uq[1] = rval.uq[0];
		break;
	default:
	    {
		char buf[256];
		snprintf(buf,
			 256,
			 "Unsupported field size %d in const_replicate()",
			 bits);
		bug(buf);
		break;
	    }
	}
	return (immed128(rval));
}

int
replicate(int i,
int bits)
{
	/* Builds and returns the index to a tuple tree for converting
	   a single-field value into a replicated partitioned value with
	   fields of "bits" bits.
	*/
	#ifdef DEBUG
		info(0,"Start replicate()");
	#endif

	/* Find the actual number of bits/field for the target type */
	bits = bitsperfield(bits);

	/* Optimization - Do replicates on constants statically */
	if (tup[i].op == NUM) return ( const_replicate(i, bits) );

	/* Create a bitmask to separate the data in the lower "bits" bits of
	   the fragment from the stuff in the rest of the fragment.
	*/
	switch (bitsperfrag()) {
	case 128:
		if ((optcpu & CPU_AltiVec) &&
		    ((bits==8) || (bits==16) || (bits==32))) {
		    	/* The splat instructions will work for these, so... */
			break;
		} else if (((optcpu & CPU_athlon) || (optcpu & CPU_MAX)) &&
		           ((bits==16) || (bits==32))) {
			break;
		} else {
			p128_t tmp;
			if (bits>63) {
				tmp.uq[1] = (1ULL <<
					     ((unsigned long long)bits-64ULL))
					    - 1ULL;
				tmp.uq[0] = 0xffffffffffffffffULL;
			} else {
				tmp.uq[1] = 0ULL;
				tmp.uq[0] = (1ULL << (unsigned long long) bits)
					    - 1ULL;
			}
			i = binop(AND, i, immed128(tmp), typnull);
			break;
		}
	case 64:
		i = binop(AND,
			  i,
			  immed64u((p64_t)
				   ((1ULL << (unsigned long long)bits) - 1ULL)),
			  typnull);
		break;
	case 32:
		i = binop(AND,
			  i,
			  immed32u((p32_t) ((1U << (unsigned int) bits) - 1U)),
			  typnull);
		break;
	}

	/* Now replicate the low field in the rest of the fragment */
	switch (bits) {
	case 1:
		/* Sneaky sequence uses 32-bit neg... */
		switch (bitsperfrag()) {
		case 128:
			if (optcpu & CPU_AltiVec) {
				i = unop(NEG, i, typ32);
				return(binop(REPL,
					immed64u((p64_t)((unsigned long long)
							 target_field(0,32))),
					i,
					typ32u));
			} else {
				register int j;

				i = unop(NEG, i, typ32);
				j = binop(SHL,
					  i,
					  immed64u((p64_t)32ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
				j = binop(SHL,
					  i,
					  immed64u((p64_t)64ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
			}
			break;
		case 64:
			if (0) {
				/* Use this when SUB64s is available... */
				/* The 64-bit integer archs (like the Alpha)
				   should have this. */
				i = unop(NEG, i, typ64);
			} else {
				/* else use this when SUB32s is available... */
				i = unop(NEG, i, typ32);
				i = binop(OR,
					  i,
					  binop(SHL,
						i,
						immed64u((p64_t) 32ULL),
						typnull),
					  typnull);
			}
			break;
		case 32:
			i = unop(NEG, i, typ32);
			break;
		}
		break;
	case 2:
		/* Trick from 1-bit doesn't work... need tree */
		i = binop(OR,
			  i,
			  binop(SHL,
				i,
				(optcpu & CPU_AltiVec)?
					immedu(cvt1x8uto16x8u(0x02)) :
					immed64u((p64_t) 2ULL),
				typnull),
			  typnull);
		/* Fall through... */
	case 4:
		i = binop(OR,
			  i,
			  binop(SHL,
				i,
				(optcpu & CPU_AltiVec)?
					immedu(cvt1x8uto16x8u(0x04)) :
					immed64u((p64_t) 4ULL),
				typnull),
			  typnull);
		/* Fall through... */
	case 8:
		if (optcpu & CPU_AltiVec) {
			return(binop(REPL,
				     immed64u((p64_t)((unsigned long long)
							target_field(0,8))),
				     i,
				     typ8u));
		} else {
			i = binop(OR,
				  i,
				  binop(SHL,
					i,
					immed64u((p64_t) 8ULL),
					typnull),
				  typnull);
		}
		/* Fall through... */
	case 16:
		if (optcpu & CPU_AltiVec) {
			return(binop(REPL,
				     immed64u((p64_t)((unsigned long long)
							target_field(0,16))),
				     i,
				     typ16u));
		} else if (optcpu & CPU_athlon) {
			return(binop(PERM, i, immed32u((p32_t) 0), typ16u));
		} else if (optcpu & CPU_MAX) {
			return(binop(PERM,
				     i,
				     immed32u((p32_t) 3333),
				     typ16u));
		} else {
			i = binop(OR,
				  i,
				  binop(SHL,
					i,
					immed64u((p64_t) 16ULL),
					typnull),
				  typnull);
		}

		/* 32-bit registers should end here. */
		if (bitsperfrag() == 32) break;

		/* Others fall through... */
	case 32:
		if (optcpu & CPU_AltiVec) {
			return(binop(REPL,
				     immed64u((p64_t)((unsigned long long)
							target_field(0,32))),
				     i,
				     typ32u));
		} else if (optcpu & CPU_athlon) {
			return(binop(PERM,
				     i,
				     immed32u((p32_t) 0x44),
				     typ16u));
		} else if (optcpu & CPU_MAX) {
			return(binop(PERM,
				     i,
				     immed32u((p32_t) 2323),
				     typ16u));
		} else {
			i = binop(OR,
				  i,
				  binop(SHL,
					i,
					immed64u((p64_t) 32ULL),
					typnull),
				  typnull);
			break;
		}

		/* 64-bit registers should end here. */
		if (bitsperfrag() == 64) break;

		/* Others fall through... */
	case 64:
		if (bitsperfrag() == 128) {
			i = binop(OR,
				  i,
				  binop(SHL,
					i,
					immed64u((p64_t) 64ULL),
					typnull),
				  typnull);
			/* 128-bit registers should end here. */
			break;
		}

		bug("replicate() reports unsupported CPU_type"); 

	default:
		bug("bad replicate width (this cannot happen?)"); 
	}

	return(i);
} /* replicate() */



static void
ia32_pack(int sb,
int *sr,
int db,
int *dr,
int sw)
{
	/* Pack sw words from sr into dr */
	register int i;

	if ((sb == 32) && (db == 16)) {
		/* This case does happen, but should it? */
		/* Pack vectors which have an odd number of elements with 0 */
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed32u((p32_t) 0),
					    typ16);
			sw &= ~1;
		}
		/* Now pack the rest odd/even */
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ16);
		}
		return;
	}

	if ((sb == 16) && (db == 8)) {
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed32((p32_t) 0),
					    typ8);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ8);
		}
		return;
	}

	if ((sb == 8) && (db == 4)) {
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed32((p32_t) 0),
					    typ4);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ4);
		}
		return;
	}

	if ((sb == 4) && (db == 2)) {
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed32((p32_t) 0),
					    typ2);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ2);
		}
		return;
	}

	if ((sb == 2) && (db == 1)) {
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed32((p32_t) 0),
					    typ1);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ1);
		}
		return;
	}


	/* Recursively apply multi-step conversions */
	if (sb > db) {
		#ifdef DEBUG_IA32PACK
			char buf[256];
			snprintf ( buf,
				   256,
				   "ia32_pack sb=%d db=%d",
				   sb,
				   db );
			info(0, buf );
		#endif
		ia32_pack(sb, sr, (sb / 2), dr, sw);
		sw = ((sw + 1) / 2);
		for (i=0; i<sw; ++i) sr[i] = dr[i];
		ia32_pack((sb / 2), sr, db, dr, sw);
	}
} /* ia32_pack() */

static void
mmx_pack(int sb,
int *sr,
int db,
int *dr,
int sw)
{
	/* Pack sw words from sr into dr */
	register int i;

	if ((sb == 64) && (db == 32)) {
		/* This case does happen, but should it? */
		/* Pack vectors which have an odd number of elements with 0 */
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ32);
			sw &= ~1;
		}
		/* Now pack the rest odd/even */
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ32);
		}
		return;
	}

	if ((sb == 32) && (db == 16)) {
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ16);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ16);
		}
		return;
	}

	if ((sb == 16) && (db == 8)) {
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ8);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ8);
		}
		return;
	}

	if ((sb == 8) && (db == 4)) {
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ4);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ4);
		}
		return;
	}

	if ((sb == 4) && (db == 2)) {
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ2);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ2);
		}
		return;
	}

	if ((sb == 2) && (db == 1)) {
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ1);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ1);
		}
		return;
	}


	/* Recursively apply multi-step conversions */
	if (sb > db) {
		#ifdef DEBUG_MMXPACK
			char buf[256];
			snprintf ( buf,
				   256,
				   "mmx_pack sb=%d db=%d",
				   sb,
				   db );
			info(0, buf );
		#endif
		mmx_pack(sb, sr, (sb / 2), dr, sw);
		sw = ((sw + 1) / 2);
		for (i=0; i<sw; ++i) sr[i] = dr[i];
		mmx_pack((sb / 2), sr, db, dr, sw);
	}
} /* mmx_pack() */

static void
sse_pack(int sb,
int *sr,
int db,
int *dr,
int sw)
{
	/* Pack sw words from sr into dr */
	/* This function is probably broken. */
	register int i;
	p128_t tmp;

	if ((sb == 128) && (db == 64)) {
		/* This case does happen, but should it? */
		/* Pack vectors which have an odd number of elements with 0 */
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ64);
			sw &= ~1;
		}
		/* Now pack the rest odd/even */
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ64);
		}
		return;
	}

	if ((sb == 64) && (db == 32)) {
		/* Pack vectors which have an odd number of elements with 0 */
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ32);
			sw &= ~1;
		}
		/* Now pack the rest odd/even */
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ32);
		}
		return;
	}

	if ((sb == 32) && (db == 16)) {
		if (sw & 1) {
			tmp.q[1] = 0ULL;
			tmp.q[0] = 0ULL;
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed128(tmp),
					    typ16);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ16);
		}
		return;
	}

	if ((sb == 16) && (db == 8)) {
		if (sw & 1) {
			tmp.q[1] = 0ULL;
			tmp.q[0] = 0ULL;
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed128(tmp),
					    typ8);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ8);
		}
		return;
	}

	if ((sb == 8) && (db == 4)) {
		if (sw & 1) {
			tmp.q[1] = 0ULL;
			tmp.q[0] = 0ULL;
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed128(tmp),
					    typ4);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ4);
		}
		return;
	}

	if ((sb == 4) && (db == 2)) {
		if (sw & 1) {
			tmp.q[1] = 0ULL;
			tmp.q[0] = 0ULL;
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed128(tmp),
					    typ2);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ2);
		}
		return;
	}

	if ((sb == 2) && (db == 1)) {
		if (sw & 1) {
			tmp.q[1] = 0ULL;
			tmp.q[0] = 0ULL;
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed128(tmp),
					    typ1);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ1);
		}
		return;
	}


	/* Recursively apply multi-step conversions */
	if (sb > db) {
		sse_pack(sb, sr, (sb / 2), dr, sw);
		sw = ((sw + 1) / 2);
		for (i=0; i<sw; ++i) sr[i] = dr[i];
		sse_pack((sb / 2), sr, db, dr, sw);
	}
} /* sse_pack() */

static void
max_pack(int sb,
int *sr,
int db,
int *dr,
int sw)
{
	/* Pack sw words from sr into dr */
	register int i;

	if ((sb == 32) && (db == 16)) {
		/* Pack vectors which have an odd number of elements with 0 */
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ16);
			sw &= ~1;
		}
		/* Now pack the rest odd/even */
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ16);
		}
		return;
	}

	if ((sb == 16) && (db == 8)) {
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ8);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ8);
		}
		return;
	}

	if ((sb == 8) && (db == 4)) {
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ4);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ4);
		}
		return;
	}

	if ((sb == 4) && (db == 2)) {
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ2);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ2);
		}
		return;
	}

	if ((sb == 2) && (db == 1)) {
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ1);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ1);
		}
		return;
	}


	/* Recursively apply multi-step conversions */
	if (sb > db) {
		max_pack(sb, sr, (sb / 2), dr, sw);
		sw = ((sw + 1) / 2);
		for (i=0; i<sw; ++i) sr[i] = dr[i];
		max_pack((sb / 2), sr, db, dr, sw);
	}
} /* max_pack() */

static void
altivec_pack(int sb,
int *sr,
int db,
int *dr,
int sw)
{
	/* Pack sw words from sr into dr */
	/* This function is probably broken. */
	register int i;
	p128_t tmp;

	if ((sb == 128) && (db == 64)) {
		/* This case does happen, but should it? */
		/* Pack vectors which have an odd number of elements with 0 */
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ64);
			sw &= ~1;
		}
		/* Now pack the rest odd/even */
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ64);
		}
		return;
	}

	if ((sb == 64) && (db == 32)) {
		/* Pack vectors which have an odd number of elements with 0 */
		if (sw & 1) {
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed64u((p64_t) 0ULL),
					    typ32);
			sw &= ~1;
		}
		/* Now pack the rest odd/even */
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ32);
		}
		return;
	}

	if ((sb == 32) && (db == 16)) {
		if (sw & 1) {
			tmp.q[1] = 0ULL;
			tmp.q[0] = 0ULL;
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed128(tmp),
					    typ16);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ16);
		}
		return;
	}

	if ((sb == 16) && (db == 8)) {
		if (sw & 1) {
			tmp.q[1] = 0ULL;
			tmp.q[0] = 0ULL;
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed128(tmp),
					    typ8);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ8);
		}
		return;
	}

	if ((sb == 8) && (db == 4)) {
		if (sw & 1) {
			tmp.q[1] = 0ULL;
			tmp.q[0] = 0ULL;
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed128(tmp),
					    typ4);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ4);
		}
		return;
	}

	if ((sb == 4) && (db == 2)) {
		if (sw & 1) {
			tmp.q[1] = 0ULL;
			tmp.q[0] = 0ULL;
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed128(tmp),
					    typ2);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ2);
		}
		return;
	}

	if ((sb == 2) && (db == 1)) {
		if (sw & 1) {
			tmp.q[1] = 0ULL;
			tmp.q[0] = 0ULL;
			dr[sw >> 1] = binop(PACK,
					    sr[sw - 1],
					    immed128(tmp),
					    typ1);
			sw &= ~1;
		}
		for (i=0; i<sw; i+=2) {
			dr[i >> 1] = binop(PACK,
					   sr[i],
					   sr[i + 1],
					   typ1);
		}
		return;
	}


	/* Recursively apply multi-step conversions */
	if (sb > db) {
		altivec_pack(sb, sr, (sb / 2), dr, sw);
		sw = ((sw + 1) / 2);
		for (i=0; i<sw; ++i) sr[i] = dr[i];
		altivec_pack((sb / 2), sr, db, dr, sw);
	}
} /* altivec_pack() */

void
pack(int sb,
int *sr,
int db,
int *dr,
int sw)
{
	/* Hands the pack request to the routine for the selected target.
	   The targets are tried in this order and the first match wins:
	   SSE, MMX, GenericIA32, MAX, AltiVec.  Nothing is done when none
	   of them matches.
	*/
	if (optcpu & CPU_SSE) {
		sse_pack(sb, sr, db, dr, sw);
		return;
	}
	if (optcpu & CPU_MMX) {
		mmx_pack(sb, sr, db, dr, sw);
		return;
	}
	if (optcpu == GenericIA32) {
		ia32_pack(sb, sr, db, dr, sw);
		return;
	}
	if (optcpu & CPU_MAX) {
		max_pack(sb, sr, db, dr, sw);
		return;
	}
	if (optcpu & CPU_AltiVec) {
		altivec_pack(sb, sr, db, dr, sw);
	}
}



/* This needs to be looked over */
static void
ia32_interleave(int sb,
int *sr,
int db,
int *dr,
int dw,
int attr)
{
	/* Interleave sw words from sr into dr */

	if (sb == (db / 2)) {
		register int i;
		typ t;
		register int sign = 1;

		if (attr & TYP_UNSIGN) sign = 0;

		switch (db) {
		case 1:		t = typ1u; break;
		case 2:		t = typ2u; break;
		case 4:		t = typ4u; break;
		case 8:		t = typ8u; sign <<= 1; break;
		default:	t = typ16u; sign <<= 1; break;
		}

		for (i=0; i<dw; ++i) {
			if (sign == 2) {
				/* Use Intel recommended
				   trick for signed interleave
				*/
				dr[i] = binop(((i & 1) ? INTRLVHIGH:INTRLVLOW),
					      immed32((p32_t) 0),
					      sr[i / 2],
					      t);
				dr[i] = binop(SHR, dr[i],
					      immed32((p32_t)
						((int) (db / 2))),
					      t);
			} else {
				/* Do ordinary unsigned interleave */
				dr[i] = binop(((i & 1) ? INTRLVHIGH:INTRLVLOW),
					      sr[i / 2],
					      immed32((p32_t) 0),
					      t);
			}
		}
		return;
	}

	/* Recursively interleave... */
	if (sb < db) {
		/* Should I make this sizeof(char) ? */
		register int *p = ((int *) malloc(dw * sizeof(int)));
		register int i, j;

		/* HEREHERE - Is this correct for 32-bit regs? */
		if (sb < 8) {
			/* Always unsigned */
			attr |= TYP_UNSIGN;
		}

		i = dw;
		for (j=sb; j<(db/2); j+=j) {
			i = ((i + 1) / 2);
		}

		ia32_interleave(sb, sr, (sb * 2), p, i, attr);
		ia32_interleave((sb * 2), p, db, dr, dw, attr);
		return;
	}

	error("this interleave conversion not implemented");
} /* ia32_interleave() */

static void
mmx_interleave(int sb,
int *sr,
int db,
int *dr,
int dw,
int attr)
{
	/* Interleave sw words from sr into dr */

	if (sb == (db / 2)) {
		register int i;
		typ t;
		register int sign = 1;

		if (attr & TYP_UNSIGN) sign = 0;

		switch (db) {
		case 1:		t = typ1u; break;
		case 2:		t = typ2u; break;
		case 4:		t = typ4u; break;
		case 8:		t = typ8u; break;
		case 16:	t = typ16u; sign <<= 1; break;
		default:	t = typ32u; sign <<= 1; break;
		}

		for (i=0; i<dw; ++i) {
			if (sign == 2) {
				/* Use Intel recommended
				   trick for signed interleave
				*/
				dr[i] = binop(((i & 1) ? INTRLVHIGH:INTRLVLOW),
					      immed64u((p64_t) 0ULL),
					      sr[i / 2],
					      t);
				dr[i] = binop(SHR, dr[i],
					      immed64u((p64_t)
						((long long) (db / 2))),
					      t);
			} else {
				/* Do ordinary unsigned interleave */
				dr[i] = binop(((i & 1) ? INTRLVHIGH:INTRLVLOW),
					      sr[i / 2],
					      immed64u((p64_t) 0ULL),
					      t);
			}
		}
		return;
	}

	/* Recursively interleave... */
	if (sb < db) {
		register int *p = ((int *) malloc(dw * sizeof(int)));
		register int i, j;

		if (sb < 8) {
			/* Always unsigned */
			attr |= TYP_UNSIGN;
		}

		i = dw;
		for (j=sb; j<(db/2); j+=j) {
			i = ((i + 1) / 2);
		}

		mmx_interleave(sb, sr, (sb * 2), p, i, attr);
		mmx_interleave((sb * 2), p, db, dr, dw, attr);
		return;
	}

	error("this interleave conversion not implemented");
} /* mmx_interleave() */

static void
sse_interleave(int sb,
int *sr,
int db,
int *dr,
int dw,
int attr)
{
	/* Interleave sw words from sr into dr */

	p128_t tmp;

	if (sb == (db / 2)) {
		register int i;
		typ t;
		register int sign = 1;

		if (attr & TYP_UNSIGN) sign = 0;

		switch (db) {
		case 1:		t = typ1u; break;
		case 2:		t = typ2u; break;
		case 4:		t = typ4u; break;
		case 8:		t = typ8u; break;
		case 16:	t = typ16u; break;
		case 32:	t = typ32u; sign <<= 1; break;
		default:	t = typ64u; sign <<= 1; break;
		}

		for (i=0; i<dw; ++i) {
			if (sign == 2) {
				/* Use Intel recommended
				   trick for signed interleave
				*/
				tmp.q[1] = 0ULL;
				tmp.q[0] = 0ULL;
				dr[i] = binop(((i & 1) ? INTRLVHIGH:INTRLVLOW),
					      immed128(tmp),
					      sr[i / 2],
					      t);

				/* What should the limit be on db?
				   It is currently an int, so I can get away
				   with zeroing the upper qword.
				*/
				tmp.q[1] = 0ULL;
				tmp.q[0] = ((long long) (db / 2)),
				dr[i] = binop(SHR, dr[i],
					      immed128(tmp),
					      t);
			} else {
				/* Do ordinary unsigned interleave */
				tmp.q[1] = 0ULL;
				tmp.q[0] = 0ULL;
				dr[i] = binop(((i & 1) ? INTRLVHIGH:INTRLVLOW),
					      sr[i / 2],
					      immed128(tmp),
					      t);
			}
		}
		return;
	}

	/* Recursively interleave... */
	if (sb < db) {
		/* HEREHERE - Should this be sizeof(long long)? */
		register int *p = ((int *) malloc(dw * sizeof(int)));
		register int i, j;

		/* HEREHERE - Is this correct for 128-bit regs? */
		if (sb < 8) {
			/* Always unsigned */
			attr |= TYP_UNSIGN;
		}

		i = dw;
		for (j=sb; j<(db/2); j+=j) {
			i = ((i + 1) / 2);
		}

		sse_interleave(sb, sr, (sb * 2), p, i, attr);
		sse_interleave((sb * 2), p, db, dr, dw, attr);
		return;
	}

	error("this interleave conversion not implemented");
} /* sse_interleave() */

static void
max_interleave(int sb,
int *sr,
int db,
int *dr,
int dw,
int attr)
{
	/* Interleave sw words from sr into dr */

	if (sb == (db / 2)) {
		register int i;
		typ t;
		register int sign = 1;

		if (attr & TYP_UNSIGN) sign = 0;

		switch (db) {
		case 1:		t = typ1u; break;
		case 2:		t = typ2u; break;
		case 4:		t = typ4u; break;
		case 8:		t = typ8u; break;
		case 16:	t = typ16u; sign <<= 1; break;
		default:	t = typ32u; sign <<= 1; break;
		}

		for (i=0; i<dw; ++i) {
			if (sign == 2) {
				/* Use Intel recommended
				   trick for signed interleave
				*/
				dr[i] = binop(((i & 1) ? INTRLVHIGH:INTRLVLOW),
					      immed64u((p64_t) 0ULL),
					      sr[i / 2],
					      t);
				dr[i] = binop(SHR, dr[i],
					      immed64u((p64_t)
						((long long) (db / 2))),
					      t);
			} else {
				/* Do ordinary unsigned interleave */
				dr[i] = binop(((i & 1) ? INTRLVHIGH:INTRLVLOW),
					      sr[i / 2],
					      immed64u((p64_t) 0ULL),
					      t);
			}
		}
		return;
	}

	/* Recursively interleave... */
	if (sb < db) {
		register int *p = ((int *) malloc(dw * sizeof(int)));
		register int i, j;

		if (sb < 8) {
			/* Always unsigned */
			attr |= TYP_UNSIGN;
		}

		i = dw;
		for (j=sb; j<(db/2); j+=j) {
			i = ((i + 1) / 2);
		}

		max_interleave(sb, sr, (sb * 2), p, i, attr);
		max_interleave((sb * 2), p, db, dr, dw, attr);
		return;
	}

	error("this interleave conversion not implemented");
} /* max_interleave() */

static void
altivec_interleave(int sb,
int *sr,
int db,
int *dr,
int dw,
int attr)
{
	/* Interleave sw words from sr into dr */

	p128_t tmp;

	if (sb == (db / 2)) {
		register int i;
		typ t;
		register int sign = 1;

		if (attr & TYP_UNSIGN) sign = 0;

		switch (db) {
		case 1:		t = typ1u; break;
		case 2:		t = typ2u; break;
		case 4:		t = typ4u; break;
		case 8:		t = typ8u; break;
		case 16:	t = typ16u; break;
		case 32:	t = typ32u; sign <<= 1; break;
		default:	t = typ64u; sign <<= 1; break;
		}

		for (i=0; i<dw; ++i) {
			if (sign == 2) {
				/* Use Intel recommended
				   trick for signed interleave
				*/
				tmp.q[1] = 0ULL;
				tmp.q[0] = 0ULL;
				dr[i] = binop(((i & 1) ? INTRLVHIGH:INTRLVLOW),
					      immed128(tmp),
					      sr[i / 2],
					      t);

				/* What should the limit be on db?
				   It is currently an int, so I can get away
				   with zeroing the upper qword.
				*/
				tmp.q[1] = 0ULL;
				tmp.q[0] = ((long long) (db / 2)),
				dr[i] = binop(SHR, dr[i],
					      immed128(tmp),
					      t);
			} else {
				/* Do ordinary unsigned interleave */
				tmp.q[1] = 0ULL;
				tmp.q[0] = 0ULL;
				dr[i] = binop(((i & 1) ? INTRLVHIGH:INTRLVLOW),
					      sr[i / 2],
					      immed128(tmp),
					      t);
			}
		}
		return;
	}

	/* Recursively interleave... */
	if (sb < db) {
		/* HEREHERE - Should this be sizeof(long long)? */
		register int *p = ((int *) malloc(dw * sizeof(int)));
		register int i, j;

		/* HEREHERE - Is this correct for 128-bit regs? */
		if (sb < 8) {
			/* Always unsigned */
			attr |= TYP_UNSIGN;
		}

		i = dw;
		for (j=sb; j<(db/2); j+=j) {
			i = ((i + 1) / 2);
		}

		altivec_interleave(sb, sr, (sb * 2), p, i, attr);
		altivec_interleave((sb * 2), p, db, dr, dw, attr);
		return;
	}

	error("this interleave conversion not implemented");
} /* altivec_interleave() */

/*	interleave

	Hand an interleave request to the generator for the CPU selected
	in optcpu.  The targets are tried in a fixed order (SSE, MMX,
	generic IA32, MAX, AltiVec) and the first one that matches does
	the work.  All arguments are passed through unchanged.  If no
	target matches, nothing is generated and dr is left untouched.
*/
void
interleave(int sb,
int *sr,
int db,
int *dr,
int dw,
int attr)
{
	if (optcpu & CPU_SSE) {
		sse_interleave(sb, sr, db, dr, dw, attr);
		return;
	}

	if (optcpu & CPU_MMX) {
		mmx_interleave(sb, sr, db, dr, dw, attr);
		return;
	}

	/* Generic IA32 is matched by equality, not by a feature bit */
	if (optcpu == GenericIA32) {
		ia32_interleave(sb, sr, db, dr, dw, attr);
		return;
	}

	if (optcpu & CPU_MAX) {
		max_interleave(sb, sr, db, dr, dw, attr);
		return;
	}

	if (optcpu & CPU_AltiVec) {
		altivec_interleave(sb, sr, db, dr, dw, attr);
		return;
	}
}

