/*	tuple_binop.c

	binary operations on fragment tuples
*/
#undef NOTDEFD

#include "swartypes.h"
#include "tuple.h"
#include "tokens.h"
#include "oputils.h"
#include "scheduler.h"
#include "tuplegen.h"
#include "tuple_immed.h"
#include "tuple_binop.h"
#include "showir.h"
#include "messages.h"
#include "Libstdswar/stdswar.h"


static int
shiftconst(int arg,
typ t)
{
	/* Normalize a constant shift count.

	   arg is the tuple index of the shift count; t is the field type
	   of the shift and is not consulted here.

	   The tuple is handed back untouched when it is not a NUM, or when
	   the target is AltiVec.  Otherwise a new 64-bit immediate is built
	   from byte 0 of the constant (immed.b[0]) and the index returned
	   by immed64u() is the result.

	   Background carried over from earlier review notes (RJF): MMX
	   treats the whole 64-bit register as one unsigned shift count, so
	   the count is deliberately NOT replicated into every field.
	   Vector (non-NUM) counts are reported as not-yet-implemented by
	   the callers.  Whether negative counts are legal -- and whether
	   the count would then need sign extension -- is still an open
	   question.
	*/

	if ((tup[arg].op == NUM) && !(optcpu & CPU_AltiVec)) {
		long long count = (long long) tup[arg].immed.b[0];

		return(immed64u((p64_t) count));
	}

	return(arg);
}


static int
andn1u(int inv,
int keep)
{
	/* Build the tuple tree for (~inv & keep).
	   Targets with MMX or MAX get a single ANDN tuple (first operand
	   is the one that gets inverted); every other target gets an
	   explicit NOT followed by an AND.
	*/
	if (optcpu & (CPU_MMX | CPU_MAX)) {
		return(binop(ANDN, inv, keep, typnull));
	}
	return(binop(AND, unop(NOT, inv, typnull), keep, typnull));
}

int
binop1u(int op,
int arg0,
int arg1)
{
	/* 1-bit unsigned field binary ops.

	   Every 1-bit operation collapses to plain bitwise logic, so each
	   case rewrites op as untyped (typnull) tuples -- or re-dispatches
	   through binop() with another op/type -- and returns the index of
	   the resulting tuple.  Returns -1 after reporting a bug for ops
	   that make no sense or are not handled here.
	*/

	int lo, hi, t;
	char msg[64];

	switch (op) {
	case AND:
	case LAND:
	case MIN:
	case MUL:
		/* all of these are just AND on single bits */
		return(binop(AND, arg0, arg1, typnull));

	case OR:
	case LOR:
	case MAX:
	case AVG:
		/* all of these are just OR (AVG because it rounds up) */
		return(binop(OR, arg0, arg1, typnull));

	case XOR:
	case NE:
	case ADD:
	case SUB:
		/* all of these are just XOR (modular arithmetic) */
		return(binop(XOR, arg0, arg1, typnull));

	case EQ:
		/* equality is the complement of XOR */
		t = binop(XOR, arg0, arg1, typnull);
		return(unop(NOT, t, typnull));

	case DIV:
		/* x/1 is x; x/0 is left to whatever the exception does */
		warn("division of 1-bit fields always yields numerator");
		return(arg0);

	case MOD:
		/* x%1 is 0; x%0 is forced to 0 as well, with a warning */
		warn("modulus of 1-bit fields always gives 0 result");
		return(immed128((p128_t) {{0ULL, 0ULL}}));

	case GE:
		/* x>=y is y<=x */
		return(binop(LE, arg1, arg0, typ1u));

	case LE:
		/* false only for arg0=1, arg1=0: ~(arg0 & ~arg1) */
		t = andn1u(arg1, arg0);
		return(unop(NOT, t, typnull));

	case GT:
	case SHL:
	case SHR:
		/* arg0 & ~arg1 */
		return(andn1u(arg1, arg0));

	case LT:
	case ANDN:
		/* ~arg0 & arg1 */
		return(andn1u(arg0, arg1));

	case PACK:
		/* 2u -> 1u, no saturation: keep the low bit of every 2-bit
		   field, fold it down next to its neighbour, then let the
		   2u PACK place arg0 in one half of the result and arg1 in
		   the other. */
		lo = binop(AND, arg0, immedu(cvt1x8uto16x8u(0x55)), typnull);
		t = binop(SHR, lo, immed64u((p64_t) 1ULL), typnull);
		lo = binop(OR, lo, t, typnull);

		hi = binop(AND, arg1, immedu(cvt1x8uto16x8u(0x55)), typnull);
		t = binop(SHR, hi, immed64u((p64_t) 1ULL), typnull);
		hi = binop(OR, hi, t, typnull);

		return(binop(PACK, lo, hi, typ2u));

	case INTRLVLOW:
	case INTRLVHIGH:
		/* there is no narrower field to interleave from */
		bug("generating INTRLV[LH] for 1u");
		break;

	default:
		/* an op nobody taught this function about */
		snprintf(msg,
			sizeof msg,
			"binop1u op=%s (this cannot happen?)",
			opname(op));
		bug(msg);
		break;
	}
	return(-1);
}

int
binop1us(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 1-bit unsigned field binary ops with saturation.

	   Only ADD, SUB, SHL, SHR and PACK behave differently once
	   saturation is involved; everything else is passed straight on
	   to the unsaturated 1u handling.  decl_bits is accepted but not
	   used here.  Returns the index of the resulting tuple, or -1
	   after reporting a bug.
	*/
	int sat0, sat1, inv;

	switch (op) {
	case ADD:
		/* 1+1 saturates to 1, so this is an OR */
		return(binop(OR, arg0, arg1, typnull));

	case SHL:
		/* a 1 shifted out of the field saturates back to 1 */
		return(arg0);

	case SUB:
	case SHR:
		/* arg0 & ~arg1 (0-1 saturates to 0) */
		if (!((optcpu & CPU_MMX) || (optcpu & CPU_MAX))) {
			inv = unop(NOT, arg1, typnull);
			return(binop(AND, inv, arg0, typnull));
		}
		return(binop(ANDN, arg1, arg0, typnull));

	case PACK:
		/* 2us -> 1us: OR the high bit of each 2-bit field into its
		   low bit so any nonzero value becomes 1, then hand both
		   operands to the unsaturated 1u PACK, which puts arg0 in
		   one half of the result and arg1 in the other. */
		sat0 = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
		sat0 = binop(OR, arg0, sat0, typnull);

		sat1 = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
		sat1 = binop(OR, arg1, sat1, typnull);

		return(binop(PACK, sat0, sat1, typ1u));

	case AND:
	case ANDN:
	case OR:
	case XOR:
	case LAND:
	case LOR:
	case MIN:
	case MAX:
	case MUL:
	case DIV:
	case MOD:
	case AVG:
	case EQ:
	case NE:
	case LT:
	case LE:
	case GT:
	case GE:
		/* results never leave the field: identical to plain 1u */
		return(binop(op, arg0, arg1, typ1u));

	case INTRLVLOW:
	case INTRLVHIGH:
		/* there is no narrower field to interleave from */
		bug("generating INTRLV[LH] for 1us");
		break;

	default:
		/* an op nobody taught this function about */
		{
			char msg[64];

			snprintf(msg,
				sizeof msg,
				"binop1us op=%s (this cannot happen?)",
				opname(op));
			bug(msg);
		}
		break;
	}
	return(-1);
}

int
binop1(int op,
int arg0,
int arg1)
{
	/* 1-bit signed field binary ops */
	/* Convert the op into a tree of subops if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.

	   A signed 1-bit field holds either 0 (bit clear) or -1 (bit set).
	   Most ops either match the unsigned case bit-for-bit or match it
	   with the operands swapped; the rest reduce to a single AND/OR.
	   Returns the index of the resulting tuple, or -1 after reporting
	   a bug for an op that cannot be generated here.
	*/

	switch (op) {
	case DIV: /* 1s */
	case MOD: /* 1s */
	case MUL: /* 1s */
	case LAND: /* 1s */
	case AND: /* 1s */
	case LOR: /* 1s */
	case OR: /* 1s */
	case ADD: /* 1s */
	case SUB: /* 1s */
	case NE: /* 1s */
	case XOR: /* 1s */
	case SHL: /* 1s */
	case ANDN: /* 1s */
	case EQ: /* 1s */
	case PACK: /* 1s */
		/* These are all the same as unsigned */
		return(binop(op, arg0, arg1, typ1u));

	case SHR: /* 1s */
		/* An arithmetic shift right refills the field with copies of
		   the sign bit, and here the sign bit IS the field, so the
		   value being shifted comes back unchanged whatever the
		   count.  (This used to return arg1, i.e. the shift count,
		   which made 0 >> 1 evaluate to -1.)
		   NOTE(review): assumes a set count bit is not meant as a
		   "negative" shift -- confirm against the language spec. */
		return(arg0);

	case LT: /* 1s */
	case GT: /* 1s */
	case GE: /* 1s */
	case LE: /* 1s */
		/* These are all the reverse of unsigned, because a set bit
		   is -1 and therefore the SMALLER value */
		return(binop(op, arg1, arg0, typ1u));

	case MIN: /* 1s */
		/* This is equivalent to an OR:1u (-1 wins) */
		return(binop(OR, arg0, arg1, typnull));

	case AVG: /* Average rounds up */ /* 1s */
	case MAX: /* 1s */
		/* These are equivalent to an AND:1u (0 wins) */
		return(binop(AND, arg0, arg1, typnull));

	case INTRLVLOW: /* 1s */
	case INTRLVHIGH: /* 1s */
		/* 1/2-bit to 1-bit interleave: meaningless */
		bug("generating INTRLV[LH] for 1");
		break;

	default:
		/* Bug out if we failed to handle some operation */
		{
			char buf[64];
			snprintf(buf,
				sizeof buf,
				"binop1 op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}
	return(-1);
}

int
binop1ss(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 1-bit signed field binary ops with saturation.

	   Each op is either forwarded to one of the other 1-bit handlers
	   (1u, 1, or 1us) through binop(), folded to a constant, or built
	   directly out of untyped logic tuples.  decl_bits is accepted but
	   not used here.  Returns the index of the resulting tuple, or -1
	   after reporting a bug.
	*/

	int clamp0, clamp1, inv;
	char msg[64];

	switch (op) {
	case AND:
	case ANDN:
	case OR:
	case XOR:
	case LAND:
	case LOR:
	case EQ:
	case NE:
		/* neither sign nor saturation matters: plain 1u */
		return(binop(op, arg0, arg1, typ1u));

	case MIN:
	case MAX:
	case AVG:
	case LT:
	case LE:
	case GT:
	case GE:
	case SHR:
		/* result always fits: same as unsaturated signed */
		return(binop(op, arg0, arg1, typ1));

	case ADD:
	case SUB:
	case SHL:
		/* bit patterns come out the same as saturated unsigned */
		return(binop(op, arg0, arg1, typ1us));

	case MUL:
		/* the only nonzero product is -1 * -1 = 1, which
		   saturates to 0 */
		return(immed64u((p64_t) 0LL));

	case DIV:
		/* Yields 0 or an FP exception; the exception is not
		   preempted here */
		warn("saturated division of signed 1-bit fields always yields "
		     "0");
		return(immed64u((p64_t) 0LL));

	case MOD:
		/* arg0 & ~arg1 (-1%0 saturates negatively) */
		if (!((optcpu & CPU_MMX) || (optcpu & CPU_MAX))) {
			inv = unop(NOT, arg1, typnull);
			return(binop(AND, inv, arg0, typnull));
		}
		return(binop(ANDN, arg1, arg0, typnull));

	case PACK:
		/* 2ss -> 1ss: clamp every signed 2-bit field into [-1, 0]
		   (MIN against all-zero fields, then MAX against all-one
		   fields), then let the signed 1-bit PACK put arg0 in one
		   half of the result and arg1 in the other. */
		clamp0 = binop(MIN,
			       arg0,
			       immedu(cvt1x16uto8x16u(0x0000)),
			       typ2);
		clamp0 = binop(MAX,
			       clamp0,
			       immedu(cvt1x16uto8x16u(0xffff)),
			       typ2);

		clamp1 = binop(MIN,
			       arg1,
			       immedu(cvt1x16uto8x16u(0x0000)),
			       typ2);
		clamp1 = binop(MAX,
			       clamp1,
			       immedu(cvt1x16uto8x16u(0xffff)),
			       typ2);

		return(binop(PACK, clamp0, clamp1, typ1));

	case INTRLVLOW:
	case INTRLVHIGH:
		/* there is no narrower field to interleave from */
		bug("generating INTRLV[LH] for 1ss");
		break;

	default:
		/* an op nobody taught this function about */
		snprintf(msg,
			sizeof msg,
			"binop1ss op=%s (this cannot happen?)",
			opname(op));
		bug(msg);
		break;
	}
	return(-1);
}

int
binop2u(int op,
int arg0,
int arg1)
{
	/* 2-bit unsigned field binary ops */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i=-1, j, k, l, m, n;

	switch (op) {
	case ADD: /* 2u */
		/* first, add without carry */
		i = binop(XOR, arg0, arg1, typnull);

		/* then add in carry */
		j = binop(AND, arg0, arg1, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x55)), typnull);
		j = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		return(binop(XOR, i, j, typnull));

	case SUB: /* 2u */
		/* x SUB y is really x ADD (~y ADD 1) */
		i = unop(NOT, arg1, typnull);
		i = binop(ADD, i, immedu(cvt1x8uto16x8u(0x55)), typ2u);
		return(binop(ADD, arg0, i, typ2u));

	case MUL: /* 2u */
		/* shift and add sequence in disguise */

		/* Build the *2 partial product */
		i = binop(AND, arg1, immedu(cvt1x8uto16x8u(0xaa)), typnull);
		j = binop(SHL, arg0, immed64u((p64_t) 1ULL), typnull);
		i = binop(AND, i, j, typnull);
		/* the *2 portion (only top bit per field) */

		/* Build the *1 partial product */
		j = binop(AND, arg1, immedu(cvt1x8uto16x8u(0x55)), typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, j, k, typnull);
		j = binop(AND, j, arg0, typnull); /* the *1 portion */

		/* Add the partial products without carry.  There won't be one
		   from bit 0 to bit 1 anyway, but this keeps fields separated.
		*/
		return(binop(XOR, i, j, typnull));

	case MULH: /* 2u */
		i = binop(AND, arg0, arg1, typnull);
		j = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);
		k = binop(AND, i, j, typnull);
		k = binop(AND, k, immedu(cvt1x8uto16x8u(0xaa)), typnull);

		l = unop(NOT, j, typnull);
		l = binop(AND, i, l, typnull);
		l = binop(AND, l, immedu(cvt1x8uto16x8u(0xaa)), typnull);
		l = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);

		return(binop(OR, k, l, typnull));

	case DIV: /* 2u */
		/*	abcd	00	01	11	10
			00	xx	00	00	00
			01	xx	01	00	00
			11	xx	11	01	01
			10	xx	10	00	01

			low(ab / cd) = (b & ~c) | (a & b) | (a & ~d)
			high(ab / cd) = (a & ~c)
		*/
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
								/* c */
			i = binop(ANDN, i, arg0, typnull);	/* ~(c) & b */
			j = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
								/* a */
			k = binop(AND, j, arg0, typnull);	/* a & b */
			i = binop(OR, i, k, typnull);
			j = binop(ANDN, arg1, j, typnull);	/* ~(d) & a */
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);
			j = binop(ANDN, arg1, arg0, typnull);	/* ~(c) & a */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);
			return(binop(OR, i, j, typnull));
		} else {
			i = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
								/* c */
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg0, typnull);	/* ~(c) & b */
			j = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
								/* a */
			k = binop(AND, j, arg0, typnull);	/* a & b */
			i = binop(OR, i, k, typnull);
			l = unop(NOT, arg1, typnull);
			j = binop(AND, l, j, typnull);	/* ~(d) & a */
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);
			l = unop(NOT, arg1, typnull);
			j = binop(AND, l, arg0, typnull);	/* ~(c) & a */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);
			return(binop(OR, i, j, typnull));
		}

	case MOD: /* 2u */
		/*	abcd	00	01	11	10
			00	xx	00	00	00
			01	xx	00	01	01
			11	xx	00	00	01
			10	xx	00	10	00

			low(ab % cd) = (~a & b & c) | (b & ~d)
			high(ab % cd) = (a & ~b & c & d)
		*/
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(ANDN, arg0, arg1, typnull);
			i = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
								/* ~(a)&c */
			i = binop(AND, i, arg0, typnull);	/* (~a&c)&b */
			j = binop(ANDN, arg1, arg0, typnull);	/* ~(d) & b */
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);

			j = binop(ANDN, arg0, arg1, typnull);
			j = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
								/* ~(b)&d */
			k = binop(AND, arg0, arg1, typnull);	/* a & c */
			j = binop(AND, j, k, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);
			return(binop(OR, i, j, typnull));
		} else {
			i = unop(NOT, arg0, typnull);
			i = binop(AND, i, arg1, typnull);
			i = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
								/* ~(a)&c */
			i = binop(AND, i, arg0, typnull);	/* (~a&c)&b */
			j = unop(NOT, arg1, typnull);
			j = binop(AND, j, arg0, typnull);	/* ~(d) & b */
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);

			j = unop(NOT, arg0, typnull);
			j = binop(AND, j, arg1, typnull);
			j = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
								/* ~(b)&d */
			k = binop(AND, arg0, arg1, typnull);	/* a & c */
			j = binop(AND, j, k, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);
			return(binop(OR, i, j, typnull));
		}

	case AVG: /* 2u */
		/* Average rounds up */
		i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
		i = binop(AND, i, immedu(cvt1x8uto16x8u(0x55)), typnull);
		j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x55)), typnull);

		/* Do the largest available unsigned add of at least 2 bits */
		if (optcpu & CPU_MMX) {
			i = binop(ADD, i, j, typ32u);
		} else if (optcpu & CPU_MAX) {
			i = binop(ADD, i, j, typ16u);
		} else {
			i = binop(ADD, i, j, typ32u);
		}

		j = binop(OR, arg0, arg1, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x55)), typnull);

		/* Do the largest available unsigned add of at least 2 bits */
		if (optcpu & CPU_MMX) {
			i = binop(ADD, i, j, typ32u);
		} else if (optcpu & CPU_MAX) {
			i = binop(ADD, i, j, typ16u);
		} else {
			i = binop(ADD, i, j, typ32u);
		}
		return(i);

	case SHL: /* 2u */
		/* Mask the shift count to lower bits if a NUM */
		arg1 = shiftconst(arg1, typ2u);

		/* only three cases: shift by 0, by 1, or too big */
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:
				return(arg0);
			case 0x1ULL:
				i = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x55)),
					  typnull);
				if (optcpu & CPU_MMX) {
					/* Sneaky (Intel recommended) use of
					   32-bit add... */
					i = binop(ADD, i, i, typ32);
				} else if (optcpu & CPU_MAX) {
					i = binop(ADD, i, i, typ16);
				} else {
					i = binop(SHL,
						  i,
						  immed64u((p64_t) 1ULL),
						  typnull);
				}
				return(i);
			default:
				return(immed64u((p64_t) 0ULL));
			}
		}

		/* shift by a vector is NYI */
		error("shift left of 2-bit field values only implemented for "
		      "a constant shift");
		return(immed64u((p64_t) 0ULL));

	case SHR: /* 2u */
		/* Mask the shift count to lower bits if a NUM */
		arg1 = shiftconst(arg1, typ2u);

		/* only three cases: shift by 0, by 1, or too big */
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:
				return(arg0);
			case 0x1ULL:
				i = binop(SHR, arg0, arg1, typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x55)),
					  typnull);
				return(i);
			default:
				return(immed64u((p64_t) 0ULL));
			}
		}
		/* shift by a vector is NYI */
		error("shift right of 2-bit field values only implemented for "
		      "a constant shift");
		return(immed64u((p64_t) 0ULL));

	case MIN: /* 2u */
		/* nearly AND, but 10,01 and 01,10 are 00 and should be 01 */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(AND, arg0, arg1, typnull);

			j = binop(ANDN, arg0, arg1, typnull);
			j = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			j = binop(AND, j, arg0, typnull);

			k = binop(ANDN, arg1, arg0, typnull);
			k = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, arg1, typnull);

			l = binop(OR, j, k, typnull);
			l = binop(AND,
				  l,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);

			return(binop(OR, i, l, typnull));
		} else {
			i = binop(AND, arg0, arg1, typnull);

			j = unop(NOT, arg0, typnull);
			j = binop(AND, j, arg1, typnull);
			j = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			j = binop(AND, j, arg0, typnull);

			k = unop(NOT, arg1, typnull);
			k = binop(AND, k, arg0, typnull);
			k = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, arg1, typnull);

			l = binop(OR, j, k, typnull);
			l = binop(AND,
				  l,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);

			return(binop(OR, i, l, typnull));
		}

	case MAX: /* 2u */
		/* nearly OR, but 10,01 and 01,10 are 11 and should be 10 */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(OR, arg0, arg1, typnull);

			j = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
			k = binop(ANDN, j, arg0, typnull);
			l = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
			m = binop(ANDN, arg1, l, typnull);
			n = binop(AND, k, m, typnull);

			k = binop(ANDN, arg0, j, typnull);
			m = binop(ANDN, l, arg1, typnull);
			k = binop(AND, k, m, typnull);

			k = binop(OR, k, n, typnull);
			k = binop(AND,
				  k,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);
			return(binop(XOR, i, k, typnull));
		} else {
			i = binop(OR, arg0, arg1, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);

			j = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
			k = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
			l = unop(NOT, j, typnull);
			m = unop(NOT, k, typnull);

			j = binop(AND, j, arg0, typnull);
			k = binop(AND, k, arg1, typnull);
			j = binop(OR, j, k, typnull);

			l = binop(AND, l, arg1, typnull);
			m = binop(AND, m, arg0, typnull);
			l = binop(OR, l, m, typnull);

			j = binop(OR, j, l, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);

			return(binop(OR, i, j, typnull));
		}

	case EQ: /* 2u */
		/* x EQ y is really NOT (x NE y) */
		i = binop(NE, arg0, arg1, typ2u);
		return(unop(NOT, i, typnull));

	case NE: /* 2u */
		/* XOR does the compare, but results need to be 2-bit */
		i = binop(XOR, arg0, arg1, typnull);
		j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
		i = binop(OR, i, j, typnull);
		i = binop(AND, i, immedu(cvt1x8uto16x8u(0x55)), typnull);
		j = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);
		return(binop(OR, i, j, typnull));

	case LT: /* 2u */
		/* x LT y is really x NE (x MAX y) */
		/* x LT y is also y GT x */
		return(binop(GT, arg1, arg0, typ2u));

	case LE: /* 2u */
		/* x LE y is really x EQ (x MIN y) */
		i = binop(MIN, arg0, arg1, typ2u);
		return(binop(EQ, arg0, i, typ2u));

	case GT: /* 2u */
		/* x GT y is really x NE (x MIN y) */
		i = binop(MIN, arg0, arg1, typ2u);
		return(binop(NE, arg0, i, typ2u));

	case GE: /* 2u */
		/* x GE y is really x EQ (x MAX y) */
		/* x GE y is also y LE x */
		return(binop(LE, arg1, arg0, typ2u));

	case LAND: /* 2u */
		/* convert to logical and then AND and replicate */
		i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
		i = binop(OR, i, arg0, typnull);
		j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, j, arg1, typnull);
		i = binop(AND, i, j, typnull);
		i = binop(AND, i, immedu(cvt1x8uto16x8u(0x55)), typnull);
		j = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);
		return(binop(OR, i, j, typnull));

	case LOR: /* 2u */
		/* convert to logical and then OR and replicate */
		i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
		i = binop(OR, i, arg0, typnull);
		j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, j, arg1, typnull);
		i = binop(OR, i, j, typnull);
		i = binop(AND, i, immedu(cvt1x8uto16x8u(0x55)), typnull);
		j = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);
		return(binop(OR, i, j, typnull));

	case AND: /* 2u */
		return(binop(AND, arg0, arg1, typnull));

	case ANDN: /* 2u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			return(binop(ANDN, arg0, arg1, typnull));
		} else {
			i = unop(NOT, arg0, typnull);
			return(binop(AND, i, arg1, typnull));
		}

	case OR: /* 2u */
		return(binop(OR, arg0, arg1, typnull));

	case XOR: /* 2u */
		return(binop(XOR, arg0, arg1, typnull));

	case PACK: /* 2u */
		/* 4u -> 2u */
		/* PACK (without saturation) each 4-bit field value to
		   2 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		/* Pack arg0 */
		i = binop(AND, arg0, immedu(cvt1x8uto16x8u(0x33)), typnull);
		j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
		i = binop(OR, i, j, typnull);	/* low 4 bits correct */

		/* Pack arg1 */
		k = binop(AND, arg1, immedu(cvt1x8uto16x8u(0x33)), typnull);
		l = binop(SHR, k, immed64u((p64_t) 2ULL), typnull);
		k = binop(OR, k, l, typnull);	/* low 4 bits correct */

		/* Combine packed arg0 and arg1 in result */
		return(binop(PACK, i, k, typ4u));

	case INTRLVLOW: /* 2u */
		/* 1-bit to 2-bit interleave of 1-bit fields */
		/* another nasty shift and AND sequence... */
		{
			int bpf = bitsperfrag();

			/* high bit */
			j = arg1;
			switch (bitsperfrag()) {
			case 128:
				/* Interleave to 64 bits */
				i = binop(AND,
					j,
					immed64u((p64_t) 0x00000000ffffffffULL),
					typnull);
				j = binop(AND,
					j,
					immed64u((p64_t) 0xffffffff00000000ULL),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 32ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 64:
				/* Interleave to 32 bits */
				i = binop(AND,
					  j,
					  (bpf==64)?
					      immed64u((p64_t)
							0x000000000000ffffULL):
					      immedu(cvt1x64uto2x64u(
						(p64_t)0x000000000000ffffULL)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==64)?
					      immed64u((p64_t)
							0x00000000ffff0000ULL):
					      immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffff0000ULL)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 16ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 32:
				/* Interleave to 16 bits */
				i = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x000000ffULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x000000ff)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x0000ff00ULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x0000ff00)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 8ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

				/* Interleave to 4 bits */
				i = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x000f)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x00f0)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 4ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

				/* Interleave to 4 bits */
				i = binop(AND,
					  j,
					  immedu(cvt1x8uto16x8u(0x03)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x8uto16x8u(0x0c)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 2ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

				/* Interleave to 2 bits */
				i = binop(AND,
					  j,
					  immedu(cvt1x8uto16x8u(0x11)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x8uto16x8u(0x22)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 1ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
			}
			k = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);


			/* low bit */
			j = arg0;
			switch (bitsperfrag()) {
			case 128:
				i = binop(AND,
					j,
					immed64u((p64_t) 0x00000000ffffffffULL),
					typnull);
				j = binop(AND,
					j,
					immed64u((p64_t) 0xffffffff00000000ULL),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 32ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 64:
				i = binop(AND,
					  j,
					  (bpf==64)?
						immed64u((p64_t)
							0x000000000000ffffULL):
					  immedu(cvt1x64uto2x64u(
						(p64_t)0x000000000000ffffULL)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==64)?
					      immed64u((p64_t)
							0x00000000ffff0000ULL):
					      immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffff0000ULL)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 16ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 32:
				i = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x000000ffULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x000000ff)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x0000ff00ULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x0000ff00)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 8ULL),
					  typnull);
				j = binop(OR, i, j, typnull);


				i = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x000f)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x00f0)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 4ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

				i = binop(AND,
					  j,
					  immedu(cvt1x8uto16x8u(0x03)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x8uto16x8u(0x0c)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 2ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

				i = binop(AND,
					  j,
					  immedu(cvt1x8uto16x8u(0x11)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x8uto16x8u(0x22)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 1ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
			}
		}

		return(binop(OR, i, k, typnull));

	case INTRLVHIGH: /* 2u */
		/* 1-bit to 2-bit interleave of 1-bit fields */
		/* sneaky way to reuse INTRLVLOW code... */
		{
			unsigned long long bpf_2 =
				(unsigned long long) bitsperfrag()/2ULL;
			i = binop(SHR, arg0, immed64u((p64_t) bpf_2), typnull);
			j = binop(SHR, arg1, immed64u((p64_t) bpf_2), typnull);
			return(binop(INTRLVLOW, i, j, typ2u));
		}

	default:
		/* Bug out if we failed to handle some operation */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop2u op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}
	return(-1);
}

int
binop2us(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 2-bit unsigned field binary ops with saturation */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i, j, k, l, m;

	switch (op) {

	case MIN: /* 2us */
	case MAX: /* 2us */
	case AVG: /* 2us */

	case EQ: /* 2us */
	case NE: /* 2us */
	case LT: /* 2us */
	case LE: /* 2us */
	case GT: /* 2us */
	case GE: /* 2us */

	case AND: /* 2us */
	case ANDN: /* 2us */
	case OR: /* 2us */
	case XOR: /* 2us */

	case LAND: /* 2us */
	case LOR: /* 2us */

	case SHR: /* 2us */
		/* These are all the same as unsigned unsaturated */
		return (binop(op, arg0, arg1, typ2u));

	case ADD: /* 2us */
		/* first, add without carry (also calculates propagate) */
		i = binop(XOR, arg0, arg1, typnull);

		/* calculate generate */
		j = binop(AND, arg0, arg1, typnull);

		/* Calulate overflow */
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		k = binop(AND, k, i, typnull);
		k = binop(OR, k, j, typnull);
		k = binop(AND, k, immedu(cvt1x8uto16x8u(0xaa)), typnull);

		/* ...and create a saturation mask */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
		k = binop(OR, k, l, typnull);

		/* Calculate the add as usual */
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x55)), typnull);
		j = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(XOR, i, j, typnull);

		/* Clobber the calculated value with the max on overflow */
		return(binop(OR, k, j, typnull));

	case SUB: /* 2us */
		/* high bit = a~c~d | ab~c */

		/* a~c : b~d */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(ANDN, arg1, arg0, typnull);
		} else {
			i = unop(NOT, arg1, typnull);
			i = binop(AND, i, arg0, typnull);
		}

		/* ab~c : X */
		j = binop(SHL, arg0, immed64u((p64_t) 1ULL), typnull); /* b:X */
		j = binop(AND, i, j, typnull);

		/* a~c~d : X */
		k = binop(SHL, arg1, immed64u((p64_t) 1ULL), typnull); /* d:X */
		k = unop(NOT, k, typnull);			/* ~d : X */
		k = binop(AND, i, k, typnull);

		/* High bit */
		j = binop(OR, j, k, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0xaa)), typnull);


		/* low bit = b~c~d | a~b~cd | abc~d
			   = ~c(b~d) | (a~c)(~bd) | (ac)(b~d)
		*/
		/* i = a~c : b~d */

		/* X : ~c(b~d) */
		k = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull); /* d:X */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			k = binop(ANDN, k, i, typnull);
		} else {
			k = unop(NOT, k, typnull);
			k = binop(AND, k, i, typnull);
		}

		/* X : (a~c)(~bd) */
		l = binop(SHR, i, immed64u((p64_t) 1ULL), typnull); /* X:a~c */
		/* ~ac : ~bd */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			m = binop(ANDN, arg0, arg1, typnull);
		} else {
			m = unop(NOT, arg0, typnull);
			m = binop(AND, m, arg1, typnull);
		}
		l = binop(AND, l, m, typnull);

		/* X : ac(b~d) */
		m = binop(AND, arg0, arg1, typnull);		/* ac : bd */
		m = binop(SHR, m, immed64u((p64_t) 1ULL), typnull); /* X:ac */
		m = binop(AND, i, m, typnull);

		/* Low bit */
		k = binop(OR, k, l, typnull);
		k = binop(OR, k, m, typnull);
		k = binop(AND, k, immedu(cvt1x8uto16x8u(0x55)), typnull);

		/* Combine high and low bits */
		return (binop(OR, j, k, typnull));

	case MUL: /* 2us */
		/* shift and add sequence in disguise */

		/* Build the *2 partial product */
		i = binop(AND, arg1, immedu(cvt1x8uto16x8u(0xaa)), typnull);
		j = binop(SHL, arg0, immed64u((p64_t) 1ULL), typnull);
		i = binop(AND, i, j, typnull);
		/* the *2 portion (only top bit per field) */

		/* Build the *1 partial product */
		j = binop(AND, arg1, immedu(cvt1x8uto16x8u(0x55)), typnull);
		/* SHL by 1 via largest supported unsigned add */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			k = binop(ADD, j, j, typ32u);
		} else {
			k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		}
		j = binop(OR, j, k, typnull);
		j = binop(AND, j, arg0, typnull); /* the *1 portion */

		/* Add the partial products without carry.  There won't be one
		   from bit 0 to bit 1 anyway, but this keeps fields separated.
		*/
		i = binop(XOR, i, j, typnull);

		/* Create saturation mask */
		j = binop(AND, arg0, arg1, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0xaa)), typnull);
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, j, k, typnull);

		/* Combine with saturation mask */
		return(binop(OR, i, j, typnull));

	case DIV: /* 2us */
		/*	abcd	00	01	11	10
			00	11	00	00	00
			01	11	01	00	00
			11	11	11	01	01
			10	11	10	00	01

			Same as DIV2u but saturate divs by 0.

			low(ab / cd) = (b & ~c) | (a & b) | (a & ~d) | ~(c|d)
			high(ab / cd) = (a & ~c) | ~(c|d)
		*/
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
								/* c */
			i = binop(ANDN, i, arg0, typnull);	/* ~(c) & b */
			j = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
								/* a */
			k = binop(AND, j, arg0, typnull);	/* a & b */
			i = binop(OR, i, k, typnull);
			j = binop(ANDN, arg1, j, typnull);	/* ~(d) & a */
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);
			j = binop(ANDN, arg1, arg0, typnull);	/* ~(c) & a */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);
			i = binop(OR, i, j, typnull);
		} else {
			i = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
								/* c */
			i = unop(NOT, i, typnull);		/* ~c */
			i = binop(AND, i, arg0, typnull);	/* ~(c) & b */
			j = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
								/* a */
			k = binop(AND, j, arg0, typnull);	/* a & b */
			i = binop(OR, i, k, typnull);
			l = unop(NOT, arg1, typnull);		/* ~d */
			j = binop(AND, l, j, typnull);		/* ~(d) & a */
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);
			j = unop(NOT, arg1, typnull);		/* ~c */
			j = binop(AND, j, arg0, typnull);	/* ~c & a */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);
			i = binop(OR, i, j, typnull);
		}

		/* Combine with saturation */
		j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull); /* X:c */
		j = binop(OR, j, arg1, typnull);		/* X:c|d */
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x55)), typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull); /* c|d:0 */
		k = binop(OR, j, k, typnull);	/* c|d : c|d */
		j = unop(NOT, k, typnull);	/* ~(c|d) : ~(c|d) */
		return(binop(OR, i, j, typnull));

	case MOD: /* 2us */
		/*	abcd	00	01	11	10
			00	11	00	00	00
			01	11	00	01	01
			11	11	00	00	01
			10	11	00	10	00

			Same as MOD2u but saturate mods by 0.

			low(ab % cd) = (~a & b & c) | (b & ~d) | ~(c|d)
			high(ab % cd) = (a & ~b & c & d) | ~(c|d)
		*/
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(ANDN, arg0, arg1, typnull);
			i = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
								/* ~a&c */
			i = binop(AND, i, arg0, typnull);	/* (~a&c) & b */
			j = binop(ANDN, arg1, arg0, typnull);	/* ~(d) & b */
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);

			j = binop(ANDN, arg0, arg1, typnull);
			j = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
								/* ~(b)&d */
			k = binop(AND, arg0, arg1, typnull);	/* a & c */
			j = binop(AND, j, k, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);
			i = binop(OR, i, j, typnull);
		} else {
			i = unop(NOT, arg0, typnull);
			i = binop(AND, i, arg1, typnull);
			i = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
								/* ~a&c */
			i = binop(AND, i, arg0, typnull);	/* (~a&c) & b */
			j = unop(NOT, arg1, typnull);		/* ~d */
			j = binop(AND, j, arg0, typnull);	/* ~d & b */
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);

			j = unop(NOT, arg0, typnull);
			j = binop(AND, j, arg1, typnull);
			j = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
								/* ~(b)&d */
			k = binop(AND, arg0, arg1, typnull);	/* a & c */
			j = binop(AND, j, k, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);
			i = binop(OR, i, j, typnull);
		}

		/* Combine with saturation */
		j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull); /* X:c */
		j = binop(OR, j, arg1, typnull);		/* X:c|d */
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x55)), typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull); /* c|d:0 */
		k = binop(OR, j, k, typnull);	/* c|d : c|d */
		j = unop(NOT, k, typnull);	/* ~(c|d) : ~(c|d) */
		return(binop(OR, i, j, typnull));

	case SHL: /* 2us */
		/* Mask the shift count to lower bits if a NUM */
		arg1 = shiftconst(arg1, typ2u);

		/* only three cases: shift by 0, by 1, or too big */
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:
				return(arg0);
			case 0x1ULL:
				i = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x55)),
					  typnull);
				if (optcpu & CPU_MMX) {
					i = binop(ADD, i, i, typ32);
				} else if (optcpu & CPU_MAX) {
					i = binop(ADD, i, i, typ16);
				} else {
					i = binop(SHL,
						  i,
						  immed64u((p64_t) 1ULL),
						  typnull);
				}

				/* Unsigned saturate if high bit is 1 */
				j = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0xaa)),
					  typnull);
				k = binop(SHR,
					  j,
					  immed64u((p64_t) 1ULL),
					  typnull);
				j = binop(OR, j, k, typnull);

				return(binop(OR, i, j, typnull));
			default:
				/* If non-zero, saturate to all ones */
				i = binop(SHL,
					  arg0,
					  immed64u((p64_t) 1ULL),
					  typnull);
				i = binop(OR, arg0, i, typnull);
				i = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0xaa)),
					  typnull);
				j = binop(SHR,
					  i,
					  immed64u((p64_t) 1ULL),
					  typnull);
				return(binop(OR, i, j, typnull));
			}
		}

		/* shift by a vector is NYI */
		error("shift left of 2-bit field values only implemented for "
		      "a constant shift");
		return(immed64u((p64_t) 0ULL));

	case PACK: /* 2us */
		/* 4us -> 2us */
		/* PACK (with unsigned saturation) each 4-bit field value to
		   2 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		/* Calculate the saturated values, then pack as unsaturated */
		/* Pack arg0 */
		j = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, arg0, j, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x44)), typnull);
		j = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		i = binop(OR, arg0, j, typnull);
		j = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		i = binop(OR, i, j, typnull);

		/* Pack arg1 */
		l = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
		l = binop(OR, arg1, l, typnull);
		l = binop(AND, l, immedu(cvt1x8uto16x8u(0x44)), typnull);
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
		k = binop(OR, arg1, l, typnull);
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
		k = binop(OR, k, l, typnull);

		/* Combine packed arg0 and arg1 in result */
		return(binop(PACK, i, k, typ2u));

	case INTRLVLOW: /* 2us */
		return(binop(INTRLVLOW, arg0, arg1, typ2u));

	case INTRLVHIGH: /* 2us */
		return(binop(INTRLVHIGH, arg0, arg1, typ2u));

	default:
		/* Bug out if we failed to handle some operation */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop2us op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}
	return(-1);
}

int
binop2(int op,
int arg0,
int arg1)
{
	/* 2-bit signed field binary ops */
	/* Convert the op into a tree of subops if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.

	   op         operation token (ADD, DIV, MIN, ...) selecting the case
	   arg0, arg1 operand tuples, passed straight on to binop()/unop()

	   Returns the value of the final binop() call of the chosen rewrite,
	   or -1 after bug() has been called for an unhandled op.

	   Notation used in the comments below: within each 2-bit field
	   arg0 = a:b and arg1 = c:d (high bit : low bit), read as two's
	   complement (00 = 0, 01 = 1, 11 = -1, 10 = -2).  In the tables the
	   rows are ab and the columns are cd.  "X" marks a bit position
	   whose content is ignored because it is masked off later.  The mask
	   0xaa keeps the high bit of every field and 0x55 keeps the low bit.
	   binop(ANDN, x, y) is used as (~x & y); the branches for targets
	   other than MMX/MAX spell that out as NOT followed by AND.
	*/

	register int i, j, k, l, m, n;

	switch (op) {
	case ADD: /* 2s */
	case SUB: /* 2s */
	case MUL: /* 2s */
	case SHL: /* 2s */
	case SHR: /* 2s */
	case EQ: /* 2s */
	case NE: /* 2s */
	case LAND: /* 2s */
	case LOR: /* 2s */
	case AND: /* 2s */
	case ANDN: /* 2s */
	case OR: /* 2s */
	case XOR: /* 2s */
	case PACK: /* 2s */
	case INTRLVLOW: /* 1-bit to 2-bit interleave of 1-bit fields */ /* 2s */
	case INTRLVHIGH: /* 2s */
		/* These are all the same as unsigned */
		return(binop(op, arg0, arg1, typ2u));

	case DIV: /* 2s */
		/*	abcd	00	01	11	10
			00	xx	00	00	00
			01	xx	01	11	00
			11	xx	11	01	00
			10	xx	10	10	01

			Division by zero (cd = 00) is a don't-care (xx).
			-2 / -1 is not representable; the table leaves it
			at 10 (-2).

			low(ab / cd) = (b & d) | (a & ~b & c & ~d)
			high(ab / cd) = (a &~c &d) | (~a &b &c &d) | (a& ~b &d)
		*/
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(AND, arg0, arg1, typnull);	/* a&c : b&d */

			j = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
								/* X:a */
			k = binop(ANDN, arg0, j, typnull);	/* X : ~b&a */
			j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
								/* X:c */
			l = binop(ANDN, arg1, j, typnull);	/* X : ~d&c */
			m = binop(AND, k, l, typnull);		/* X:a&~b&c&~d*/

			n = binop(OR, i, m, typnull);		/* X : low */
			n = binop(AND,
				  n,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);
			/* 0:low */

			i = binop(ANDN, arg1, arg0, typnull);	/* ~c&a:~d&b */
			j = binop(SHL, arg1, immed64u((p64_t) 1ULL), typnull);
								/* d:X */
			i = binop(AND, i, j, typnull);		/* ~c&a&d : X */

			k = binop(ANDN, arg0, arg1, typnull);	/* ~a&c:~b&d */
			j = binop(AND, arg0, arg1, typnull);	/* a&c : b&d */
			j = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
								/* b&d:X */
			l = binop(AND, k, j, typnull);		/* ~a&b&c&d:X */

			m = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
								/* ~b&d:X */
			m = binop(AND, arg0, m, typnull);	/* a&~b&d : X */
			i = binop(OR, i, l, typnull);
			i = binop(OR, i, m, typnull);		/* high : X */
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);
			/* high:0 */
		} else {
			/* Same computation as above with every ANDN
			   expanded into NOT followed by AND */
			i = binop(AND, arg0, arg1, typnull);	/* a&c : b&d */

			j = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
								/* X:a */
			k = unop(NOT, arg0, typnull);		/* ~a : ~b */
			k = binop(AND, k, j, typnull);		/* X : ~b&a */
			j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
								/* X:c */
			l = unop(NOT, arg1, typnull);		/* ~c : ~d */
			l = binop(AND, l, j, typnull);		/* X : ~d&c */
			m = binop(AND, k, l, typnull);		/* X:a&~b&c&~d*/

			n = binop(OR, i, m, typnull);		/* X : low */
			n = binop(AND,
				  n,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);
			/* 0:low */

			i = unop(NOT, arg1, typnull);		/* ~c:~d */
			i = binop(AND, i, arg0, typnull);	/* ~c&a:~d&b */
			j = binop(SHL, arg1, immed64u((p64_t) 1ULL), typnull);
								/* d:X */
			i = binop(AND, i, j, typnull);		/* ~c&a&d : X */

			k = unop(NOT, arg0, typnull);		/* ~a:~b */
			k = binop(AND, k, arg1, typnull);	/* ~a&c:~b&d */
			j = binop(AND, arg0, arg1, typnull);	/* a&c : b&d */
			j = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
								/* b&d:X */
			l = binop(AND, k, j, typnull);		/* ~a&b&c&d:X */

			m = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
								/* ~b&d:X */
			m = binop(AND, arg0, m, typnull);	/* a&~b&d : X */
			i = binop(OR, i, l, typnull);
			i = binop(OR, i, m, typnull);		/* high : X */
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);
			/* high:0 */
		}

		/* Merge the low-bit plane (n) with the high-bit plane (i) */
		return(binop(OR, n, i, typnull));

	case MOD: /* 2s */
		/* Isn't there a definition for x%0 somewhere?
			Yes! x mod 0 = x by convention.

			p82, Graham, Knuth, and Patashnik,
			"Concrete Mathematics", 2nd ed.
			1994 Addison-Wesley Publishing Company, Inc.

		   NOTE(review): the table below nevertheless treats the
		   cd = 00 column as a don't-care (xx), and the equations
		   derived from it give 0:b rather than a:b for x%0, so the
		   convention quoted above is not what is implemented here.
		*/
		/*	abcd	00	01	11	10
			00	xx	00	00	00
			01	xx	00	00	01
			11	xx	00	00	11
			10	xx	00	00	00

			low(ab % cd) = (b & ~d)
			high(ab % cd) = (a & b & c & ~d)
		*/
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(ANDN, arg1, arg0, typnull);	/* ~(d) & b */
		} else {
			i = unop(NOT, arg1, typnull);		/* ~d */
			i = binop(AND, i, arg0, typnull);	/* ~d & b */
		}
		/* 0 : low */
		j = binop(AND, i, immedu(cvt1x8uto16x8u(0x55)), typnull);

		i = binop(SHL, i, immed64u((p64_t) 1ULL), typnull); /* ~(d)&b */
		k = binop(AND, arg0, arg1, typnull);		  /* a & c */
		i = binop(AND, i, k, typnull);		/* a&b&c&~d */
		/* high : 0 */
		i = binop(AND, i, immedu(cvt1x8uto16x8u(0xaa)), typnull);
		return(binop(OR, i, j, typnull));

	case AVG: /* 2s */
		/*	abcd	00	01	11	10
			00	00	01	00	11
			01	01	01	00	00
			11	00	00	11	11
			10	11	00	11	10

			Per the table, halves round up (toward +infinity):
			e.g. avg(0,1) = 1 and avg(0,-1) = 0.

			low = (~a&b&~c) | (~a&~c&d) | (a&b&c) |
					(a&c&d) | (~a&~b&c&~d) | (a&~b&~c&~d)
			high = (a & c) | (a&~b&~c&~d) | (~a&~b&c&~d)
		*/
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			/* ~a&b&~c = ~(a|c)&b */
			i = binop(OR, arg0, arg1, typnull);	/* a|c : b|d */
			j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
								/* X:a|c */
			k = binop(ANDN, j, arg0, typnull);	/* X:~(a|c)&b */

			/* ~a&~c&d = ~(a|c)&d */
			j = binop(ANDN, j, arg1, typnull);	/* X:~(a|c)&d */
			j = binop(OR, j, k, typnull);	/* 1st two terms */

			/* a&b&c */
			k = binop(AND, arg0, arg1, typnull);	/* a&c : b&d */
			l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* X:a&c */
			m = binop(AND, l, arg0, typnull);	/* X: a&c&b */
			j = binop(OR, j, m, typnull);	/* 1st three terms */

			/* a&c&d */
			l = binop(AND, l, arg1, typnull);	/* X: a&c&d */
			j = binop(OR, j, l, typnull);	/* 1st four terms */

			/* ~a&~b&c&~d = ~(b|d)&(~a&c) */
			l = binop(ANDN, arg0, arg1, typnull);	/* ~a&c:~b&d */
			l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
								/* X:~a&c */
			l = binop(ANDN, i, l, typnull);	/* X:~(b|d)&(~a&c) */
			j = binop(OR, j, l, typnull);	/* 1st five terms */

			/* a&~b&~c&~d = ~(b|d)&(a&~c) */
			m = binop(ANDN, arg1, arg0, typnull);	/* ~c&a:~d&b */
			m = binop(SHR, m, immed64u((p64_t) 1ULL), typnull);
								/* X:~c&a */
			m = binop(ANDN, i, m, typnull);	/* X:~(b|d)&(a&~c) */
			j = binop(OR, j, m, typnull);		/* X : low */

			/* 0 : low */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);

			/* high = (a & c) | (a&~b&~c&~d) | (~a&~b&c&~d)
				= k | m<<1 | l<<1
				= k | (m|l)<<1
			   (k, l and m are still live from the low-bit
			   computation above)
			*/
			i = binop(OR, m, l, typnull);
			i = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);
			i = binop(OR, k, i, typnull);

			/* high : 0 */
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);

			return(binop(OR, j, i, typnull));
		} else {
			/* Same computation as above with every ANDN
			   expanded into NOT followed by AND */
			/* ~a&b&~c = ~(a|c)&b */
			i = binop(OR, arg0, arg1, typnull);	/* a|c : b|d */
			j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
								/* X:a|c */
			k = unop(NOT, j, typnull);		/* X:~(a|c) */
			k = binop(AND, k, arg0, typnull);	/* X:~(a|c)&b */

			/* ~a&~c&d = ~(a|c)&d */
			j = unop(NOT, j, typnull);		/* X:~(a|c) */
			j = binop(AND, j, arg1, typnull);	/* X:~(a|c)&d */
			j = binop(OR, j, k, typnull);	/* 1st two terms */

			/* a&b&c */
			k = binop(AND, arg0, arg1, typnull);	/* a&c : b&d */
			l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* X:a&c */
			m = binop(AND, l, arg0, typnull);	/* X: a&c&b */
			j = binop(OR, j, m, typnull);	/* 1st three terms */

			/* a&c&d */
			l = binop(AND, l, arg1, typnull);	/* X: a&c&d */
			j = binop(OR, j, l, typnull);	/* 1st four terms */

			/* ~a&~b&c&~d = ~(b|d)&(~a&c) */
			l = unop(NOT, arg0, typnull);		/* ~a:~b */
			l = binop(AND, l, arg1, typnull);	/* ~a&c:~b&d */
			l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
								/* X:~a&c */
			n = unop(NOT, i, typnull);	/* X:~(b|d) */
			l = binop(AND, n, l, typnull);	/* X:~(b|d)&(~a&c) */
			j = binop(OR, j, l, typnull);	/* 1st five terms */

			/* a&~b&~c&~d = ~(b|d)&(a&~c); n is reused here */
			m = unop(NOT, arg1, typnull);		/* ~c:~d */
			m = binop(AND, m, arg0, typnull);	/* ~c&a:~d&b */
			m = binop(SHR, m, immed64u((p64_t) 1ULL), typnull);
								/* X:~c&a */
			m = binop(AND, n, m, typnull);	/* X:~(b|d)&(a&~c) */
			j = binop(OR, j, m, typnull);		/* X : low */

			/* 0 : low */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);

			/* high = (a & c) | (a&~b&~c&~d) | (~a&~b&c&~d)
				= k | m<<1 | l<<1
				= k | (m|l)<<1
			   (k, l and m are still live from the low-bit
			   computation above)
			*/
			i = binop(OR, m, l, typnull);
			i = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);
			i = binop(OR, k, i, typnull);

			/* high : 0 */
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);

			return(binop(OR, j, i, typnull));
		}

	case MIN: /* 2s */
		/*	abcd	00	01	11	10
			00	00	00	11	10
			01	00	01	11	10
			11	11	11	11	10
			10	10	10	10	10

			low(ab MIN cd) = (b&d) | a&b&~c | ~a&c&d
			high(ab MIN cd) = a | c
		*/
		/* This case always uses NOT followed by AND; it has no
		   separate ANDN variant for MMX/MAX. */
		i = binop(AND, arg0, arg1, typnull);	/* a&c:b&d */

		j = unop(NOT, arg1, typnull);		/* ~c:~d */
		j = binop(AND, j, arg0, typnull);	/* a&~c:b&~d */
		j = binop(SHR, j, immed64u((p64_t) 1ULL), typnull); /* X:a&~c */
		j = binop(AND, j, arg0, typnull);	/* X:a&b&~c */

		k = unop(NOT, arg0, typnull);		/* ~a:~b */
		k = binop(AND, k, arg1, typnull);	/* ~a&c:~b&d */
		k = binop(SHR, k, immed64u((p64_t) 1ULL), typnull); /* X:~a&c */
		k = binop(AND, k, arg1, typnull);	/* X:~a&c&d */

		i = binop(OR, i, j, typnull);
		i = binop(OR, i, k, typnull);		/* X : low */
		/* 0 : low */
		i = binop(AND, i, immedu(cvt1x8uto16x8u(0x55)), typnull);

		j = binop(OR, arg0, arg1, typnull);	/* a|c : b|d */
		/* high : 0 */
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0xaa)), typnull);

		return(binop(OR, i, j, typnull));


	case MAX: /* 2s */
		/*	abcd	00	01	11	10
			00	00	01	00	00
			01	01	01	01	01
			11	00	01	11	11
			10	00	01	11	10

			low(ab MAX cd) = (~c&d) | (~a&b) | (a&d) | (b&c)
			high(ab MAX cd) = a & c
		*/
		/* This case always uses NOT followed by AND; it has no
		   separate ANDN variant for MMX/MAX. */
		i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull); /* X:a */
		j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull); /* X:c */

		k = unop(NOT, i, typnull);		/* X:~a */
		k = binop(AND, k, arg0, typnull);	/* X:~a&b */
		l = unop(NOT, j, typnull);		/* X:~c */
		l = binop(AND, l, arg1, typnull);	/* X:~c&d */
		k = binop(OR, k, l, typnull);

		l = binop(AND, i, arg1, typnull);	/* X:a&d */
		m = binop(AND, j, arg0, typnull);	/* X:c&b */
		l = binop(OR, l, m, typnull);

		k = binop(OR, k, l, typnull);	/* X : low */
		/* 0 : low */
		k = binop(AND, k, immedu(cvt1x8uto16x8u(0x55)), typnull);

		i = binop(AND, arg0, arg1, typnull);	/* a&c : b&d */
		/* high : 0 */
		i = binop(AND, i, immedu(cvt1x8uto16x8u(0xaa)), typnull);

		return(binop(OR, i, k, typnull));

	case LT: /* 2s */
		/* x LT y could be built as x NE (x MAX y), but the form
		   actually emitted is the operand swap: x LT y is y GT x,
		   which re-enters the GT case below. */
		return(binop(GT, arg1, arg0, typ2));

	case LE: /* 2s */
		/* x LE y is really x EQ (x MIN y) */
		i = binop(MIN, arg0, arg1, typ2);
		return(binop(EQ, arg0, i, typ2));

	case GT: /* 2s */
		/* x GT y is really x NE (x MIN y) */
		i = binop(MIN, arg0, arg1, typ2);
		return(binop(NE, arg0, i, typ2));

	case GE: /* 2s */
		/* x GE y could be built as x EQ (x MAX y), but the form
		   actually emitted is the operand swap: x GE y is y LE x,
		   which re-enters the LE case above. */
		return(binop(LE, arg1, arg0, typ2));

	default:
		/* Bug out if we failed to handle some operation */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop2 op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}
	return(-1);
}

int
binop2ss(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 2-bit signed field binary ops with saturation */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i, j, k, l, m;

	switch (op) {
	case EQ: /* 2ss */
	case NE: /* 2ss */

	case LAND: /* 2ss */
	case LOR: /* 2ss */

	case AND: /* 2ss */
	case ANDN: /* 2ss */
	case OR: /* 2ss */
	case XOR: /* 2ss */

		/* These are all the same as unsigned unsaturated */
		return(binop(op, arg0, arg1, typ2u));

	case MIN: /* 2ss */
	case MAX: /* 2ss */
	case AVG: /* 2ss */

	case LT: /* 2ss */
	case LE: /* 2ss */
	case GT: /* 2ss */
	case GE: /* 2ss */

	case SHR: /* 2ss */
		/* These are all the same as signed unsaturated */
		return (binop(op, arg0, arg1, typ2));

	case ADD: /* 2ss */
#define LIKE32
#ifdef CURRENT
		/* This method takes 26 instructions with ANDN */
		/* This method takes 28+ instructions without ANDN */

		/* THIS METHOD HAS NOT BEEN MADE non-MMX!! */

		/* high bit = a~d | ac | ~bc | a~b | c~d */
		i = binop(SHL, arg1, immed64u((p64_t) 1ULL), typnull);
								/* d : X */
		if (optcpu & CPU_MMX) {
			j = binop(ANDN, i, arg0, typnull);	/* a~d : X */
			i = binop(ANDN, i, arg1, typnull);	/* c~d : X */
		} else {
			i = unop(NOT, i, typnull);		/* ~d : X */
			j = binop(AND, i, arg0, typnull);	/* a~d : X */
			i = binop(AND, i, arg1, typnull);	/* c~d : X */
		}
		i = binop(OR, i, j, typnull);		/* terms 1 and 5 */

		j = binop(AND, arg0, arg1, typnull);	/* ac : bd */
		i = binop(OR, i, j, typnull);		/* terms 1,2 and 5 */

		j = binop(SHL, arg0, immed64u((p64_t) 1ULL), typnull);
							/* b : X */
		if (optcpu & CPU_MMX) {
			k = binop(ANDN, j, arg1, typnull);	/* ~bc : X */
			j = binop(ANDN, j, arg0, typnull);	/* a~b : X */
		} else {
			j = unop(NOT, j, typnull);		/* ~b  : X */
			k = binop(AND, j, arg1, typnull);	/* ~bc : X */
			j = binop(AND, j, arg0, typnull);	/* a~b : X */
		}
		i = binop(OR, i, k, typnull);
		i = binop(OR, i, j, typnull);

		/* High bit */
		i = binop(AND, i, immedu(cvt1x8uto16x8u(0xaa)), typnull);

		/* low bit  = ~b~cd | b~c~d | ~ab~d | ~ab~c
			    = ~c(~bd) | ~c(b~d) | ~a(b~d) | ~ab~c
		*/
		j = binop(ANDN, arg0, arg1, typnull);		/* ~ac : ~bd */
		k = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
								/* X : c */
		j = binop(ANDN, k, j, typnull);			/* X : ~c~bd */

		l = binop(ANDN, arg1, arg0, typnull);		/* a~c : b~d */
		m = binop(ANDN, k, l, typnull);			/* X : ~cb~d */
		j = binop(OR, j, m, typnull);		/* terms 1 and 2 */

		m = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
								/* X : a */
		n = binop(ANDN, m, l, typnull);			/* X : ~ab~d */
		j = binop(OR, j, n, typnull);		/* terms 1,2,3 */

		k = binop(ANDN, k, arg0, typnull);		/* X : ~cb */
		m = binop(ANDN, m, k, typnull);			/* X : ~ab~c */
		j = binop(OR, j, m, typnull);		/* terms 1-4 */

		/* Low bit */
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x55)), typnull);

		return (binop(OR, i, j, typnull));
#endif
#ifdef LIKE32
		/* This method takes 18+ADD2 (23) instructions */
		/* Do the signed add */
		i = binop(ADD, arg0, arg1, typ2);	/* 5 instructions if
							   ADD32 is one */
		/* Correct for positive saturation */
		j = binop(OR, arg0, arg1, typnull);	/* fx if both args + */
		j = unop(NOT, j, typnull);		/* tx if both args + */
		j = binop(AND, j, i, typnull);	/* tx if MSb(sum)=1 & both + */
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0xaa)), typnull);
							/* t0 if sats */

		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
							/* 0t if sats */
		l = binop(OR, k, j, typnull);		/* tt if sats */

		/* Clobber the calculated value with PosSat if arg1==0 */
		i = binop(OR, i, l, typnull);			/* tt | ab */
		j = unop(NOT, j, typnull);			/* f1 */
		i = binop(AND, i, j, typnull);			/* 0t | a&f b */

		/* Correct for negative saturation */
		j = binop(AND, arg0, arg1, typnull);	/* tX if both neg */
		k = unop(NOT, i, typnull);		/* fX if both neg */
		j = binop(AND, j, k, typnull);		/* 0X always */
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0xaa)), typnull);
							/* 00 always */

		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
							/* 00 always */
		l = binop(OR, k, j, typnull);		/* 00 always */

		/* Clobber the calculated value with NegSat if arg1==0 */
		l = unop(NOT, l, typnull);		/* ff */
		i = binop(AND, i, l, typnull);		/* ab */
		i = binop(OR, i, j, typnull);		/* ab */

		return(i);
#endif

	case SUB: /* 2ss */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			/* K-map method */
			/* This method takes 18 instructions,
			   but expects ANDN */

			/* High bit = a~c | a(~bd) | ~c(~bd) */
			i = binop(ANDN, arg1, arg0, typnull);	/* a~c : b~d */

			j = binop(ANDN, arg0, arg1, typnull);	/* ~ac : ~bd */
			k = binop(SHL,
				  j,
				  immed64u((p64_t) 1ULL),
				  typnull); /* ~bd:X */
			l = binop(AND, arg0, k, typnull);	/* a~bd : X */

			m = binop(ANDN, arg1, k, typnull);	/* ~c~bd : X */
			k = binop(OR, i, l, typnull);
			k = binop(OR, k, m, typnull);

			/* High bit */
			k = binop(AND,
				  k,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);

			/* Low bit  = ~ac | b~d | c(~bd) | ~a(~bd) */
			l = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
								/* X:~ac */
			l = binop(OR, l, i, typnull);		/* terms 1,2 */

			m = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
								/* X:c */
			m = binop(AND, m, j, typnull);		/* X:c~bd */
			l = binop(OR, l, m, typnull);		/* terms 1-3 */

			m = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
								/* X:a */
			m = binop(ANDN, m, j, typnull);		/* X : ~a~bd */
			l = binop(OR, l, m, typnull);		/* terms 1-4 */

			/* Low bit */
			l = binop(AND,
				  l,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);

			return (binop(OR, k, l, typnull));
		} else {
			/* This method takes 18+SUB2(3) = 21 instructions */

			/* Do the signed sub */
			i = binop(SUB, arg0, arg1, typ2);

			/* Correct for positive saturation */
			m = binop(XOR, arg0, arg1, typnull);
							/* tX..X if mixed */
			j = binop(AND, m, arg1, typnull);
						/* tX..X if arg0+ & arg1- */
			j = binop(AND, j, i, typnull);	/* MSb(diff) = 1 */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);

			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);

			/* Clobber the calculated value with PosSat
			   if arg1==0 */
			i = binop(OR, i, l, typnull);		/* T...T */
			j = unop(NOT, j, typnull);		/* f1..1 */
			i = binop(AND, i, j, typnull);


			/* Correct for negative saturation */
			j = binop(AND, m, arg0, typnull);
						/* tX..X if arg0- & arg1+ */
			k = unop(NOT, i, typnull);
			j = binop(AND, j, k, typnull);	/* MSb(diff) = 0 */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);

			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);

			/* Clobber the calculated value with NegSat
			   if arg1==0 */
			l = unop(NOT, l, typnull);		/* F...F */
			i = binop(AND, i, l, typnull);
			i = binop(OR, i, j, typnull);

			return(i);
		}

	case MUL: /* 2ss */
		/* High bit = a~cd | ~abc */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(ANDN, arg1, arg0, typnull);	/* a~c : b~d */
		} else {
			i = unop(NOT, arg1, typnull);
			i = binop(AND, i, arg0, typnull);	/* a~c : b~d */
		}
		j = binop(SHL, arg1, immed64u((p64_t) 1ULL), typnull);
								/* d : X */
		i = binop(AND, i, j, typnull);			/* a~cd : X */

		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			j = binop(ANDN, arg0, arg1, typnull);	/* ~ac : ~bd */
		} else {
			j = unop(NOT, arg0, typnull);
			j = binop(AND, j, arg1, typnull);	/* ~ac : ~bd */
		}
		k = binop(SHL, arg0, immed64u((p64_t) 1ULL), typnull);
								/* b : X */
		j = binop(AND, j, k, typnull);			/* ~abc : X */

		/* High bit */
		i = binop(OR, i, j, typnull);
		i = binop(AND, i, immedu(cvt1x8uto16x8u(0xaa)), typnull);

		/* Low bit  = bd | ac */
		j = binop(AND, arg0, arg1, typnull);		/* ac : bd */
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
								/* X : ac */
		j = binop(OR, j, k, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x55)), typnull);

		return (binop(OR, i, j, typnull));

	case DIV: /* 2ss */
		/* High bit = a~c | ~abcd */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(ANDN, arg1, arg0, typnull);	/* a~c : b~d */

			j = binop(ANDN, arg0, arg1, typnull);	/* ~ac : ~bd */
			k = binop(AND, arg0, arg1, typnull);	/* ac : bd */
			l = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
								/* bd:X */
			l = binop(AND, j, l, typnull);		/* ~abcd : X */

			/* High bit */
			i = binop(OR, i, l, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);

			/* Low bit  = ~c~d | bd | ~bac
				    = ~(c|d) | bd | ~bac */
			j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
								/* X:c */
			j = binop(OR, arg1, j, typnull);	/* X : c|d */
			j = unop(NOT, j, typnull);		/* X : ~(c|d) */

			j = binop(OR, j, k, typnull);		/* terms 1-2 */

			k = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* X:ac */
			k = binop(ANDN, arg0, k, typnull);	/* X : ~bac */
			j = binop(OR, j, k, typnull);		/* terms 1-3 */

			/* Low bit */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);

			return (binop(OR, i, j, typnull));
		} else {
			i = unop(NOT, arg1, typnull);		/* ~c : ~d */
			i = binop(AND, i, arg0, typnull);	/* a~c : b~d */

			j = unop(NOT, arg0, typnull);		/* ~a : ~b */
			j = binop(AND, j, arg1, typnull);	/* ~ac : ~bd */
			k = binop(AND, arg0, arg1, typnull);	/* ac : bd */
			l = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
								/* bd:X */
			l = binop(AND, j, l, typnull);		/* ~abcd : X */

			/* High bit */
			i = binop(OR, i, l, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xaa)),
				  typnull);

			/* Low bit  = ~c~d | bd | ~bac
				    = ~(c|d) | bd | ~bac */
			j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
								/* X:c */
			j = binop(OR, arg1, j, typnull);	/* X : c|d */
			j = unop(NOT, j, typnull);		/* X : ~(c|d) */

			j = binop(OR, j, k, typnull);		/* terms 1-2 */

			k = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* X:ac */
			l = unop(NOT, arg0, typnull);		/* X : ~b */
			k = binop(AND, l, k, typnull);		/* X : ~bac */
			j = binop(OR, j, k, typnull);		/* terms 1-3 */

			/* Low bit */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x55)),
				  typnull);

			return (binop(OR, i, j, typnull));
		}

	case MOD: /* 2ss */
		/* Same as unsaturated except that mod by 0 saturates */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(ANDN, arg1, arg0, typnull);	/* ~(d) & b */
		} else {
			i = unop(NOT, arg1, typnull);		/* ~d */
			i = binop(AND, i, arg0, typnull);	/* ~d & b */
		}
		j = binop(AND, i, immedu(cvt1x8uto16x8u(0x55)), typnull);

		i = binop(SHL, i, immed64u((p64_t) 1ULL), typnull); /* ~(d)&b */
		k = binop(AND, arg0, arg1, typnull);		  /* a & c */
		i = binop(AND, i, k, typnull);		/* a&b&c&~d */
		i = binop(AND, i, immedu(cvt1x8uto16x8u(0xaa)), typnull);
		i = binop(OR, i, j, typnull);

		j = binop(SHL, arg1, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, arg1, j, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0xaa)), typnull);
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, j, k, typnull);
		return(binop(AND, i, j, typnull));

	case SHL: /* 2ss */
		/* Mask the shift count to lower bits if a NUM */
		arg1 = shiftconst(arg1, typ2u);

		/* only three cases: shift by 0, by 1, or too big */
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:
				return(arg0);
			case 0x1ULL:
				i = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x55)),
					  typnull);
				if (optcpu & CPU_MMX) {
					i = binop(ADD, i, i, typ32);
				} else if (optcpu & CPU_MAX) {
					i = binop(ADD, i, i, typ16);
				} else {
					i = binop(SHL,
						  i,
						  immed64u((p64_t) 1ULL),
						  typnull);
				}

				/* Signed saturate if ? */
				j = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0xaa)),
					  typnull);
				k = binop(SHR,
					  j,
					  immed64u((p64_t) 1ULL),
					  typnull);
				j = binop(OR, j, k, typnull);

				return(binop(OR, i, j, typnull));
			default:
				/* If non-zero, saturate to all ones */

				i = binop(SHL,
					  arg0,
					  immed64u((p64_t) 1ULL),
					  typnull);
				i = binop(OR, arg0, i, typnull);
				i = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x55)),
					  typnull);
				j = binop(SHR,
					  i,
					  immed64u((p64_t) 1ULL),
					  typnull);
				return(binop(OR, i, j, typnull));
			}
		}

		/* shift by a vector is NYI */
		error("shift left of 2-bit field values only implemented for "
		      "a constant shift");
		return(immed64u((p64_t) 0ULL));

	case INTRLVLOW: /* 2ss */
		/* 1-bit to 2-bit interleave of 1-bit fields */ /* 2ss */
	case INTRLVHIGH: /* 2ss */
		/* The same as unsigned */
		return(binop(op, arg0, arg1, typ2u));

	case PACK: /* 2ss */
		/* 4ss -> 2ss */
		/* PACK (with saturation) each 4-bit field value to
		   2 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		/* Saturate arg0 */
		i = binop(MIN, arg0, immedu(cvt1x16uto8x16u(0x1111)), typ4);
		i = binop(MAX, i, immedu(cvt1x16uto8x16u(0xeeee)), typ4);

		/* Saturate arg1 */
		j = binop(MIN, arg1, immedu(cvt1x16uto8x16u(0x1111)), typ4);
		j = binop(MAX, j, immedu(cvt1x16uto8x16u(0xeeee)), typ4);

		/* Pack as signed */
		return(binop(PACK, i, j, typ2));

	default:
		/* Bug out if we failed to handle some operation */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop2ss op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}
	return(-1);
}


int
binop4u(int op,
int arg0,
int arg1)
{
	/* 4-bit unsigned field binary ops */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i=-1, j, k, l;

	switch (op) {
	case ADD: /* 4u */
		/* Use implicit spacer technique */
		/* This method takes 6 instructions assuming that the ISA
		   includes a larger parallel unsigned add */

		i = binop(AND, arg0, immedu(cvt1x8uto16x8u(0x77)), typnull);
		j = binop(AND, arg1, immedu(cvt1x8uto16x8u(0x77)), typnull);

		/* Use largest supported parallel unsigned add */
		if (optcpu & CPU_MAX) {
			i = binop(ADD, i, j, typ16u);
		} else {
			i = binop(ADD, i, j, typ32u);
		}

		j = binop(XOR, arg0, arg1, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x88)), typnull);
		return(binop(XOR, i, j, typnull));

	case SUB: /* 4u */
		/* Use implicit spacer technique */
		/* This method takes 7 instructions assuming that the ISA
		   includes a larger parallel unsigned subtract */

		i = binop(OR, arg0, immedu(cvt1x8uto16x8u(0x88)), typnull);
		j = binop(AND, arg1, immedu(cvt1x8uto16x8u(0x77)), typnull);

		/* Use largest supported parallel unsigned sub */
		if (optcpu & CPU_MMX) {
			i = binop(SUB, i, j, typ32u);
		} else if (optcpu & CPU_MMX) {
			i = binop(SUB, i, j, typ16u);
		} else {
			i = binop(SUB, i, j, typ32u);
		}

		j = binop(XOR, arg0, arg1, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x88)), typnull);
		j = binop(XOR, j, immedu(cvt1x8uto16x8u(0x88)), typnull);
		return(binop(XOR, i, j, typnull));

	case MUL: /* 4u */
		if (optcpu & CPU_MMX) {
			/* use 8-bit multiplies */
			i = binop(MUL, arg0, arg1, typ8u);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);

			j = binop(SHR, arg0, immed64u((p64_t) 4ULL), typ8u);
			k = binop(SHR, arg1, immed64u((p64_t) 4ULL), typ8u);
			j = binop(MUL, j, k, typ8u);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);

			j = binop(SHL, j, immed64u((p64_t) 4ULL), typ8u);
			return(binop(OR, i, j, typnull));
		} else if (optcpu & CPU_AltiVec) {
			/* AltiVec has even and odd 8u->16u multiplies */
			int m, n, o, p;

			/* Do evens */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			k = binop(MULEVEN, i, j, typ8u);
			k = binop(AND,
				  k,
				  immedu(cvt1x16uto8x16u(0x000f)),
				  typnull);
			l = binop(MULODD, i, j, typ8u);
			l = binop(AND,
				  l,
				  immedu(cvt1x16uto8x16u(0x000f)),
				  typnull);
			l = binop(SHL, l, immed64u((p64_t) 8ULL), typnull);
			k = binop(OR, k, l, typnull);

			/* Do odds */
			m = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			m = binop(SHR, m, immed64u((p64_t) 4ULL), typnull);
			n = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			n = binop(SHR, n, immed64u((p64_t) 4ULL), typnull);
			o = binop(MULEVEN, m, n, typ8u);
			o = binop(AND,
				  o,
				  immedu(cvt1x16uto8x16u(0x000f)),
				  typnull);
			p = binop(MULODD, m, n, typ8u);
			p = binop(AND,
				  p,
				  immedu(cvt1x16uto8x16u(0x000f)),
				  typnull);
			p = binop(SHL, p, immed64u((p64_t) 8ULL), typnull);
			o = binop(OR, o, p, typnull);
			o = binop(SHL, o, immed64u((p64_t) 4ULL), typnull);

			return(binop(OR, k, o, typnull));
		} else {

#ifdef NOTDEFD
			/* Use 2-bit unsigned MULs */
			/* Clearing the odd fields of arg0 will save us one
			   masking operation. */
			i = binop(AND,
				arg0,
				immedu(cvt1x8uto16x8u(0x33)),
				typnull);
			j = binop(MUL, i, arg1, typ2u);
			k = binop(MULH, i, arg1, typ2u);
			k = binop(SHL, k, immed64u((p64_t) 2ULL), typnull);

			i = binop(SHL, i, immed64u((p64_t) 2ULL), typnull);
			i = binop(MUL, i, arg1, typ2u);
			k = binop(ADD, k, i, typ2u);

			/* Clearing the odd fields of arg1 will save us one
			   masking operation. */
			i = binop(AND,
				arg1,
				immedu(cvt1x8uto16x8u(0x33)),
				typnull);
			i = binop(SHL, i, immed64u((p64_t) 2ULL), typnull);
			i = binop(MUL, arg0, i, typ2u);
			k = binop(ADD, k, i, typ2u);

			return(binop(OR, k, j, typnull));
#else
			unsigned long long step;

			/* Perform a shift-add sequence */
			i = immed64u((p64_t) 0ULL);
			for (step=0ULL; step<4ULL; ++step)
			{
				j = binop(AND,
					  arg1,
					  immedu(cvt1x8uto16x8u(0x11<<step)),
					  typnull);
				k = binop(NE, j, immed64u((p64_t) 0ULL), typ4u);
				j = binop(AND, arg0, k, typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) step),
					  typ4u);
				i = binop(ADD, i, j, typ4u);
			}
			return(i);
#endif
		}

	case DIV: /* 4u */
		/* use 8-bit divides */
		i = binop(AND, arg0, immedu(cvt1x8uto16x8u(0x0f)), typnull);
		j = binop(AND, arg1, immedu(cvt1x8uto16x8u(0x0f)), typnull);
		i = binop(DIV, i, j, typ8u);
		if (optcpu & CPU_AltiVec) {
			j = binop(SHR,
				  arg0,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8u);
			k = binop(SHR,
				  arg1,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8u);
		} else {
			j = binop(SHR, arg0, immed64u((p64_t) 4ULL), typ8u);
			k = binop(SHR, arg1, immed64u((p64_t) 4ULL), typ8u);
		}
		j = binop(DIV, j, k, typ8u);
		if (optcpu & CPU_AltiVec) {
			j = binop(SHL,
				  j,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8u);
		} else {
			j = binop(SHL, j, immed64u((p64_t) 4ULL), typ8u);
		}
		return(binop(OR, i, j, typnull));

	case MOD: /* 4u */
		/* use 8-bit modulus */
		i = binop(AND, arg0, immedu(cvt1x8uto16x8u(0x0f)), typnull);
		j = binop(AND, arg1, immedu(cvt1x8uto16x8u(0x0f)), typnull);
		i = binop(MOD, i, j, typ8u);
		if (optcpu & CPU_AltiVec) {
			j = binop(SHR,
				  arg0,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8u);
			k = binop(SHR,
				  arg1,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8u);
		} else {
			j = binop(SHR, arg0, immed64u((p64_t) 4ULL), typ8u);
			k = binop(SHR, arg1, immed64u((p64_t) 4ULL), typ8u);
		}
		j = binop(MOD, j, k, typ8u);
		if (optcpu & CPU_AltiVec) {
			j = binop(SHL,
				  j,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8u);
		} else {
			j = binop(SHL, j, immed64u((p64_t) 4ULL), typ8u);
		}
		return(binop(OR, i, j, typnull));

	case AVG: /* 4u */
		/* Average rounds up */
		i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
		i = binop(AND, i, immedu(cvt1x8uto16x8u(0x77)), typnull);
		j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x77)), typnull);

		/* Use a supported unsigned add */
		if (optcpu & CPU_MMX) {
			i = binop(ADD, i, j, typ32u);
		} else if (optcpu & CPU_MAX) {
			i = binop(ADD, i, j, typ16u);
		} else {
			i = binop(ADD, i, j, typ32u);
		}
		j = binop(OR, arg0, arg1, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x11)), typnull);

		/* Use a supported unsigned add */
		if (optcpu & CPU_MMX) {
			i = binop(ADD, i, j, typ32u);
		} else if (optcpu & CPU_MAX) {
			i = binop(ADD, i, j, typ16u);
		} else {
			i = binop(ADD, i, j, typ32u);
		}
		return(i);

	case MIN: /* 4u */
		/* use GT */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(GT, arg0, arg1, typ4u);
			j = binop(AND, i, arg1, typnull);
			i = binop(ANDN, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			i = binop(GT, arg0, arg1, typ4u);
			j = binop(AND, i, arg1, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		}

	case MAX: /* 4u */
		/* use GT */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(GT, arg0, arg1, typ4u);
			j = binop(AND, i, arg0, typnull);
			i = binop(ANDN, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			i = binop(GT, arg0, arg1, typ4u);
			j = binop(AND, i, arg0, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		}

	case AND: /* 4u */
		return(binop(AND, arg0, arg1, typnull));

	case ANDN: /* 4u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			return(binop(ANDN, arg0, arg1, typnull));
		} else {
			i = unop(NOT, arg0, typnull);
			return(binop(AND, i, arg1, typnull));
		}

	case OR: /* 4u */
		return(binop(OR, arg0, arg1, typnull));

	case XOR: /* 4u */
		return(binop(XOR, arg0, arg1, typnull));

	case LAND: /* 4u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			/* use 8-bit EQ 0 to normalize fields before AND */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(EQ, i, immed64u((p64_t) 0ULL), typ8u);
			i = binop(ANDN,
				  i,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(EQ, j, immed64u((p64_t) 0ULL), typ8u);
			k = binop(ANDN, j, i, typnull);

			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			i = binop(EQ, i, immed64u((p64_t) 0ULL), typ8u);
			i = binop(ANDN,
				  i,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			j = binop(EQ, j, immed64u((p64_t) 0ULL), typ8u);
			j = binop(ANDN, j, i, typnull);
			return(binop(OR, j, k, typnull));
		} else {
			/* use shift/or seq. to normalize fields before AND */
			i = binop(SHR, arg0, immed64u((p64_t) 2ULL), typnull);
			i = binop(OR, arg0, i, typnull);
			j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x11)),
				  typnull);
			j = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHL, i, immed64u((p64_t) 2ULL), typnull);
			i = binop(OR, i, j, typnull);

			j = binop(SHR, arg1, immed64u((p64_t) 2ULL), typnull);
			j = binop(OR, arg1, j, typnull);
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			j = binop(OR, j, k, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x11)),
				  typnull);
			k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
			j = binop(OR, j, k, typnull);

			return(binop(AND, i, j, typnull));
		}

	case LOR: /* 4u */
		/* use 8-bit EQ 0 to normalize fields after ORing */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(OR, arg0, arg1, typnull);
			j = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			j = binop(EQ, j, immed64u((p64_t) 0ULL), typ8u);
			i = binop(EQ, i, immed64u((p64_t) 0ULL), typ8u);
			j = binop(ANDN,
				  j,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(ANDN,
				  i,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* use shift/or seq. to normalize fields before AND */
			i = binop(SHR, arg0, immed64u((p64_t) 2ULL), typnull);
			i = binop(OR, arg0, i, typnull);
			j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x11)),
				  typnull);
			j = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHL, i, immed64u((p64_t) 2ULL), typnull);
			i = binop(OR, i, j, typnull);

			j = binop(SHR, arg1, immed64u((p64_t) 2ULL), typnull);
			j = binop(OR, arg1, j, typnull);
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			j = binop(OR, j, k, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x11)),
				  typnull);
			k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
			j = binop(OR, j, k, typnull);

			return(binop(OR, i, j, typnull));
		}

	case EQ: /* 4u */
		if (optcpu & CPU_MMX) {
			/* use 8-bit EQ */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(EQ, i, j, typ8u);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			k = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			j = binop(EQ, j, k, typ8u);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* use 2-bit EQ */
			i = binop(EQ, arg0, arg1, typ2u);
			j = binop(SHL, i, immed64u((p64_t) 2ULL), typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xcc)),
				  typnull);
			i = binop(AND, i, j, typnull);
			j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
			return(binop(OR, i, j, typnull));
		}

	case NE: /* 4u */
		/* not EQ */
		return(unop(NOT, binop(EQ, arg0, arg1, typ4u), typnull));

	case LT: /* 4u */
		return(binop(GT, arg1, arg0, typ4u));

	case LE: /* 4u */
		return(binop(GE, arg1, arg0, typ4u));

	case GT: /* 4u */
		if (optcpu & CPU_MMX) {
			/* use 8-bit GT */

			/* Compare even fields */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(GT, i, j, typ8u);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);

			/* Compare odd fields */
			j = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			k = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			j = binop(GT, j, k, typ8u);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		} else {
			/* use 2-bit GT and EQ */
			i = binop(GT, arg0, arg1, typ2u);
			j = binop(EQ, arg0, arg1, typ2u);
			k = binop(SHL, i, immed64u((p64_t) 2ULL), typnull);
			j = binop(AND, j, k, typnull);
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xcc)),
				  typnull);
			j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
			return(binop(OR, i, j, typnull));
		}

	case GE: /* 4u */
		if (optcpu & CPU_MMX) {
			/* use 8-bit GE (yeah, I know it is not really there) */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(GE, i, j, typ8u);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			k = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			j = binop(GE, j, k, typ8u);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* use 2-bit GT and EQ */
			i = binop(GT, arg0, arg1, typ2u);
			j = binop(EQ, arg0, arg1, typ2u);
			k = binop(OR, i, j, typnull);
			k = binop(SHL, k, immed64u((p64_t) 2ULL), typnull);
			j = binop(AND, j, k, typnull);
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xcc)),
				  typnull);
			j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
			return(binop(OR, i, j, typnull));
		}

	case SHL: /* 4u */
		/* only a few cases: shift by 0, 1, 2, 3, or too big */
		arg1 = shiftconst(arg1, typ4u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:
				return(arg0);
			case 0x1ULL:
				/* Sneaky (Intel recommended) use of 32-bit
				   add...
				*/
				i = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x77)),
					  typnull);
				i = binop(ADD, i, i, typ32);
				return(i);
			case 0x2ULL:
				i = binop(SHL,
					  arg0,
					  immed64u((p64_t) 2ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0xcc)),
					  typnull);
				return(i);
			case 0x3ULL:
				i = binop(SHL,
					  arg0,
					  immed64u((p64_t) 3ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x88)),
					  typnull);
				return(i);
			default:
				return(immed64u((p64_t) 0ULL));
			}
		}
		/* shift by a vector is NYI */
		error("shift left of 4-bit field values only implemented for a"
		      " constant shift");
		return(immed64u((p64_t) 0ULL));

	case SHR: /* 4u */
		/* only a few cases: shift by 0, 1, 2, 3, or too big */
		arg1 = shiftconst(arg1, typ4u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:
				return(arg0);
			case 0x1ULL:
				i = binop(SHR,
					  arg0,
					  immed64u((p64_t) 1ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x77)),
					  typnull);
				return(i);
			case 0x2ULL:
				i = binop(SHR,
					  arg0,
					  immed64u((p64_t) 2ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x33)),
					  typnull);
				return(i);
			case 0x3ULL:
				i = binop(SHR,
					  arg0,
					  immed64u((p64_t) 3ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x11)),
					  typnull);
				return(i);
			default:
				return(immed64u((p64_t) 0ULL));
			}
		}
		/* shift by a vector is NYI */
		error("shift right of 4-bit field values only implemented for "
		      "a constant shift");
		return(immed64u((p64_t) 0ULL));

	case PACK: /* 4u */
		/* 8u -> 4u */
		/* PACK (without saturation) each 8-bit field value to
		   4 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		/* Pack arg0 */
		i = binop(AND, arg0, immedu(cvt1x8uto16x8u(0x0f)), typnull);
		j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
		i = binop(OR, i, j, typnull);	/* low 8 bits correct */

		/* Pack arg1 */
		k = binop(AND, arg1, immedu(cvt1x8uto16x8u(0x0f)), typnull);
		l = binop(SHR, k, immed64u((p64_t) 4ULL), typnull);
		k = binop(OR, k, l, typnull);	/* low 8 bits correct */

		/* Combine packed arg0 and arg1 in result */
		return(binop(PACK, i, k, typ8u));

	case INTRLVLOW: /* 4u */
		/* 2-bit to 4-bit interleave of 2-bit fields */
		/* another nasty shift and AND sequence... */
		{
			int bpf = bitsperfrag();

			/* high bit */
			j = arg1;
			switch (bitsperfrag()) {
			case 128:
				i = binop(AND,
					j,
					immed64u((p64_t) 0x00000000ffffffffULL),
					typnull);
				j = binop(AND,
					j,
					immed64u((p64_t) 0xffffffff00000000ULL),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 32ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 64:
				i = binop(AND,
					  j,
					  (bpf==64)?
						immed64u((p64_t)
							0x000000000000ffffULL):
					  immedu(cvt1x64uto2x64u(
						(p64_t)0x000000000000ffffULL)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==64)?
					      immed64u((p64_t)
							0x00000000ffff0000ULL):
					      immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffff0000ULL)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 16ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 32:
				i = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x000000ffULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x000000ff)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x0000ff00ULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x0000ff00)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 8ULL),
					  typnull);
				j = binop(OR, i, j, typnull);


				i = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x000f)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x00f0)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 4ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

				i = binop(AND,
					  j,
					  immedu(cvt1x8uto16x8u(0x03)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x8uto16x8u(0x0c)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 2ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
			}
			k = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);


			/* low bit */
			j = arg0;
			switch (bitsperfrag()) {
			case 128:
				i = binop(AND,
					j,
					immed64u((p64_t) 0x00000000ffffffffULL),
					typnull);
				j = binop(AND,
					j,
					immed64u((p64_t) 0xffffffff00000000ULL),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 32ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 64:
				i = binop(AND,
					  j,
					  (bpf==64)?
						immed64u((p64_t)
							0x000000000000ffffULL):
					  immedu(cvt1x64uto2x64u(
						(p64_t)0x000000000000ffffULL)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==64)?
					      immed64u((p64_t)
							0x00000000ffff0000ULL):
					      immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffff0000ULL)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 16ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 32:
				i = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x000000ffULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x000000ff)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x0000ff00ULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x0000ff00)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 8ULL),
					  typnull);
				j = binop(OR, i, j, typnull);


				i = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x000f)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x00f0)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 4ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

				i = binop(AND,
					  j,
					  immedu(cvt1x8uto16x8u(0x03)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x8uto16x8u(0x0c)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 2ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
			}
		}

		return(binop(OR, i, k, typnull));

	case INTRLVHIGH: /* 4u */
		/* 2-bit to 4-bit interleave of 2-bit fields */
		/* sneaky way to reuse INTRLVLOW code... */
		{
			unsigned long long bpf_2 =
				(unsigned long long) bitsperfrag()/2ULL;
			i = binop(SHR, arg0, immed64u((p64_t) bpf_2), typnull);
			j = binop(SHR, arg1, immed64u((p64_t) bpf_2), typnull);
			return(binop(INTRLVLOW, i, j, typ4u));
		}

	default:
		/* Bug out if we failed to handle some operation */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop4u op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

int
binop4us(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 4-bit unsigned field binary ops with saturation */
	/* Convert the op into a tree of sub-ops if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.

	   op         operation token (ADD, SUB, MUL, ...)
	   arg0, arg1 operand tuples (arg1 is used as an index into tup[])
	   decl_bits  declared precision; when it is not 4, the ADD, SUB,
	              MUL, DIV and MOD results are additionally clamped to
	              (1 << decl_bits) - 1 with a MIN:4u (the GenericIA32
	              and AltiVec MUL paths are exceptions, see below)

	   Returns the tuple holding the result.  An op that is not handled
	   here is reported through bug() and -1 is returned.
	*/

	register int i, j, k, l, m, n;

	switch (op) {
	case MIN: /* 4us */
	case MAX: /* 4us */
	case AVG: /* 4us */

	case NE: /* 4us */
	case EQ: /* 4us */
	case GT: /* 4us */
	case LT: /* 4us */
	case LE: /* 4us */
	case GE: /* 4us */

	case LAND: /* 4us */
	case LOR: /* 4us */

	case AND: /* 4us */
	case XOR: /* 4us */
	case OR: /* 4us */
	case ANDN: /* 4us */
		/* These are all the same as unsigned unsaturated */
		return(binop(op, arg0, arg1, typ4u));

	case ADD: /* 4us */
		/* use implicit spacer technique */
		/* Clear the top bit of every field so that the wide add
		   below cannot carry from one field into the next */
		i = binop(AND, arg0, immedu(cvt1x8uto16x8u(0x77)), typnull);
		j = binop(AND, arg1, immedu(cvt1x8uto16x8u(0x77)), typnull);

		/* Use largest supported unsigned add */
		if (optcpu & CPU_MMX) {
			i = binop(ADD, i, j, typ32u);
		} else if (optcpu & CPU_MAX) {
			i = binop(ADD, i, j, typ16u);
		} else {
			i = binop(ADD, i, j, typ32u);
		}

		/* Fold the top bits of the operands back into the sum */
		j = binop(XOR, arg0, arg1, typnull);	/* Aka "propagate" */
		k = binop(AND, j, immedu(cvt1x8uto16x8u(0x88)), typnull);
		i = binop(XOR, i, k, typnull);


		/* Calculate overflow */
		/* Ripple the generate bits upward through the propagate
		   bits, one bit position per step; after three steps the
		   top bit of each field holds that field's carry out */
		k = binop(AND, arg0, arg1, typnull);	/* generate */
		l = k;

		k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
		k = binop(AND, k, j, typnull);
		l = binop(OR, l, k, typnull);

		k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
		k = binop(AND, k, j, typnull);
		l = binop(OR, l, k, typnull);

		k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
		k = binop(AND, k, j, typnull);
		l = binop(OR, l, k, typnull);

		/* Keep only the carry out of each field */
		l = binop(AND, l, immedu(cvt1x8uto16x8u(0x88)), typnull);

		/* ...and create a saturation mask */
		/* (copy the top bit of each field into its lower 3 bits) */
		k = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
		l = binop(OR, k, l, typnull);

		/* Clobber the calculated value with max on overflow */
		i = binop(OR, l, i, typnull);

		/* Saturate to the declared precision */
		if (decl_bits != 4) {
			i = binop(MIN,
				  i,
				  immedu(cvt1x4uto32x4u((1 << decl_bits) - 1)),
				  typ4u);
		}
		return (i);

	case SUB: /* 4us */
		/* Do signed subtract as usual, using implicit spacer
		   technique */
		/* Set the top bit of each minuend field and clear it in the
		   subtrahend so that no borrow can leave a field */
		i = binop(OR, arg0, immedu(cvt1x8uto16x8u(0x88)), typnull);
		j = binop(AND, arg1, immedu(cvt1x8uto16x8u(0x77)), typnull);

		/* Use largest supported unsigned sub */
		if (optcpu & CPU_MMX) {
			i = binop(SUB, i, j, typ32u);
		} else if (optcpu & CPU_MAX) {
			i = binop(SUB, i, j, typ16u);
		} else {
			i = binop(SUB, i, j, typ32u);
		}

		/* Correct the top bit of each field of the difference */
		j = binop(XOR, arg0, arg1, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x88)), typnull);
		j = binop(XOR, j, immedu(cvt1x8uto16x8u(0x88)), typnull);
		i = binop(XOR, i, j, typnull);

#ifdef NOTDEFD
		/* For emulation, this may be better than emulating 8-bit
		   GT, which is what 4-bit GT uses */
		/* Now calculate borrows */
		/* abcd - efgh.  Sat. to 0 if:
		   ~ae | ~bef | ~a~bf | (~(a^e)) (~(b^f)) (~cg|~c~dh|~dgh)
		*/
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			j = binop(ANDN, arg0, arg1, typnull);
		} else {
			j = unop(NOT, arg0, typnull);
			j = binop(AND, j, arg1, typnull);
		} /* ~ae:~bf:~cg:~dh */

		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
							/* ~bf:~cg:~dh:X */
		l = binop(AND, arg1, k, typnull);	/* ~bfe:~cgf:~dhg:X */
		j = binop(OR, j, l, typnull);		/* terms 1 and 2 */

		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			m = binop(ANDN, arg0, k, typnull);
		} else {
			m = unop(NOT, arg0, typnull);
			m = binop(AND, m, k, typnull);
		} /* ~a~bf:~b~cg:~c~dh:X */

		j = binop(OR, j, m, typnull);		/* terms 1-3 */

		n = binop(XOR, arg0, arg1, typnull);    /* a^e:b^f:c^g:d^h */
		n = unop(NOT, n, typnull);     /* ~(a^e):~(b^f):~(c^g):~(d^h) */

		k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
		m = binop(SHL, m, immed64u((p64_t) 2ULL), typnull);
		l = binop(SHL, l, immed64u((p64_t) 2ULL), typnull);
		k = binop(OR, k, m, typnull);
		k = binop(OR, k, l, typnull);

		k = binop(AND, k, n, typnull);
		n = binop(SHL, n, immed64u((p64_t) 1ULL), typnull);
		k = binop(AND, k, n, typnull);

		j = binop(OR, j, k, typnull);		/* terms 1-4 */
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x88)), typnull);

		/* ...and create a saturation mask */
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(OR, k, j, typnull);
		j = unop(NOT, j, typnull);

#else
		/* Create a saturation mask */
		/* (fields where arg1 > arg0 would go negative) */
		j = binop(GT, arg1, arg0, typ4u);
		j = unop(NOT, j, typnull);
#endif

		/* Clobber the calculated value with 0 on negative overflow */
		i = binop(AND, i, j, typnull);

		/* Saturate to the declared precision */
		if (decl_bits != 4) {
			i = binop(MIN,
				  i,
				  immedu(cvt1x4uto32x4u((1 << decl_bits) - 1)),
				  typ4u);
		}
		return (i);

	case MUL: /* 4us */
#ifdef NEVER
		/* This might be good for an arch with MUL8u, but not INTRLVs */
		unsigned long long element;
		unsigned long long mask;

		/* INTRLVLOW arg0 with 0 */
		i = immed64u((p64_t)0ULL);
		for (element=0ULL, mask=0xffULL;
		     element<4ULL;
		     ++element, mask=mask<<8ULL)
		{
			j = binop(AND, arg0, immed64u((p64_t)mask), typnull);
			j = binop(SHL,
				  j,
				  immed64u((p64_t)(8ULL*element)),
				  typnull);
			i = binop(OR, i, j, typnull);
		}

		/* INTRLVLOW arg1 with 0 */
		j = immed64u((p64_t)0ULL);
		for (element=0ULL, mask=0xffULL;
		     element<4ULL;
		     ++element, mask=mask<<8ULL)
		{
			k = binop(AND, arg1, immed64u((p64_t)mask), typnull);
			k = binop(SHL,
				  k,
				  immed64u((p64_t)(8ULL*element)),
				  typnull);
			j = binop(OR, j, k, typnull);
		}

		k = binop(MUL, i, j, typ8u);


		/* INTRLVHIGH arg0 with 0 */
		i = immed64u((p64_t)0ULL);
		for (element=0ULL, mask=0xff00000000000000ULL;
		     element<4ULL;
		     ++element, mask=mask>>8ULL)
		{
			j = binop(AND, arg0, immed64u((p64_t)mask), typnull);
			j = binop(SHL,
				  j,
				  immed64u((p64_t)(8ULL*element+8ULL)),
				  typnull);
			i = binop(OR, i, j, typnull);
		}

		/* INTRLVHIGH arg1 with 0 */
		j = immed64u((p64_t)0ULL);
		for (element=0ULL, mask=0xff00000000000000ULL;
		     element<4ULL;
		     ++element, mask=mask>>8ULL)
		{
			k = binop(AND, arg1, immed64u((p64_t)mask), typnull);
			k = binop(SHL,
				  k,
				  immed64u((p64_t)(8ULL*element+8ULL)),
				  typnull);
			j = binop(OR, j, k, typnull);
		}

		i = binop(MUL, i, j, typ8u);

		i = binop(PACK, k, i, typ4us);

		/* Saturate to the declared precision */
		if (decl_bits != 4) {
			i = binop(MIN,
				  i,
				  immedu(cvt1x4uto32x4u((1 << decl_bits) - 1)),
				  typ4u);
		}
		return (i);
#endif
	    if (optcpu == GenericIA32) {
		/* Serialize */
		/* Handle the eight 4-bit fields of a 32-bit fragment one at
		   a time: move the field to the bottom of the word, multiply
		   as 32u, clamp to 0xf, and shift the result back into the
		   field's own position */
		unsigned long long element;
		i = immed64u((p64_t)0ULL);
		for (element=0ULL; element<8ULL; ++element) {
			j = binop(SHL,
				  arg0,
				  immed64u((p64_t)(4ULL*element)),
				  typnull);
			j = binop(SHR, j, immed64u((p64_t)28ULL), typnull);
			k = binop(SHL,
				  arg1,
				  immed64u((p64_t)(4ULL*element)),
				  typnull);
			k = binop(SHR, k, immed64u((p64_t)28ULL), typnull);
			j = binop(MUL, j, k, typ32u);
			j = binop(MIN, j, immed64u((p64_t)0xfULL), typ32u);
			j = binop(SHL,
				  j,
				  immed64u((p64_t)(28ULL-4ULL*element)),
				  typnull);
			i = binop(OR, i, j, typnull);
		}
#ifdef NOTYET
		/* Saturate to the declared precision */
		if (decl_bits != 4) {
			i = binop(MIN,
				  i,
				  immedu(cvt1x4uto32x4u((1 << decl_bits) - 1)),
				  typ4u);
		}
#endif
		return(i);
	    } else if (optcpu & CPU_AltiVec) {
		/* AltiVec has even and odd 8u->16u multiplies */
		/* NOTE(review): decl_bits is not applied on this path */
		int o, p;

		/* Do evens */
		i = binop(AND,
			  arg0,
			  immedu(cvt1x8uto16x8u(0x0f)),
			  typnull);
		j = binop(AND,
			  arg1,
			  immedu(cvt1x8uto16x8u(0x0f)),
			  typnull);
		k = binop(MULEVEN, i, j, typ8u);
		k = binop(MIN, k, immedu(cvt1x16uto8x16u(0x000f)), typ16u);
		l = binop(MULODD, i, j, typ8u);
		l = binop(MIN, l, immedu(cvt1x16uto8x16u(0x000f)), typ16u);
		l = binop(SHL, l, immed64u((p64_t) 8ULL), typnull);
		k = binop(OR, k, l, typnull);

		/* Do odds */
		m = binop(AND,
			  arg0,
			  immedu(cvt1x8uto16x8u(0xf0)),
			  typnull);
		m = binop(SHR, m, immed64u((p64_t) 4ULL), typnull);
		n = binop(AND,
			  arg1,
			  immedu(cvt1x8uto16x8u(0xf0)),
			  typnull);
		n = binop(SHR, n, immed64u((p64_t) 4ULL), typnull);
		o = binop(MULEVEN, m, n, typ8u);
		o = binop(MIN, o, immedu(cvt1x16uto8x16u(0x000f)), typ16u);
		p = binop(MULODD, m, n, typ8u);
		p = binop(MIN, p, immedu(cvt1x16uto8x16u(0x000f)), typ16u);
		p = binop(SHL, p, immed64u((p64_t) 8ULL), typnull);
		o = binop(OR, o, p, typnull);
		o = binop(SHL, o, immed64u((p64_t) 4ULL), typnull);

		return(binop(OR, k, o, typnull));

	    } else if (optcpu & CPU_MAX) {
		/* Widen each half of the operands to 8-bit fields by
		   interleaving with zero, multiply as 8u, then let the
		   saturating 4us PACK narrow the products again */
		i = binop(INTRLVLOW,
			  arg0,
			  immed64u((p64_t) 0ULL),
			  typ8u);
		j = binop(INTRLVLOW,
			  arg1,
			  immed64u((p64_t) 0ULL),
			  typ8u);
		k = binop(MUL, i, j, typ8u);

		i = binop(INTRLVHIGH,
			  arg0,
			  immed64u((p64_t) 0ULL),
			  typ8u);
		j = binop(INTRLVHIGH,
			  arg1,
			  immed64u((p64_t) 0ULL),
			  typ8u);
		i = binop(MUL, i, j, typ8u);

		i = binop(PACK, k, i, typ4us);

		/* Saturate to the declared precision */
		if (decl_bits != 4) {
			i = binop(MIN,
				  i,
				  immedu(cvt1x4uto32x4u((1 << decl_bits) - 1)),
				  typ4u);
		}
		return (i);
	    } else {
		/* use 8-bit multiplies */
		/* Low nibble of each byte: only the low 4 bits of the
		   product are kept */
		i = binop(MUL, arg0, arg1, typ8u);
		i = binop(AND, i, immedu(cvt1x8uto16x8u(0x0f)), typnull);

		/* High nibble of each byte: shift down, multiply, keep the
		   low 4 bits, and shift back up */
		j = binop(SHR, arg0, immed64u((p64_t) 4ULL), typ8u);
		k = binop(SHR, arg1, immed64u((p64_t) 4ULL), typ8u);
		j = binop(MUL, j, k, typ8u);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x0f)), typnull);

		j = binop(SHL, j, immed64u((p64_t) 4ULL), typ8u);
		i = binop(OR, i, j, typnull);

		/* Now calculate a saturation mask */
		/* ae | bf | a~bf | eb~f | ~bce | a~fg | cgdf | cgbh */
		/* (fields are abcd * efgh, a and e being the top bits;
		   each term is accumulated in the top bit of its field) */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			j = binop(AND, arg0, arg1, typnull);
							/* ae : bf : cg : dh */
			k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
							/* bf : cg : dh : X  */
			j = binop(OR, j, k, typnull);	/* terms 1-2 */

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
							/* cg : dh : X  : X  */

			l = binop(ANDN, arg0, arg1, typnull);
							/* ~ae: ~bf: ~cg: ~dh*/
			l = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* ~bf: ~cg: ~dh: X  */
			l = binop(AND, l, arg0, typnull);
							/* a~bf: X : X  : X  */
			j = binop(OR, j, l, typnull);	/* terms 1-3 */

			l = binop(ANDN, arg1, arg0, typnull);
							/* a~e: b~f: c~g: d~h*/
			l = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* b~f: c~g: d~h: X  */
			l = binop(AND, l, arg1, typnull);
							/* eb~f: X : X  : X  */
			j = binop(OR, j, l, typnull);	/* terms 1-4 */

			l = binop(SHL, arg0, immed64u((p64_t) 1ULL), typnull);
							/* b  : c  : d  : X  */
			m = binop(ANDN, l, arg1, typnull);
							/* ~be: ~cf: ~dg: X  */
			l = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* c  : d  : X  : X  */
			m = binop(AND, l, m, typnull);
							/* ~bce: X : X  : X  */
			j = binop(OR, j, m, typnull);	/* terms 1-5 */

			m = binop(SHL, arg1, immed64u((p64_t) 1ULL), typnull);
							/* f  : g  : h  : X  */
			n = binop(ANDN, m, arg0, typnull);
							/* ~fa: ~gb: ~hc: X  */
			m = binop(SHL, m, immed64u((p64_t) 1ULL), typnull);
							/* g  : h  : X  : X  */
			n = binop(AND, m, n, typnull);
							/* a~fg: X : X  : X  */
			j = binop(OR, j, n, typnull);	/* terms 1-6 */

			l = binop(AND, l, arg1, typnull);
							/* ce : df : X  : X  */
			m = binop(AND, m, arg0, typnull);
							/* ag : bh : X  : X  */
			l = binop(OR, l, m, typnull);	/* X  : df|bh :X: X  */
			l = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* df|bh :X: X  : X  */
			k = binop(AND, k, l, typnull);		/* terms 7-8 */
			j = binop(OR, j, k, typnull);		/* terms 1-8 */

			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x88)),
				  typnull);

			/* ...and create a saturation mask */
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			j = binop(OR, k, j, typnull);
			k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
			j = binop(OR, k, j, typnull);
		} else {
			/* Same terms, with each ANDN spelled as NOT + AND */
			j = binop(AND, arg0, arg1, typnull);
							/* ae : bf : cg : dh */
			k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
							/* bf : cg : dh : X  */
			j = binop(OR, j, k, typnull);	/* terms 1-2 */

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
							/* cg : dh : X  : X  */

			l = unop(NOT, arg0, typnull);
			l = binop(AND, l, arg1, typnull);
							/* ~ae: ~bf: ~cg: ~dh*/
			l = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* ~bf: ~cg: ~dh: X  */
			l = binop(AND, l, arg0, typnull);
							/* a~bf: X : X  : X  */
			j = binop(OR, j, l, typnull);	/* terms 1-3 */

			l = unop(NOT, arg1, typnull);
			l = binop(AND, l, arg0, typnull);
							/* a~e: b~f: c~g: d~h*/
			l = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* b~f: c~g: d~h: X  */
			l = binop(AND, l, arg1, typnull);
							/* eb~f: X : X  : X  */
			j = binop(OR, j, l, typnull);	/* terms 1-4 */

			l = binop(SHL, arg0, immed64u((p64_t) 1ULL), typnull);
							/* b  : c  : d  : X  */
			m = unop(NOT, l, typnull);
			m = binop(AND, m, arg1, typnull);
							/* ~be: ~cf: ~dg: X  */
			l = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* c  : d  : X  : X  */
			m = binop(AND, l, m, typnull);
							/* ~bce: X : X  : X  */
			j = binop(OR, j, m, typnull);	/* terms 1-5 */

			m = binop(SHL, arg1, immed64u((p64_t) 1ULL), typnull);
							/* f  : g  : h  : X  */
			n = unop(NOT, m, typnull);
			n = binop(AND, n, arg0, typnull);
							/* ~fa: ~gb: ~hc: X  */
			m = binop(SHL, m, immed64u((p64_t) 1ULL), typnull);
							/* g  : h  : X  : X  */
			n = binop(AND, m, n, typnull);
							/* a~fg: X : X  : X  */
			j = binop(OR, j, n, typnull);	/* terms 1-6 */

			l = binop(AND, l, arg1, typnull);
							/* ce : df : X  : X  */
			m = binop(AND, m, arg0, typnull);
							/* ag : bh : X  : X  */
			l = binop(OR, l, m, typnull);	/* X  : df|bh :X: X  */
			l = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* df|bh :X: X  : X  */
			k = binop(AND, k, l, typnull);		/* terms 7-8 */
			j = binop(OR, j, k, typnull);		/* terms 1-8 */

			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x88)),
				  typnull);

			/* ...and create a saturation mask */
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			j = binop(OR, k, j, typnull);
			k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
			j = binop(OR, k, j, typnull);
		}

		/* Clobber the calculated value with MAX on overflow */
		i = binop(OR, i, j, typnull);

		/* Saturate to the declared precision */
		if (decl_bits != 4) {
			i = binop(MIN,
				  i,
				  immedu(cvt1x4uto32x4u((1 << decl_bits) - 1)),
				  typ4u);
		}
		return (i);
	    }
		/* NOTREACHED: every MUL branch above returns */

	case DIV: /* 4us */
		/* use 8-bit divides */
		/* Low nibble of each byte */
		i = binop(AND, arg0, immedu(cvt1x8uto16x8u(0x0f)), typnull);
		j = binop(AND, arg1, immedu(cvt1x8uto16x8u(0x0f)), typnull);
		i = binop(DIV, i, j, typ8u);
		/* High nibble of each byte (AltiVec takes the shift count
		   as a per-field immediate) */
		if (optcpu & CPU_AltiVec) {
			j = binop(SHR,
				  arg0,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8u);
			k = binop(SHR,
				  arg1,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8u);
		} else {
			j = binop(SHR, arg0, immed64u((p64_t) 4ULL), typ8u);
			k = binop(SHR, arg1, immed64u((p64_t) 4ULL), typ8u);
		}
		j = binop(DIV, j, k, typ8u);
		if (optcpu & CPU_AltiVec) {
			j = binop(SHL,
				  j,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8u);
		} else {
			j = binop(SHL, j, immed64u((p64_t) 4ULL), typ8u);
		}
		i = binop(OR, i, j, typnull);

		/* Generate a saturation mask (~e~f~g~h) */
		/* The inverted divisor bits are ANDed together using LEFT
		   shifts so that the result for a field ends up in that
		   field's own top bit, which is the bit sampled by the 0x88
		   mask below.  (Right shifts would gather it in the bottom
		   bit and leave the top bit polluted by the next field.) */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);

		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x88)), typnull);

		/* Copy the top bit of each field into its lower 3 bits */
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(OR, k, j, typnull);

		/* Clobber the calculated value with MAX if arg1==0 */
		i = binop(OR, i, j, typnull);

		/* Saturate to the declared precision */
		if (decl_bits != 4) {
			i = binop(MIN,
				  i,
				  immedu(cvt1x4uto32x4u((1 << decl_bits) - 1)),
				  typ4u);
		}
		return (i);

	case MOD: /* 4us */
		/* use 8-bit modulus */
		/* Low nibble of each byte */
		i = binop(AND, arg0, immedu(cvt1x8uto16x8u(0x0f)), typnull);
		j = binop(AND, arg1, immedu(cvt1x8uto16x8u(0x0f)), typnull);
		i = binop(MOD, i, j, typ8u);
		/* High nibble of each byte */
		if (optcpu & CPU_AltiVec) {
			j = binop(SHR,
				  arg0,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8u);
			k = binop(SHR,
				  arg1,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8u);
		} else {
			j = binop(SHR, arg0, immed64u((p64_t) 4ULL), typ8u);
			k = binop(SHR, arg1, immed64u((p64_t) 4ULL), typ8u);
		}
		j = binop(MOD, j, k, typ8u);
		if (optcpu & CPU_AltiVec) {
			j = binop(SHL,
				  j,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8u);
		} else {
			j = binop(SHL, j, immed64u((p64_t) 4ULL), typ8u);
		}
		i = binop(OR, i, j, typnull);

		/* Generate a saturation mask */
		/* As for DIV: AND the inverted divisor bits together with
		   LEFT shifts so each field's "divisor is zero" flag lands
		   in the field's own top bit before the 0x88 mask */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);

		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x88)), typnull);

		/* Copy the top bit of each field into its lower 3 bits */
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(OR, k, j, typnull);

		/* Clobber the calculated value with MAX if arg1==0 */
		i = binop(OR, i, j, typnull);

		/* Saturate to the declared precision */
		if (decl_bits != 4) {
			i = binop(MIN,
				  i,
				  immedu(cvt1x4uto32x4u((1 << decl_bits) - 1)),
				  typ4u);
		}
		return (i);

	case SHL: /* 4us */
		/* only a few cases: shift by 0, 1, 2, 3, or too big */
		arg1 = shiftconst(arg1, typ4u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:
				return(arg0);
			case 0x1ULL:
				/* Sneaky (Intel recommended) use of 32-bit
				   add...
				*/
				/* Clear the top bit of each field first so
				   that doubling (or shifting) cannot push it
				   into the neighboring field */
				i = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x77)),
					  typnull);
				if (optcpu & CPU_MMX) {
					i = binop(ADD, i, i, typ32);
				} else if (optcpu & CPU_MAX) {
					i = binop(ADD, i, i, typ16);
				} else {
					/* Shift the masked value, not the
					   raw arg0 */
					i = binop(SHL,
						  i,
						  immed64u((p64_t) 1ULL),
						  typnull);
				}
				return(i);
			case 0x2ULL:
				/* Shift everything, then drop the bits that
				   arrived from the neighboring field */
				i = binop(SHL,
					  arg0,
					  immed64u((p64_t) 2ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0xcc)),
					  typnull);
				return(i);
			case 0x3ULL:
				i = binop(SHL,
					  arg0,
					  immed64u((p64_t) 3ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x88)),
					  typnull);
				return(i);
			default:
				/* Shift count >= field width */
				return(immed64u((p64_t) 0ULL));
			}
		}
		/* shift by a vector is NYI */
		error("shift left of 4-bit field values only implemented for a"
		      " constant shift");
		return(immed64u((p64_t) 0ULL));

	case SHR: /* 4us */
		/* only a few cases: shift by 0, 1, 2, 3, or too big */
		arg1 = shiftconst(arg1, typ4u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			/* Shift everything, then drop the bits that arrived
			   from the neighboring field */
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:
				return(arg0);
			case 0x1ULL:
				i = binop(SHR,
					  arg0,
					  immed64u((p64_t) 1ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x77)),
					  typnull);
				return(i);
			case 0x2ULL:
				i = binop(SHR,
					  arg0,
					  immed64u((p64_t) 2ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x33)),
					  typnull);
				return(i);
			case 0x3ULL:
				i = binop(SHR,
					  arg0,
					  immed64u((p64_t) 3ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x11)),
					  typnull);
				return(i);
			default:
				/* Shift count >= field width */
				return(immed64u((p64_t) 0ULL));
			}
		}
		/* shift by a vector is NYI */
		error("shift right of 4-bit field values only implemented for "
		      "a constant shift");
		return(immed64u((p64_t) 0ULL));

	case PACK: /* 4us */
		/* 8us -> 4us */
		/* PACK (with unsigned saturation) each 8-bit field value to
		   4 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		/* Pack arg0 */
		/* Calculate the saturated values, then pack as unsaturated */
		/* OR the upper four bits of each byte together into bit 4,
		   move that flag to bit 3, smear it over bits 3..0, and OR
		   it into the value so an out-of-range byte reads 0xf in
		   its low nibble */
		i = binop(SHR, arg0, immed64u((p64_t) 2ULL), typnull);
		i = binop(OR, arg0, i, typnull);
		j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, i, j, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x10)), typnull);
		j = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		i = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
		i = binop(OR, i, j, typnull);
		i = binop(OR, arg0, i, typnull);

		/* Pack arg1 */
		/* Calculate the saturated values, then pack as unsaturated */
		k = binop(SHR, arg1, immed64u((p64_t) 2ULL), typnull);
		k = binop(OR, arg1, k, typnull);
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
		l = binop(OR, k, l, typnull);
		l = binop(AND, l, immedu(cvt1x8uto16x8u(0x10)), typnull);
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
		k = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
		k = binop(OR, k, l, typnull);
		l = binop(SHR, k, immed64u((p64_t) 2ULL), typnull);
		k = binop(OR, k, l, typnull);
		k = binop(OR, arg1, k, typnull);

		/* Combine packed arg0 and arg1 in result */
		return(binop(PACK, i, k, typ4u));

	case INTRLVLOW: /* 4us */
	case INTRLVHIGH: /* 4us */
		/* These are the same as unsigned */
		return(binop(op, arg0, arg1, typ4u));

	default:
		/* Bug out if we failed to handle some operation */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop4us op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

int
binop4(int op,
int arg0,
int arg1)
{
	/* 4-bit signed field binary ops */
	/* Convert the op into a tree of sub-ops if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.

	   op is the operation token; arg0 and arg1 are the operand tuples
	   (arg1 is used as an index into tup[]).  Returns the tuple holding
	   the result.  An op that is not handled here is reported through
	   bug() and -1 is returned.
	*/

	register int i, j, k;

	switch (op) {
	case ADD: /* 4s */
	case MUL: /* 4s */
	case AND: /* 4s */
	case NE: /* 4s */
	case EQ: /* 4s */
	case SHL: /* 4s */
	case LAND: /* 4s */
	case LOR: /* 4s */
	case SUB: /* 4s */
	case XOR: /* 4s */
	case OR: /* 4s */
	case ANDN: /* 4s */
	case PACK: /* 4s */
		/* These are all the same as unsigned */
		return(binop(op, arg0, arg1, typ4u));

	case INTRLVLOW: /* 4s */
	case INTRLVHIGH: /* 4s */
		/* 2-bit to 4-bit interleave of 2-bit fields */
		/* These are the same as unsigned */
		return(binop(op, arg0, arg1, typ4u));

	case DIV: /* 4s */
		if (optcpu == GenericIA32) {
		    unsigned long long element;

		    /* Serialize */
		    /* One field per iteration: shift the field to the top of
		       the word, shift it down by 28 as a 32-bit signed value,
		       divide as 32-bit signed, then move the low 4 bits of
		       the quotient back to the field's position */
		    i = immed64u((p64_t)0ULL);
		    for (element=0ULL; element<8ULL; ++element) {
			j = binop(SHL,
				  arg0,
				  immed64u((p64_t)(4ULL*element)),
				  typnull);
			j = binop(SHR, j, immed64u((p64_t)28ULL), typ32);
			k = binop(SHL,
				  arg1,
				  immed64u((p64_t)(4ULL*element)),
				  typnull);
			k = binop(SHR, k, immed64u((p64_t)28ULL), typ32);
			j = binop(DIV, j, k, typ32);
			j = binop(SHL, j, immed64u((p64_t)28ULL), typnull);
			j = binop(SHR,
				  j,
				  immed64u((p64_t)(4ULL*element)),
				  typnull);
			i = binop(OR, i, j, typnull);
		    }
		    return(i);

		} else if (optcpu & CPU_AltiVec) {
			/* use 8-bit divides */
			/* Same scheme as the generic branch below, but the
			   8-bit shift counts are passed as per-field
			   immediates */

			/* Divide even fields (sign-extended to 8 bits) */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(SHL, i, immed64u((p64_t) 4ULL), typnull);
			i = binop(SHR,
				  i,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(SHL,
				  j,
				  immed64u((p64_t) 4ULL),
				  typnull);
			j = binop(SHR, j, immedu(cvt1x8uto16x8u(4)), typ8);
			i = binop(DIV, i, j, typ8);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
	
			/* Divide odd fields */
			j = binop(SHR,
				  arg0,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8);
			k = binop(SHR,
				  arg1,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8);
			j = binop(DIV, j, k, typ8);
			j = binop(SHL,
				  j,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));

		} else {
			/* use 8-bit divides */

			/* Divide even fields (sign-extended to 8 bits) */
			/* Isolate the low nibble, move it to the top of the
			   byte, and shift it back down as a signed 8-bit
			   value */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(SHL, i, immed64u((p64_t) 4ULL), typnull);
			i = binop(SHR, i, immed64u((p64_t) 4ULL), typ8);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
			j = binop(SHR, j, immed64u((p64_t) 4ULL), typ8);
			i = binop(DIV, i, j, typ8);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
	
			/* Divide odd fields */
			/* The signed 8-bit shift brings the high nibble down;
			   the quotient is shifted back up and masked */
			j = binop(SHR, arg0, immed64u((p64_t) 4ULL), typ8);
			k = binop(SHR, arg1, immed64u((p64_t) 4ULL), typ8);
			j = binop(DIV, j, k, typ8);
			j = binop(SHL, j, immed64u((p64_t) 4ULL), typ8);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		}

	case MOD: /* 4s */
		/* Structured exactly like DIV above, with MOD as the
		   wide operation */
		if (optcpu == GenericIA32) {
		    unsigned long long element;

		    /* Serialize */
		    i = immed64u((p64_t)0ULL);
		    for (element=0ULL; element<8ULL; ++element) {
			j = binop(SHL,
				  arg0,
				  immed64u((p64_t)(4ULL*element)),
				  typnull);
			j = binop(SHR, j, immed64u((p64_t)28ULL), typ32);
			k = binop(SHL,
				  arg1,
				  immed64u((p64_t)(4ULL*element)),
				  typnull);
			k = binop(SHR, k, immed64u((p64_t)28ULL), typ32);
			j = binop(MOD, j, k, typ32);
			j = binop(SHL, j, immed64u((p64_t)28ULL), typnull);
			j = binop(SHR,
				  j,
				  immed64u((p64_t)(4ULL*element)),
				  typnull);
			i = binop(OR, i, j, typnull);
		    }
		    return(i);
		} else if (optcpu & CPU_AltiVec) {
			/* use 8-bit modulus */

			/* Take mod of even fields (sign-extended to 8 bits) */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(SHL, i, immed64u((p64_t) 4ULL), typnull);
			i = binop(SHR,
				  i,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
			j = binop(SHR,
				  j,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8);
			i = binop(MOD, i, j, typ8);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);

			/* Take mod of odd fields */
			j = binop(SHR,
				  arg0,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8);
			k = binop(SHR,
				  arg1,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8);
			j = binop(MOD, j, k, typ8);
			j = binop(SHL,
				  j,
				  immedu(cvt1x8uto16x8u(4)),
				  typ8);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		} else {
			/* use 8-bit modulus */

			/* Take mod of even fields (sign-extended to 8 bits) */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(SHL, i, immed64u((p64_t) 4ULL), typnull);
			i = binop(SHR, i, immed64u((p64_t) 4ULL), typ8);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
			j = binop(SHR, j, immed64u((p64_t) 4ULL), typ8);
			i = binop(MOD, i, j, typ8);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);

			/* Take mod of odd fields */
			j = binop(SHR, arg0, immed64u((p64_t) 4ULL), typ8);
			k = binop(SHR, arg1, immed64u((p64_t) 4ULL), typ8);
			j = binop(MOD, j, k, typ8);
			j = binop(SHL, j, immed64u((p64_t) 4ULL), typ8);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		}
	case AVG: /* 4s */
		/* Average rounds up */

		/* Divide by 2 */
		i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typ4);
		j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typ4);

		/* Add */
		i = binop(ADD, i, j, typ4);

		/* Round upward */
		/* (add 1 to a field when either operand had its low bit set) */
		j = binop(OR, arg0, arg1, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x11)), typnull);
		i = binop(ADD, i, j, typ4);
		return(i);

	case MIN: /* 4s */
		/* use GT */
		/* The GT result is used as a per-field select mask; targets
		   without MMX/MAX spell the ANDN as NOT followed by AND */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			/* ((x>y) & y) | (~(x>y) & x) */
			i = binop(GT, arg0, arg1, typ4);
			j = binop(AND, i, arg1, typnull);
			i = binop(ANDN, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			i = binop(GT, arg0, arg1, typ4);
			j = binop(AND, i, arg1, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		}

	case MAX: /* 4s */
		/* use GT */
		/* Same selection as MIN with the operands swapped */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			/* ((x>y) & x) | (~(x>y) & y) */
			i = binop(GT, arg0, arg1, typ4);
			j = binop(AND, i, arg0, typnull);
			i = binop(ANDN, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			i = binop(GT, arg0, arg1, typ4);
			j = binop(AND, i, arg0, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		}

	case LT: /* 4s */
		/* x < y is y > x */
		return(binop(GT, arg1, arg0, typ4));

	case LE: /* 4s */
		/* x <= y is y >= x */
		return(binop(GE, arg1, arg0, typ4));

	case GT: /* 4s */
		if (optcpu & CPU_MMX) {
			/* use 8-bit GT */

			/* Compare even fields (sign-extended to 8 bits) */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(SHL, i, immed64u((p64_t) 4ULL), typnull);
			i = binop(SHR, i, immed64u((p64_t) 4ULL), typ8);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
			j = binop(SHR, j, immed64u((p64_t) 4ULL), typ8);
			i = binop(GT, i, j, typ8);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);

			/* Compare odd fields */
			/* The high nibbles are compared in place, with the
			   low nibbles zeroed */
			j = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			k = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			j = binop(GT, j, k, typ8);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		} else {
			/* use 2-bit GT and EQ */
			/* In the high 2-bit half of each field build
			   (high halves equal AND low halves unsigned GT)
			   OR (high halves signed GT) */
			i = binop(GT, arg0, arg1, typ2u);
			i = binop(SHL, i, immed64u((p64_t) 2ULL), typnull);
			j = binop(EQ, arg0, arg1, typ2);
			i = binop(AND, i, j, typnull);

			j = binop(GT, arg0, arg1, typ2);
			i = binop(OR, i, j, typnull);

			/* Keep the high-half result and copy it into the
			   low half so the whole field is set or clear */
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xcc)),
				  typnull);
			j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
			return(binop(OR, i, j, typnull));
		}

	case GE: /* 4s */
		if (optcpu & CPU_MMX) {
			/* use 8-bit GE (yeah, I know it is not really there) */

			/* Compare even fields (sign-extended to 8 bits) */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(SHL, i, immed64u((p64_t) 4ULL), typnull);
			i = binop(SHR, i, immed64u((p64_t) 4ULL), typ8);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
			j = binop(SHR, j, immed64u((p64_t) 4ULL), typ8);
			i = binop(GE, i, j, typ8);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);

			/* Compare odd fields */
			j = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			k = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			j = binop(GE, j, k, typ8);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		} else {
			/* use 2-bit GT and EQ */
			/* In the high 2-bit half of each field build
			   (high halves equal AND low halves unsigned GE)
			   OR (high halves signed GT) */
			i = binop(GT, arg0, arg1, typ2u);
			j = binop(EQ, arg0, arg1, typ2);
			i = binop(OR, i, j, typnull);
			i = binop(SHL, i, immed64u((p64_t) 2ULL), typnull);
			i = binop(AND, i, j, typnull);

			j = binop(GT, arg0, arg1, typ2);
			i = binop(OR, i, j, typnull);

			/* Keep the high-half result and copy it into the
			   low half */
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xcc)),
				  typnull);
			j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
			return(binop(OR, i, j, typnull));
		}

	case SHR: /* 4s */
		/* only a few cases: shift by 0, 1, 2, 3, or too big */
		/* In each case the register is shifted as a whole, bits
		   that came from the neighboring field are masked off, and
		   the field's sign bit (0x88 mask) is replicated into the
		   vacated positions */
		arg1 = shiftconst(arg1, typ4u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:
				return(arg0);
			case 0x1ULL:
				i = binop(SHR,
					  arg0,
					  immed64u((p64_t) 1ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x77)),
					  typnull);
				/* Put the original sign bit back */
				j = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x88)),
					  typnull);
				i = binop(OR, i, j, typnull);
				return(i);
			case 0x2ULL:
				i = binop(SHR,
					  arg0,
					  immed64u((p64_t) 2ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x33)),
					  typnull);
				/* Sign bit copied into the top two bits */
				j = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x88)),
					  typnull);
				k = binop(SHR,
					  j,
					  immed64u((p64_t) 1ULL),
					  typnull);
				j = binop(OR, j, k, typnull);
				i = binop(OR, i, j, typnull);
				return(i);
			case 0x3ULL:
				i = binop(SHR,
					  arg0,
					  immed64u((p64_t) 3ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x11)),
					  typnull);
				/* Sign bit copied into the top three bits */
				j = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x88)),
					  typnull);
				k = binop(SHR,
					  j,
					  immed64u((p64_t) 1ULL),
					  typnull);
				j = binop(OR, j, k, typnull);
				k = binop(SHR,
					  j,
					  immed64u((p64_t) 1ULL),
					  typnull);
				j = binop(OR, j, k, typnull);
				i = binop(OR, i, j, typnull);
				return(i);
			default:
				/* Too big: every bit of the field becomes a
				   copy of its sign bit */
				i = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x88)),
					  typnull);
				j = binop(SHR,
					  i,
					  immed64u((p64_t) 1ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
				j = binop(SHR,
					  i,
					  immed64u((p64_t) 2ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
				return(i);
			}
		}
		/* shift by a vector is NYI */
		error("shift right of 4-bit field values only implemented for "
		      "a constant shift");
		return(immed64u((p64_t) 0ULL));

	default:
		/* Bug out if we failed to handle some operation */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop4 op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

static int
binop4ss_declsat(int val,
int decl_bits)
{
	/* Clamp every signed 4-bit field of tuple "val" to the range of a
	   decl_bits-wide two's complement value, i.e.
	   [-(1<<(decl_bits-1)), (1<<(decl_bits-1))-1], and return the
	   resulting tuple.

	   The comparisons must be signed (typ4), and the lower bound must be
	   the 4-bit two's complement encoding of the negative limit.  This
	   mirrors the 8-bit clamp used by the generic MUL path of binop4ss().
	   (Clamping with unsigned compares against 1<<(decl_bits-1) forces
	   every field to that constant.)

	   Nothing is generated when the declared precision fills the field,
	   or when decl_bits is out of range (which would otherwise yield a
	   negative or oversized shift count).
	*/
	int maxval, minval;

	if ((decl_bits < 1) || (decl_bits >= 4)) return(val);

	maxval = (1 << (decl_bits - 1)) - 1;	/* e.g. 3 bits: 0x3 (+3) */
	minval = ~maxval & 0x0f;		/* e.g. 3 bits: 0xc (-4) */

	val = binop(MIN, val, immedu(cvt1x4uto32x4u(maxval)), typ4);
	val = binop(MAX, val, immedu(cvt1x4uto32x4u(minval)), typ4);
	return(val);
}

int
binop4ss(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 4-bit signed field binary ops with saturation */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.

	   op         operation token (ADD, SUB, MUL, ...)
	   arg0,arg1  tuple indices of the operands
	   decl_bits  declared precision of the fields; ADD, SUB and MUL
		      saturate to this precision.  DIV, MOD, PACK and the
		      AltiVec MUL path still saturate to the full 4 bits
		      only (see the HEREHERE note in DIV).

	   Returns the index of the tuple holding the result.  An unhandled
	   op is reported through bug() and -1 is returned.

	   In the bit-level comments, a:b:c:d are the bits of one field of
	   arg0 and e:f:g:h the bits of the same field of arg1 (MSb first);
	   X marks a don't-care position.
	*/

	register int i, j, k, l, m;

	switch (op) {
	case LAND: /* 4ss */
	case LOR: /* 4ss */

	case AND: /* 4ss */
	case ANDN: /* 4ss */
	case XOR: /* 4ss */
	case OR: /* 4ss */

	case NE: /* 4ss */
	case EQ: /* 4ss */
		/* These are all the same as unsigned unsaturated */
		return(binop(op, arg0, arg1, typ4u));

	case GT: /* 4ss */
	case LT: /* 4ss */
	case LE: /* 4ss */
	case GE: /* 4ss */

	case MIN: /* 4ss */
	case MAX: /* 4ss */
	case AVG: /* 4ss */
		/* These are all the same as signed unsaturated */
		return(binop(op, arg0, arg1, typ4));

	case ADD: /* 4ss */
#ifdef NOTDEFD	/* K map method, keep for comparison in dissertation */
		/* NOTE: disabled reference code; it uses an undeclared
		   variable "n" and will not build if NOTDEFD is defined. */
		/* This method takes 39+ADD4 instructions */
		/* Do unsaturated add */
		i = binop(ADD, arg0, arg1, typ4);

		/* Correct for positive saturation */
		j = binop(SHL, arg0, immed64u((p64_t) 1ULL), typnull);
							/* b  : c  : d  : X   */
		j = binop(AND, arg0, j, typnull);	/* ab : bc : cd : X   */

		k = binop(SHL, arg1, immed64u((p64_t) 1ULL), typnull);
							/* f  : g  : h  : X   */
		k = binop(AND, arg1, j, typnull);	/* ef : fg : gh : X   */
		j = binop(OR, j, k, typnull);		/* X  : X : cd|gh : X */

		l = binop(AND, arg0, arg1, typnull);	/* ae : bf : cg : dh  */
		j = binop(OR, j, l, typnull);		/* X: X: cd|gh|cg : X */
		j = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
							/* X: cd|gh|cg : X: X */

		m = binop(XOR, arg0, arg1, typnull);	/* a^e: b^f: c^g: d^h */
		j = binop(AND, j, m, typnull);	/* X: b^f(cd|gh|cg):      X:X */
		j = binop(OR, j, l, typnull);	/* X: bf|(b^f(cd|gh|cg)): X:X */
		j = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
						/* bf|(b^f(cd|gh|cg)):  X:X:X */

		n = binop(OR, arg0, arg1, typnull);	/* a|e: b|f: c|g: d|h */

		n = unop(NOT, n, typnull);     /* ~(a|e):~(b|f):~(c|g):~(d|h) */
		j = binop(AND, j, n, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x88)), typnull);
		/* j has pattern T000 */

		/* Apply the mask to the MSb */
		k = unop(NOT, j, typnull);
		i = binop(AND, i, k, typnull);

		/* Convert j's T000 pattern to a 0TTT pattern */
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull); /* 0T00 */
		k = binop(OR, j, k, typnull);			   /* TT00 */
		k = binop(SHR, k, immed64u((p64_t) 1ULL), typnull); /* 0TT0 */
		k = binop(OR, j, k, typnull);			   /* TTT0 */
		k = binop(SHR, k, immed64u((p64_t) 1ULL), typnull); /* 0TTT */

		/* Apply the mask to the LSbs */
		i = binop(OR, i, k, typnull);


		/* Correct for negative saturation */
		k = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* bf : cg : dh : X  */
		k = unop(NOT, k, typnull);	/* ~(bf) : ~(cg) : ~(dh) : X */
		k = binop(AND, k, m, typnull);	/* X : (b^f)(~(cg)) : X : X */
		k = binop(OR, k, n, typnull);	/* X:~(b|f)|(b^f)(~(cg)):X:X */
		k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
						/* ~(b|f)|(b^f)(~(cg)):X:X:X */
		k = binop(AND, k, l, typnull);	/* ae[~(b|f)|(b^f)(~(cg))]... */
		k = binop(AND, k, immedu(cvt1x8uto16x8u(0x88)), typnull);
		/* k has pattern T000 */

		/* Apply the mask to the MSb */
		i = binop(OR, i, k, typnull);

		/* Convert k's T000 pattern to a 1FFF pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull); /* 0T00 */
		l = binop(OR, k, l, typnull);			   /* TT00 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull); /* 0TT0 */
		l = binop(OR, k, l, typnull);			   /* TTT0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull); /* 0TTT */
		l = unop(NOT, l, typnull);			   /* 1FFF */

		/* Apply the mask to the LSbs */
		i = binop(AND, i, l, typnull);

		return(i);
#else
		/* This method takes 22+ADD4 instructions */
		/* Do the signed add */
		i = binop(ADD, arg0, arg1, typ4);

		/* Correct for positive saturation */
		j = binop(OR, arg0, arg1, typnull);
		j = unop(NOT, j, typnull);		/* tX..X if both pos */
		j = binop(AND, j, i, typnull);		/* MSb(sum) = 1 */
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x88)), typnull);
		/* j has pattern T000: T marks a positive overflow */

		/* Smear the flag from the MSb over the whole field */
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		l = binop(OR, k, j, typnull);
		k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
		l = binop(OR, k, l, typnull);			/* T...T */

		/* Clobber the calculated value with PosSat (0111) in the
		   fields where a positive overflow was detected */
		i = binop(OR, i, l, typnull);			/* T...T */
		j = unop(NOT, j, typnull);			/* f1..1 */
		i = binop(AND, i, j, typnull);


		/* Correct for negative saturation */
		j = binop(AND, arg0, arg1, typnull);	/* tX..X if both neg */
		k = unop(NOT, i, typnull);
		j = binop(AND, j, k, typnull);		/* MSb(sum) = 0 */
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x88)), typnull);
		/* j has pattern T000: T marks a negative overflow */

		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		l = binop(OR, k, j, typnull);
		k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
		l = binop(OR, k, l, typnull);			/* T...T */

		/* Clobber the calculated value with NegSat (1000) in the
		   fields where a negative overflow was detected */
		l = unop(NOT, l, typnull);			/* F...F */
		i = binop(AND, i, l, typnull);
		i = binop(OR, i, j, typnull);

		/* Saturate to the declared precision */
		return(binop4ss_declsat(i, decl_bits));
#endif

	case SUB: /* 4ss */
#ifdef NOTDEFD
		/* NOTE: disabled reference code; it uses an undeclared
		   variable "n" and will not build if NOTDEFD is defined. */
		/* K-map method - keep for dissertation */
		/* This method takes 38+SUB4 instructions */
		i = binop(SUB, arg0, arg1, typ4);

		/* Correct for positive saturation */
		j = binop(ANDN, arg0, arg1, typnull);	/* ~ae: ~bf: ~cg: ~dh */
		k = binop(ANDN, arg1, arg0, typnull);	/* a~e: b~f: c~g: d~h */
		l = binop(XOR, arg0, arg1, typnull);	/* a^e: b^f: c^g: d^h */
		m = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
							/* ~bf: ~cg: ~dh: X   */
		m = binop(OR, l, m, typnull);		/* X: b^f|~cg: X: X   */
		m = unop(NOT, m, typnull);		/* X: ~(b^f|~cg): X:X */
		m = binop(OR, k, m, typnull);	/* X: (b~f)|~(b^f|~cg): X:X */
		m = binop(SHL, m, immed64u((p64_t) 1ULL), typnull);
						/* (b~f)|~(b^f|~cg): X:X:X */
		m = binop(AND, j, m, typnull);	/* ~ae[(b~f)|~(b^f|~cg)]:... */
		m = binop(AND, m, immedu(cvt1x8uto16x8u(0x88)), typnull);
		/* m has pattern T000 */

		/* Apply the mask to the MSb */
		n = unop(NOT, m, typnull);
		i = binop(AND, i, n, typnull);

		/* Convert m's T000 pattern to a 0TTT pattern */
		n = binop(SHR, m, immed64u((p64_t) 1ULL), typnull); /* 0T00 */
		n = binop(OR, m, n, typnull);			   /* TT00 */
		n = binop(SHR, n, immed64u((p64_t) 1ULL), typnull); /* 0TT0 */
		n = binop(OR, m, n, typnull);			   /* TTT0 */
		n = binop(SHR, n, immed64u((p64_t) 1ULL), typnull); /* 0TTT */

		/* Apply the mask to the LSbs */
		i = binop(OR, i, n, typnull);


		/* Correct for negative saturation */
		m = binop(SHL, arg0, immed64u((p64_t) 1ULL), typnull);
							/* bcdX */
		m = binop(OR, arg0, m, typnull);	/* a|b:b|c:c|d:X */
		m = unop(NOT, m, typnull);		/* ~a~b:~b~c:~c~d:X */
		n = binop(SHL, arg1, immed64u((p64_t) 1ULL), typnull);
							/* fghX */
		n = binop(AND, arg1, n, typnull);	/* ef:fg:gh:X */
		m = binop(OR, m, n, typnull);		/* X:X:(~c~d)gh:X */
		m = binop(OR, m, j, typnull);	/* X:X:(~c~d)gh(~cg):X */
		m = binop(SHL, m, immed64u((p64_t) 1ULL), typnull);
						/* X:(~c~d)gh(~cg):X:X */
		m = binop(ANDN, l, m, typnull);	/* X:(~(b^f))(~c~d)gh(~cg):.. */
		m = binop(OR, m, j, typnull);
		m = binop(SHL, m, immed64u((p64_t) 1ULL), typnull);
		k = binop(AND, m, k, typnull);
		/* k has pattern T000 */

		/* Apply the mask to the MSb */
		i = binop(OR, i, k, typnull);

		/* Convert k's T000 pattern to a 1FFF pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull); /* 0T00 */
		l = binop(OR, k, l, typnull);			   /* TT00 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull); /* 0TT0 */
		l = binop(OR, k, l, typnull);			   /* TTT0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull); /* 0TTT */
		l = unop(NOT, l, typnull);			   /* 1FFF */

		/* Apply the mask to the LSbs */
		i = binop(AND, i, l, typnull);

		return(i);
#else
		/* This method takes 22+SUB4 instructions */
		/* Do the signed sub */
		i = binop(SUB, arg0, arg1, typ4);

		/* Correct for positive saturation */
		m = binop(XOR, arg0, arg1, typnull);	/* tX..X if mixed */
		j = binop(AND, m, arg1, typnull); /* tX..X if arg0+ & arg1- */
		j = binop(AND, j, i, typnull);		/* MSb(diff) = 1 */
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x88)), typnull);
		/* j has pattern T000: T marks a positive overflow */

		/* Smear the flag from the MSb over the whole field */
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		l = binop(OR, k, j, typnull);
		k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
		l = binop(OR, k, l, typnull);			/* T...T */

		/* Clobber the calculated value with PosSat (0111) in the
		   fields where a positive overflow was detected */
		i = binop(OR, i, l, typnull);			/* T...T */
		j = unop(NOT, j, typnull);			/* f1..1 */
		i = binop(AND, i, j, typnull);


		/* Correct for negative saturation */
		j = binop(AND, m, arg0, typnull); /* tX..X if arg0- & arg1+ */
		k = unop(NOT, i, typnull);
		j = binop(AND, j, k, typnull);		/* MSb(diff) = 0 */
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x88)), typnull);
		/* j has pattern T000: T marks a negative overflow */

		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		l = binop(OR, k, j, typnull);
		k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
		l = binop(OR, k, l, typnull);			/* T...T */

		/* Clobber the calculated value with NegSat (1000) in the
		   fields where a negative overflow was detected */
		l = unop(NOT, l, typnull);			/* F...F */
		i = binop(AND, i, l, typnull);
		i = binop(OR, i, j, typnull);

		/* Saturate to the declared precision */
		return(binop4ss_declsat(i, decl_bits));
#endif
	case MUL: /* 4ss */
#ifdef NOTDEFD
/* This needs to be fixed if used */
			/* use 2-bit multiplies for chain down */
			i = binop(MUL, arg0, arg1, typ2u);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x33)),
				  typnull);

			j = binop(SHL, arg0, immed64u((p64_t) 2ULL), typ4u);
			j = binop(MUL, j, arg1, typ2u);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0xcc)),
				  typnull);

			k = binop(SHL, arg1, immed64u((p64_t) 2ULL), typ4u);
			k = binop(MUL, arg0, k, typ2u);
			k = binop(AND,
				  k,
				  immedu(cvt1x8uto16x8u(0xcc)),
				  typnull);
			j = binop(ADD, j, k, typ2u);

			k = binop(AND, arg0, arg1, typnull);
			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND,
				  k,
				  immedu(cvt1x8uto16x8u(0x44)),
				  typnull);
			j = binop(ADD, j, k, typ2u);

			return(binop(OR, i, j, typnull));
#endif
		if (optcpu == GenericIA32) {
			/* Serialize: for each of the 8 fields, move it to the
			   top of the word, sign extend it with a 32-bit
			   arithmetic shift, multiply and clamp as a 32-bit
			   value, then move the low 4 bits of the result back
			   into the field's position. */
			unsigned long long element;
			i = immed64u((p64_t)0ULL);
			for (element=0ULL; element<8ULL; ++element) {
				j = binop(SHL,
					  arg0,
					  immed64u((p64_t)(4ULL*element)),
					  typnull);
				j = binop(SHR,
					  j,
					  immed64u((p64_t)28ULL),
					  typ32);
				k = binop(SHL,
					  arg1,
					  immed64u((p64_t)(4ULL*element)),
					  typnull);
				k = binop(SHR,
					  k,
					  immed64u((p64_t)28ULL),
					  typ32);
				j = binop(MUL, j, k, typ32);
				/* Clamp the product to [-8, 7] */
				j = binop(MIN,
					  j,
					  immed64u((p64_t)0x7ULL),
					  typ32);
				j = binop(MAX,
					j,
					immed32u((p32_t)0xfffffff8U),
					typ32);
				j = binop(SHL,
					  j,
					  immed64u((p64_t)28ULL),
					  typnull);
				j = binop(SHR,
					  j,
					  immed64u((p64_t)(4ULL*element)),
					  typnull);
				i = binop(OR, i, j, typnull);
			}
			/* Saturate to the declared precision */
			return(binop4ss_declsat(i, decl_bits));

		} else if (optcpu & CPU_AltiVec) {
			/* AltiVec has even and odd 8u->16u multiplies */
			/* NOTE: this path saturates to the full 4-bit range
			   only; decl_bits is not applied here. */
			int n, o, p;

			/* Do evens */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(SHL,
				  i,
				  immedu(cvt1x8uto16x8u(0x04)),
				  typ8u);
			i = binop(SHR,
				  i,
				  immedu(cvt1x8uto16x8u(0x04)),
				  typ8);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(SHL,
				  j,
				  immedu(cvt1x8uto16x8u(0x04)),
				  typ8u);
			j = binop(SHR,
				  j,
				  immedu(cvt1x8uto16x8u(0x04)),
				  typ8);
			k = binop(MULEVEN, i, j, typ8);
			k = binop(MIN,
				  k,
				  immedu(cvt1x16uto8x16u(0x0007)),
				  typ16);
			k = binop(MAX,
				  k,
				  immedu(cvt1x16uto8x16u(0xfff8)),
				  typ16);
			k = binop(AND,
				  k,
				  immedu(cvt1x16uto8x16u(0x000f)),
				  typnull);
			l = binop(MULODD, i, j, typ8);
			l = binop(MIN,
				  l,
				  immedu(cvt1x16uto8x16u(0x0007)),
				  typ16);
			l = binop(MAX,
				  l,
				  immedu(cvt1x16uto8x16u(0xfff8)),
				  typ16);
			l = binop(AND,
				  l,
				  immedu(cvt1x16uto8x16u(0x000f)),
				  typnull);
			l = binop(SHL, l, immed64u((p64_t) 8ULL), typnull);
			k = binop(OR, k, l, typnull);

			/* Do odds */
			m = binop(SHR,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x04)),
				  typ8);
			n = binop(SHR,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x04)),
				  typ8);
			o = binop(MULEVEN, m, n, typ8);
			o = binop(MIN,
				  o,
				  immedu(cvt1x16uto8x16u(0x0007)),
				  typ16);
			o = binop(MAX,
				  o,
				  immedu(cvt1x16uto8x16u(0xfff8)),
				  typ16);
			o = binop(AND,
				  o,
				  immedu(cvt1x16uto8x16u(0x000f)),
				  typnull);
			p = binop(MULODD, m, n, typ8);
			p = binop(MIN,
				  p,
				  immedu(cvt1x16uto8x16u(0x0007)),
				  typ16);
			p = binop(MAX,
				  p,
				  immedu(cvt1x16uto8x16u(0xfff8)),
				  typ16);
			p = binop(AND,
				  p,
				  immedu(cvt1x16uto8x16u(0x000f)),
				  typnull);
			p = binop(SHL, p, immed64u((p64_t) 8ULL), typnull);
			o = binop(OR, o, p, typnull);
			o = binop(SHL, o, immed64u((p64_t) 4ULL), typnull);

			return(binop(OR, k, o, typnull));

		} else {
			/* Multiply and saturate even fields using 8 bit ops */
			/* Isolate the low nibble of each byte and sign extend
			   it to 8 bits (shift up unsigned, back down signed) */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			i = binop(SHL, i, immed64u((p64_t) 4ULL), typ8u);
			i = binop(SHR, i, immed64u((p64_t) 4ULL), typ8);

			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);
			j = binop(SHL, j, immed64u((p64_t) 4ULL), typ8u);
			j = binop(SHR, j, immed64u((p64_t) 4ULL), typ8);

			/* Clamp the 8-bit product directly to the declared
			   precision: [-(1<<(decl_bits-1)),
				       (1<<(decl_bits-1))-1] */
			i = binop(MUL, i, j, typ8);
			i = binop(MIN,
				  i,
				  immedu(cvt1x8uto16x8u((1<<(decl_bits-1))-1)),
				  typ8);
			i = binop(MAX,
				  i,
				  immedu(cvt1x8uto16x8u(~((1<<(decl_bits-1))-1)
							& 0xff)),
				  typ8);
			k = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x0f)),
				  typnull);


			/* Multiply and saturate odd fields using 8 bit ops */
			/* The signed 8-bit shift right sign extends the high
			   nibble of each byte */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			i = binop(SHR, i, immed64u((p64_t) 4ULL), typ8);

			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			j = binop(SHR, j, immed64u((p64_t) 4ULL), typ8);

			i = binop(MUL, i, j, typ8);
			i = binop(MIN,
				  i,
				  immedu(cvt1x8uto16x8u((1<<(decl_bits-1))-1)),
				  typ8);
			i = binop(MAX,
				  i,
				  immedu(cvt1x8uto16x8u(~((1<<(decl_bits-1))-1)
							& 0xff)),
				  typ8);
			i = binop(SHL, i, immed64u((p64_t) 4ULL), typ8u);

			i = binop(OR, i, k, typnull);
			return (i);
		}

	case DIV: /* 4ss */
		/* Modify 4-bit divides */
		/* Start from the unsaturated signed quotient, then force
		   PosSat (0111) where arg0 >= 0 and arg1 == 0, or where
		   arg0 == -8 and arg1 == -1, and force NegSat (1000) where
		   arg0 < 0 and arg1 == 0. */
		i = binop(DIV, arg0, arg1, typ4);


/* HEREHERE - Left off here converting binop*s() to use declared fieldsize. */
		/* Correct for positive saturation */
		/* Get ~e~f~g~h */
		j = unop(NOT, arg1, typnull);		/* ~e:~f:~g:~h */
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
							/* ~f:~g:~h:X */
		j = binop(AND, j, k, typnull);		/* ~e~f:~f~g:~g~h:X */
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
							/* ~g~h : ... */
		j = binop(AND, j, k, typnull);		/* ~e~f~g~h : ... */


		/* Get efgh */
		k = binop(SHL, arg1, immed64u((p64_t) 1ULL), typnull);
							/* f:g:h:X */
		k = binop(AND, arg1, k, typnull);	/* ef:fg:gh:X */
		l = binop(SHL, k, immed64u((p64_t) 2ULL), typnull);
							/* gh : ... */
		k = binop(AND, k, l, typnull);		/* efgh : ... */


		/* Get a~b~c~d */
		l = unop(NOT, arg0, typnull);		/* ~a:~b:~c:~d */
		m = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* ~b:~c:~d:X */
		l = binop(AND, l, m, typnull);		/* ~a~b:~b~c:~c~d:X */
		m = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* ~b~c:~c~d:X:X */
		l = binop(AND, l, m, typnull);		/* X:~b~c~d:X:X */
		l = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* ~b~c~d:X:X:X */
		l = binop(AND, l, arg0, typnull);	/* a~b~c~d:X:X:X */


		l = binop(AND, k, l, typnull);	/* a~b~c~defgh:X:X:X */


		/* ~a~e~f~g~h : ... */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			k = binop(ANDN, arg0, j, typnull);
		} else {
			k = unop(NOT, arg0, typnull);
			k = binop(AND, k, j, typnull);
		}
		k = binop(OR, l, k, typnull); /* ~a~e~f~g~h | a~b~c~defgh */
		k = binop(AND, k, immedu(cvt1x8uto16x8u(0x88)), typnull);
		/* k has pattern T000 */

		/* Apply the mask to the MSb */
		l = unop(NOT, k, typnull);
		i = binop(AND, i, l, typnull);

		/* Convert k's T000 pattern to a 0TTT pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull); /* 0T00 */
		l = binop(OR, k, l, typnull);			   /* TT00 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull); /* 0TT0 */
		l = binop(OR, k, l, typnull);			   /* TTT0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull); /* 0TTT */

		/* Apply the mask to the LSbs */
		i = binop(OR, i, l, typnull);


		/* Correct for negative saturation */
		k = binop(AND, arg0, j, typnull);	/* a~e~f~g~h : ... */
		k = binop(AND, k, immedu(cvt1x8uto16x8u(0x88)), typnull);
		/* k has pattern T000 */

		/* Apply the mask to the MSb */
		i = binop(OR, i, k, typnull);

		/* Convert k's T000 pattern to a 1FFF pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull); /* 0T00 */
		l = binop(OR, k, l, typnull);			   /* TT00 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull); /* 0TT0 */
		l = binop(OR, k, l, typnull);			   /* TTT0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull); /* 0TTT */
		l = unop(NOT, l, typnull);			   /* 1FFF */

		/* Apply the mask to the LSbs */
		return(binop(AND, i, l, typnull));


	case MOD: /* 4ss */
		/* Modify 4-bit modulus */
		/* Start from the unsaturated signed modulus, then force
		   PosSat (0111) where arg0 >= 0 and arg1 == 0, and NegSat
		   (1000) where arg0 < 0 and arg1 == 0. */
		i = binop(MOD, arg0, arg1, typ4);


		/* Get ~e~f~g~h */
		j = unop(NOT, arg1, typnull);		/* ~e:~f:~g:~h */
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
							/* ~f:~g:~h:X */
		j = binop(AND, j, k, typnull);		/* ~e~f:~f~g:~g~h:X */
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
							/* ~g~h : ... */
		j = binop(AND, j, k, typnull);		/* ~e~f~g~h : ... */


		/* Correct for positive saturation */
		/* ~a~e~f~g~h : ... */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			k = binop(ANDN, arg0, j, typnull);
		} else {
			k = unop(NOT, arg0, typnull);
			k = binop(AND, k, j, typnull);
		}
		k = binop(AND, k, immedu(cvt1x8uto16x8u(0x88)), typnull);
		/* k has pattern T000 */

		/* Apply the mask to the MSb */
		l = unop(NOT, k, typnull);
		i = binop(AND, i, l, typnull);

		/* Convert k's T000 pattern to a 0TTT pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull); /* 0T00 */
		l = binop(OR, k, l, typnull);			   /* TT00 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull); /* 0TT0 */
		l = binop(OR, k, l, typnull);			   /* TTT0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull); /* 0TTT */

		/* Apply the mask to the LSbs */
		i = binop(OR, i, l, typnull);


		/* Correct for negative saturation */
		k = binop(AND, arg0, j, typnull);	/* a~e~f~g~h : ... */
		k = binop(AND, k, immedu(cvt1x8uto16x8u(0x88)), typnull);
		/* k has pattern T000 */

		/* Apply the mask to the MSb */
		i = binop(OR, i, k, typnull);

		/* Convert k's T000 pattern to a 1FFF pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull); /* 0T00 */
		l = binop(OR, k, l, typnull);			   /* TT00 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull); /* 0TT0 */
		l = binop(OR, k, l, typnull);			   /* TTT0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull); /* 0TTT */
		l = unop(NOT, l, typnull);			   /* 1FFF */

		/* Apply the mask to the LSbs */
		return (binop(AND, i, l, typnull));

	case SHL: /* 4ss */
		/* Not true? */
		/* These are all the same as unsigned unsaturated */
		return(binop(op, arg0, arg1, typ4u));

	case SHR: /* 4ss */
		/* Arithmetic (sign preserving) shift right of each field */
		/* only a few cases: shift by 0, 1, 2, 3, or too big */
		arg1 = shiftconst(arg1, typ4u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy: shift the whole word,
			   mask off the bits that crossed in from the
			   neighboring field, and OR in copies of the sign
			   bit (0x88 mask) for the vacated positions. */
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:
				return(arg0);
			case 0x1ULL:
				i = binop(SHR,
					  arg0,
					  immed64u((p64_t) 1ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x77)),
					  typnull);
				j = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x88)),
					  typnull);
				i = binop(OR, i, j, typnull);
				return(i);
			case 0x2ULL:
				i = binop(SHR,
					  arg0,
					  immed64u((p64_t) 2ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x33)),
					  typnull);
				j = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x88)),
					  typnull);
				k = binop(SHR,
					  j,
					  immed64u((p64_t) 1ULL),
					  typnull);
				j = binop(OR, j, k, typnull);
				i = binop(OR, i, j, typnull);
				return(i);
			case 0x3ULL:
				i = binop(SHR,
					  arg0,
					  immed64u((p64_t) 3ULL),
					  typnull);
				i = binop(AND,
					  i,
					  immedu(cvt1x8uto16x8u(0x11)),
					  typnull);
				j = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x88)),
					  typnull);
				k = binop(SHR,
					  j,
					  immed64u((p64_t) 1ULL),
					  typnull);
				j = binop(OR, j, k, typnull);
				k = binop(SHR,
					  j,
					  immed64u((p64_t) 1ULL),
					  typnull);
				j = binop(OR, j, k, typnull);
				i = binop(OR, i, j, typnull);
				return(i);
			default:
				/* Shift of 4 or more: every bit of the field
				   becomes a copy of the sign bit */
				i = binop(AND,
					  arg0,
					  immedu(cvt1x8uto16x8u(0x88)),
					  typnull);
				j = binop(SHR,
					  i,
					  immed64u((p64_t) 1ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
				j = binop(SHR,
					  i,
					  immed64u((p64_t) 2ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
				return(i);
			}
		}
		/* shift by a vector is NYI */
		error("shift right of 4-bit field values only implemented for "
		      "a constant shift");
		return(immed64u((p64_t) 0ULL));

	case INTRLVLOW: /* 4ss */
	case INTRLVHIGH: /* 4ss */
		/* 2-bit to 4-bit interleave of 2-bit fields */
		/* These are the same as unsigned */
		return(binop(op, arg0, arg1, typ4u));

	case PACK: /* 4ss */
		/* 8ss -> 4ss */
		/* PACK (with saturation) each 8-bit field value to
		   4 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		/* Saturate arg0 to [-8, 7] */
		i = binop(MIN, arg0, immedu(cvt1x16uto8x16u(0x0707)), typ8);
		i = binop(MAX, i, immedu(cvt1x16uto8x16u(0xf8f8)), typ8);

		/* Saturate arg1 to [-8, 7] */
		j = binop(MIN, arg1, immedu(cvt1x16uto8x16u(0x0707)), typ8);
		j = binop(MAX, j, immedu(cvt1x16uto8x16u(0xf8f8)), typ8);

		/* Pack as signed */
		return(binop(PACK, i, j, typ4));

	default:
		/* Bug out if we failed to handle some operation */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop4ss op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}


int
binop8u(int op,
int arg0,
int arg1)
{
	/* 8-bit unsigned field binary ops */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i=-1, j, k, l;

	switch (op) {
	case ADD: /* 8u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* use implicit spacer technique */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x7f)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x7f)),
				  typnull);
			i = binop(ADD, i, j, typ16u);
			j = binop(XOR, arg0, arg1, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);
			return(binop(XOR, i, j, typnull));
		}

	case SUB: /* 8u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* use implicit spacer technique */
			i = binop(OR,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x7f)),
				  typnull);
			i = binop(SUB, i, j, typ16u);
			j = binop(XOR, arg0, arg1, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);
			j = binop(XOR,
				  j,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);
			return(binop(XOR, i, j, typnull));
		}

	case MUL: /* 8u */
		if (optcpu & CPU_MMX) {
			/* Only have 16-bit muls...
			   unsigned interleave, mul, cast via modulation,
			   pack without saturation sequence
			*/
			i = binop(INTRLVLOW,
				  arg0,
				  immed64u((p64_t) 0ULL),
				  typ16u);
			j = binop(INTRLVLOW,
				  arg1,
				  immed64u((p64_t) 0ULL),
				  typ16u);
			k = binop(MUL, i, j, typ16u);
			k = binop(AND,
				  k,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);

			i = binop(INTRLVHIGH,
				  arg0,
				  immed64u((p64_t) 0ULL),
				  typ16u);
			j = binop(INTRLVHIGH,
				  arg1,
				  immed64u((p64_t) 0ULL),
				  typ16u);
			i = binop(MUL, i, j, typ16u);
			i = binop(AND,
				  i,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);

			return(binop(PACK, k, i, typ8u));
		} else if (optcpu & CPU_AltiVec) {
			/* Have even and odd 8u->16u muls */
			i = binop(MULEVEN, arg0, arg1, typ8u);
			i = binop(AND,
				  i,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);

			j = binop(MULODD, arg0, arg1, typ8u);
			j = binop(SHL,
				  j,
				  immedu(cvt1x16uto8x16u(0x0008)),
				  typ16u);
			return(binop(OR, i, j, typnull));
		} else {
#ifdef NOTDEFD
			/* Use 4-bit unsigned MULs */
			/* Clearing the odd fields of arg0 will save us one
			   masking operation. */
			i = binop(AND,
				arg0,
				immedu(cvt1x8uto16x8u(0x0f)),
				typnull);
			j = binop(MUL, i, arg1, typ4u);
			k = binop(MULH, i, arg1, typ4u);
			k = binop(SHL, k, immed64u((p64_t) 4ULL), typnull);

			i = binop(SHL, i, immed64u((p64_t) 4ULL), typnull);
			i = binop(MUL, i, arg1, typ4u);
			k = binop(ADD, k, i, typ4u);

			/* Clearing the odd fields of arg1 will save us one
			   masking operation. */
			i = binop(AND,
				arg1,
				immedu(cvt1x8uto16x8u(0x0f)),
				typnull);
			i = binop(SHL, i, immed64u((p64_t) 4ULL), typnull);
			i = binop(MUL, arg0, i, typ4u);
			k = binop(ADD, k, i, typ4u);

			return(binop(OR, k, j, typnull));
#else
			unsigned long long step;

			/* Perform a shift-add sequence */
			i = immed64u((p64_t) 0ULL);
			for (step=0ULL; step<8ULL; ++step)
			{
				j = binop(AND,
					  arg1,
					  immedu(cvt1x8uto16x8u(0x01<<step)),
					  typnull);
				k = binop(SHR,
					  j,
					  immed64u((p64_t) 4ULL),
					  typnull);
				j = binop(OR, k, j, typnull);
				k = binop(SHR,
					  j,
					  immed64u((p64_t) 2ULL),
					  typnull);
				j = binop(OR, k, j, typnull);
				k = binop(SHR,
					  j,
					  immed64u((p64_t) 1ULL),
					  typnull);
				j = binop(OR, k, j, typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x8uto16x8u(0x01)),
					  typnull);

				k = binop(SHL,
					  j,
					  immed64u((p64_t) 1ULL),
					  typnull);
				j = binop(OR, k, j, typnull);
				k = binop(SHL,
					  j,
					  immed64u((p64_t) 2ULL),
					  typnull);
				j = binop(OR, k, j, typnull);
				k = binop(SHL,
					  j,
					  immed64u((p64_t) 4ULL),
					  typnull);
				j = binop(OR, k, j, typnull);

				j = binop(AND, arg0, j, typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) step),
					  typ8u);
				i = binop(ADD, i, j, typ8u);
			}
			return(i);
#endif
		}

	case MULEVEN: /* 8u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			bug("MULEVEN8u not supported on this target");
		}

	case MULODD: /* 8u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			bug("MULODD8u not supported on this target");
		}

	case DIV: /* 8u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* use 16-bit divides */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);
			i = binop(DIV, i, j, typ16u);
			j = binop(SHR, arg0, immed64u((p64_t) 8ULL), typ16u);
			k = binop(SHR, arg1, immed64u((p64_t) 8ULL), typ16u);
			j = binop(DIV, j, k, typ16u);
			j = binop(SHL, j, immed64u((p64_t) 8ULL), typ16u);
			return(binop(OR, i, j, typnull));
		}

	case MOD: /* 8u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* use 16-bit modulus */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);
			i = binop(MOD, i, j, typ16u);
			j = binop(SHR, arg0, immed64u((p64_t) 8ULL), typ16u);
			k = binop(SHR, arg1, immed64u((p64_t) 8ULL), typ16u);
			j = binop(MOD, j, k, typ16u);
			j = binop(SHL, j, immed64u((p64_t) 8ULL), typ16u);
			return(binop(OR, i, j, typnull));
		}

	case AVG: /* 8u */
		if (optcpu & (CPU_3DNow|CPU_XMMX|CPU_athlon|CPU_AltiVec)) {
			break;
		}

		/* Average rounds up (just like 3DNow! instruction...) */
		i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typnull);
		i = binop(AND, i, immedu(cvt1x8uto16x8u(0x7f)), typnull);
		j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x7f)), typnull);
		i = binop(ADD, i, j, typ8u);
		j = binop(OR, arg0, arg1, typnull);
		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x01)), typnull);

		/* Use the next largest HW supported add */
		if (optcpu & CPU_MMX) {
			i = binop(ADD, i, j, typ32u);
		} else {
			i = binop(ADD, i, j, typ32u);
		}
		return(i);

	case MIN: /* 8u */
		if ((optcpu & CPU_athlon) || (optcpu & CPU_AltiVec)) {
			break;
		} else if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			/* use GT */
			i = binop(GT, arg0, arg1, typ8u);
			j = binop(AND, i, arg1, typnull);
			i = binop(ANDN, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* use GT */
			i = binop(GT, arg0, arg1, typ8u);
			j = binop(AND, i, arg1, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		}

	case MAX: /* 8u */
		/* use GT */
		if ((optcpu & CPU_athlon) || (optcpu & CPU_AltiVec)) {
			break;
		} else if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(GT, arg0, arg1, typ8u);
			j = binop(AND, i, arg0, typnull);
			i = binop(ANDN, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			i = binop(GT, arg0, arg1, typ8u);
			j = binop(AND, i, arg0, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		}

	case LAND: /* 8u */
		/* use 8-bit NE 0 to normalize fields before AND */
		i = binop(NE, arg0, immed64u((p64_t) 0x0ULL), typ8u);
		j = binop(NE, arg1, immed64u((p64_t) 0x0ULL), typ8u);
		return(binop(AND, i, j, typnull));

	case LOR: /* 8u */
		/* use 8-bit NE 0 to normalize fields after ORing */
		i = binop(OR, arg0, arg1, typnull);
		return(binop(NE, i, immed64u((p64_t) 0ULL), typ8u));

	case AND: /* 8u */
		return(binop(AND, arg0, arg1, typnull));

	case ANDN: /* 8u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			return(binop(ANDN, arg0, arg1, typnull));
		} else {
			i = unop(NOT, arg0, typnull);
			return(binop(AND, i, arg1, typnull));
		}

	case OR: /* 8u */
		return(binop(OR, arg0, arg1, typnull));

	case XOR: /* 8u */
		return(binop(XOR, arg0, arg1, typnull));

	case EQ: /* 8u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else if (optcpu & CPU_MAX) {
			/* use 4-bit EQ */
			i = binop(EQ, arg0, arg1, typ4u);
			j = binop(SHL, i, immed64u((p64_t) 4ULL), typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0xf0f0)),
				  typnull);
			i = binop(AND, i, j, typnull);
			j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
			return(binop(OR, i, j, typnull));
		} else if (optcpu == GenericIA32) {
			return(unop(NOT,
				    binop(NE, arg0, arg1, typ8u),
				    typnull));
		} else {
			/* use 16-bit EQ */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);
			i = binop(EQ, i, j, typ16u);
			i = binop(AND,
				  i,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);
			j = binop(AND,
				  arg0,
				  immedu(cvt1x16uto8x16u(0xff00)),
				  typnull);
			k = binop(AND,
				  arg1,
				  immedu(cvt1x16uto8x16u(0xff00)),
				  typnull);
			j = binop(EQ, j, k, typ16u);
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0xff00)),
				  typnull);
			return(binop(OR, i, j, typnull));
		}

	case NE: /* 8u */
		if (optcpu == GenericIA32) {
			/* use XORs */
			i = binop(XOR, arg0, arg1, typnull);
			j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0x01)),
				  typnull);
			j = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHL, i, immed64u((p64_t) 2ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHL, i, immed64u((p64_t) 4ULL), typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* not EQ */
			return(unop(NOT,
				    binop(EQ, arg0, arg1, typ8u),
				    typnull));
		}

	case GT: /* 8u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else if (optcpu & CPU_MMX) {
			/* Add offset and do signed GT */
			i = binop(ADD,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typ8u);
			j = binop(ADD,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typ8u);
			return(binop(GT, i, j, typ8));
		} else if (optcpu == GenericIA32) {
			/* use 4-bit GT and EQ */
			i = binop(GT, arg0, arg1, typ4u);
			j = binop(EQ, arg0, arg1, typ4u);
			k = binop(SHL, i, immed64u((p64_t) 4ULL), typnull);
			j = binop(AND, j, k, typnull);
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* use 16-bit GT */

			/* Compare even fields */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);
			i = binop(GT, i, j, typ16u);
			i = binop(AND,
				  i,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);

			/* Compare odd fields */
			j = binop(AND,
				  arg0,
				  immedu(cvt1x16uto8x16u(0xff00)),
				  typnull);
			k = binop(AND,
				  arg1,
				  immedu(cvt1x16uto8x16u(0xff00)),
				  typnull);
			j = binop(GT, j, k, typ16u);
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0xff00)),
				  typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		}

	case GE: /* 8u */
		/* the obvious hack is that x GE y is (x EQ y) OR (x GT y) */
		i = binop(GT, arg0, arg1, typ8u);
		j = binop(EQ, arg0, arg1, typ8u);
		return(binop(OR, i, j, typnull));

	case LT: /* 8u */
		return(binop(GT, arg1, arg0, typ8u));

	case LE: /* 8u */
		return(binop(GE, arg1, arg0, typ8u));

	case SHL: /* 8u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			/* MMX does not directly do 8-bit shifts... */
			arg1 = shiftconst(arg1, typ8u);
			if (tup[arg1].op == NUM) {
				/* Shift by a constant is easy */
				i =binop(SHL,
					 arg0,
					 immed64u((p64_t)tup[arg1].immed.uq[0]),
					 typnull);
				switch (tup[arg1].immed.q[0]) {
				case 0x0ULL: i = arg0; break;
				case 0x1ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0xfe)),
						  typnull);
					break;
				case 0x2ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0xfc)),
						  typnull);
					break;
				case 0x3ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0xf8)),
						  typnull);
					break;
				case 0x4ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0xf0)),
						  typnull);
					break;
				case 0x5ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0xe0)),
						  typnull);
					break;
				case 0x6ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0xc0)),
						  typnull);
					break;
				case 0x7ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x80)),
						  typnull);
					break;
				default: i = immed64u((p64_t) 0ULL);
				}
				return(i);
			}
			error("shift left of unsigned 8-bit field values only "
			      "implemented for a constant shift");
			return(immed64u((p64_t) 0ULL));
		}

	case SHR: /* 8u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			/* MMX does not directly do 8-bit shifts... */
			arg1 = shiftconst(arg1, typ8u);
			if (tup[arg1].op == NUM) {
				/* Shift by a constant is easy */
				i =binop(SHR,
					 arg0,
					 immed64u((p64_t)tup[arg1].immed.uq[0]),
					 typnull);
				switch (tup[arg1].immed.q[0]) {
				case 0x0ULL:	i = arg0; break;
				case 0x1ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x7f)),
						  typnull);
					break;
				case 0x2ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x3f)),
						  typnull);
					break;
				case 0x3ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x1f)),
						  typnull);
					break;
				case 0x4ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x0f)),
						  typnull);
					break;
				case 0x5ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x07)),
						  typnull);
						break;
				case 0x6ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x03)),
						  typnull);
						break;
				case 0x7ULL:
					i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x01)),
						  typnull);
						break;
				default: i = immed64u((p64_t) 0ULL);
				}
				return(i);
			}
			error("shift right of unsigned 8-bit field values only "
			      "implemented for a constant shift");
			return(immed64u((p64_t) 0ULL));
		}

	case PACK: /* 8u */
		/* 16u -> 8u */
		/* PACK (without saturation) each 16-bit field value to
		   8 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		if (optcpu & CPU_AltiVec) {
			break;	/* vpkuhum */
		} else if (optcpu & CPU_MMX) {
			/* Clear upper halves of fields, then pack saturated */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);
			return(binop(PACK, i, j, typ8us));
		} else {
			/* Pack arg0 */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);
			j = binop(SHR, i, immed64u((p64_t) 8ULL), typnull);
			i = binop(OR, i, j, typnull);

			/* Pack arg1 */
			k = binop(AND,
				  arg1,
				  immedu(cvt1x16uto8x16u(0x00ff)),
				  typnull);
			l = binop(SHR, k, immed64u((p64_t) 8ULL), typnull);
			k = binop(OR, k, l, typnull);

			/* Combine packed arg0 and arg1 in result */
			return(binop(PACK, i, k, typ16u));
		}

	case INTRLVLOW: /* 8u */
		/* 4-bit to 8-bit interleave of 4-bit fields */
		/* another nasty shift and AND sequence... */
		{
			int bpf = bitsperfrag();

			/* high bit */
			j = arg1;
			switch (bitsperfrag()) {
			case 128:
				i = binop(AND,
					j,
					immed64u((p64_t) 0x00000000ffffffffULL),
					typnull);
				j = binop(AND,
					j,
					immed64u((p64_t) 0xffffffff00000000ULL),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 32ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 64:
				i = binop(AND,
					  j,
					  (bpf==64)?
						immed64u((p64_t)
							0x000000000000ffffULL):
					  immedu(cvt1x64uto2x64u(
						(p64_t)0x000000000000ffffULL)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==64)?
					      immed64u((p64_t)
							0x00000000ffff0000ULL):
					      immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffff0000ULL)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 16ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 32:
				i = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x000000ffULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x000000ff)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x0000ff00ULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x0000ff00)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 8ULL),
					  typnull);
				j = binop(OR, i, j, typnull);


				i = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x000f)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x00f0)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 4ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
			}
			k = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);


			/* low bit */
			j = arg0;
			switch (bitsperfrag()) {
			case 128:
				i = binop(AND,
					j,
					immed64u((p64_t) 0x00000000ffffffffULL),
					typnull);
				j = binop(AND,
					j,
					immed64u((p64_t) 0xffffffff00000000ULL),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 32ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 64:
				i = binop(AND,
					  j,
					  (bpf==64)?
						immed64u((p64_t)
							0x000000000000ffffULL):
					  immedu(cvt1x64uto2x64u(
						(p64_t)0x000000000000ffffULL)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==64)?
					      immed64u((p64_t)
							0x00000000ffff0000ULL):
					      immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffff0000ULL)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 16ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 32:
				i = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x000000ffULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x000000ff)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x0000ff00ULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x0000ff00)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 8ULL),
					  typnull);
				j = binop(OR, i, j, typnull);


				i = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x000f)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x00f0)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 4ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
			}
		}

		return(binop(OR, i, k, typnull));

	case INTRLVHIGH: /* 8u */
		/* 4-bit to 8-bit interleave of 4-bit fields */
		/* sneaky way to reuse INTRLVLOW code... */
		{
			unsigned long long bpf_2 =
				(unsigned long long) bitsperfrag()/2ULL;
			i = binop(SHR, arg0, immed64u((p64_t) bpf_2), typnull);
			j = binop(SHR, arg1, immed64u((p64_t) bpf_2), typnull);
			return(binop(INTRLVLOW, i, j, typ8u));
		}

	case INTRLVODD: /* 8u */
		/* Interleave 16-bit fields */
		if (optcpu & CPU_MAX) {
			break;
		} else if (optcpu & CPU_AltiVec) {
#ifdef TRINOPS
			p128_t tmp;
			tmp.uq[1] = 0x1000120214041606ULL;
			tmp.uq[0] = 0x18081A0A1C0C1E0EULL;
			return(trinop(TPERM,
				      arg0,
				      arg1,
				      immed128u(tmp),
				      typ8u));
#else
			/* For now, implement in target header file */
			break;
#endif
		}
		bug("INTRLVODD not available on this target");
		break;

	case INTRLVEVEN: /* 8u */
		if (optcpu & CPU_AltiVec) {
#ifdef TRINOPS
			p128_t tmp;
			tmp.uq[1] = 0x1101130315051707ULL;
			tmp.uq[0] = 0x19091B0B1D0D1F0FULL;
			return(trinop(TPERM,
				      arg0,
				      arg1,
				      immed128u(tmp),
				      typ8u));
#else
			/* For now, implement in target header file */
			break;
#endif
		} else {
			bug("INTRLVEVEN not available on this target");
			break;
		}

	case REPL: /* 8u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			bug("REPL8u not available on this target");
			break;
		}

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop8u op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

int
binop8us(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 8-bit unsigned field binary ops with saturation */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i, j, k, l;

	switch (op) {
	case MIN: /* 8us */
	case MAX: /* 8us */
	case AVG: /* 8us */

	case EQ: /* 8us */
	case NE: /* 8us */
	case GT: /* 8us */
	case LT: /* 8us */
	case LE: /* 8us */
	case GE: /* 8us */

	case LAND: /* 8us */
	case LOR: /* 8us */

	case AND: /* 8us */
	case ANDN: /* 8us */
	case OR: /* 8us */
	case XOR: /* 8us */
		/* These are all the same as unsigned unsaturated */
		return (binop(op, arg0, arg1, typ8u));

	case ADD: /* 8us */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			/* use implicit spacer technique */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x7f)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x7f)),
				  typnull);
			if (optcpu & CPU_MMX) {
				i = binop(ADD, i, j, typ32u);
			} else if (optcpu & CPU_MAX) {
				i = binop(ADD, i, j, typ16u);
			} else {
				i = binop(ADD, i, j, typ32u);
			}
			j = binop(XOR, arg0, arg1, typnull); /* "propagate" */
			k = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);
			i = binop(XOR, i, k, typnull);

			/* Calulate overflow */
			k = binop(AND, arg0, arg1, typnull);	/* generate */
			l = k;

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			l = binop(AND,
				  l,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);


			/* ...and create a saturation mask */
			k = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);

			/* Clobber the calculated value with the max on
			   overflow */
			return(binop(OR, l, i, typnull));
		}

	case SUB: /* 8us */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* Do subtract using implicit spacer technique */
			i = binop(OR,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x8uto16x8u(0x7f)),
				  typnull);
			i = binop(SUB, i, j, typ32u);
			j = binop(XOR, arg0, arg1, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);
			j = binop(XOR,
				  j,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);
			i = binop(XOR, i, j, typnull);

			/* Create a saturation mask */
			j = binop(GT, arg1, arg0, typ8u);
			j = unop(NOT, j, typnull);

			/* Clobber the calculated value with 0 on negative
			   overflow */
			return(binop(AND, i, j, typnull));
		}

	case MUL: /* 8us */
	    if (optcpu == GenericIA32) {
		/* Serialize */
		unsigned long long element;
		i = immed64u((p64_t)0ULL);
		for (element=0ULL; element<4ULL; ++element) {
			j = binop(SHL,
				  arg0,
				  immed64u((p64_t)(8ULL*element)),
				  typnull);
			j = binop(SHR, j, immed64u((p64_t)24ULL), typnull);
			k = binop(SHL,
				  arg1,
				  immed64u((p64_t)(8ULL*element)),
				  typnull);
			k = binop(SHR, k, immed64u((p64_t)24ULL), typnull);
			j = binop(MUL, j, k, typ32u);
			j = binop(MIN, j, immed64u((p64_t)0xffULL), typ32u);
			j = binop(SHL,
				  j,
				  immed64u((p64_t)(24ULL-8ULL*element)),
				  typnull);
			i = binop(OR, i, j, typnull);
		}
		/* Saturate to the declared precision */
		if (decl_bits != 8) {
			i = binop(MIN,
				  i,
				  immedu(cvt1x4uto32x4u((1 << decl_bits) - 1)),
				  typ8u);
		}
		return(i);

	    } else {
		/* Only have 16-bit muls...
		   unsigned interleave, mul, pack with saturation sequence
		*/
		i = binop(INTRLVLOW,
			  arg0,
			  immed64u((p64_t) 0ULL),
			  typ16u);
		j = binop(INTRLVLOW,
			  arg1,
			  immed64u((p64_t) 0ULL),
			  typ16u);
		k = binop(MUL, i, j, typ16u);

		i = binop(INTRLVHIGH,
			  arg0,
			  immed64u((p64_t) 0ULL),
			  typ16u);
		j = binop(INTRLVHIGH,
			  arg1,
			  immed64u((p64_t) 0ULL),
			  typ16u);
		i = binop(MUL, i, j, typ16u);

		return(binop(PACK, k, i, typ8us));
	    }

	case DIV: /* 8us */
		/* This method will bomb on div by 0 instead of saturating,
		   which is, of course, the point of doing a sat DIV. */

		/* Do divide as usual */
		i = binop(DIV, arg0, arg1, typ8u);

		/* Generate a saturation mask */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);

		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x80)), typnull);

		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, j, k, typnull);
		k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(OR, j, k, typnull);
		k = binop(SHR, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(OR, j, k, typnull);

		/* Clobber the calculated value with MAX if arg1==0 */
		return(binop(OR, i, j, typnull));

	case MOD: /* 8us */
		/* This method will bomb on mod by 0 instead of saturating,
		   which is, of course, the point of doing a sat MOD. */

		/* Do modulus as usual */
		i = binop(MOD, arg0, arg1, typ8u);

		/* Generate a saturation mask */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);

		j = binop(AND, j, immedu(cvt1x8uto16x8u(0x80)), typnull);

		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(OR, k, j, typnull);

		/* Clobber the calculated value with MAX if arg1==0 */
		return(binop(OR, i, j, typnull));

	case SHL: /* 8us */
		/* MMX does not directly do 8-bit shifts... */
		arg1 = shiftconst(arg1, typ8u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			i = binop(SHL,
				  arg0,
				  immed64u((p64_t)tup[arg1].immed.uq[0]),
				  typnull);
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:	i = arg0; break;
			case 0x1ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0xfe)),
						  typnull);
					break;
			case 0x2ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0xfc)),
						  typnull);
					break;
			case 0x3ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0xf8)),
						  typnull);
					break;
			case 0x4ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0xf0)),
						  typnull);
					break;
			case 0x5ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0xe0)),
						  typnull);
					break;
			case 0x6ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0xc0)),
						  typnull);
					break;
			case 0x7ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x80)),
						  typnull);
					break;
			default:	i = immed64u((p64_t) 0ULL);
			}
			return(i);
		}
		error("shift left of unsigned 8-bit field values only "
		      "implemented for a constant shift");
		return(immed64u((p64_t) 0ULL));

	case SHR: /* 8us */
		/* MMX does not directly do 8-bit shifts... */
		arg1 = shiftconst(arg1, typ8u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			i = binop(SHR,
				  arg0,
				  immed64u((p64_t)tup[arg1].immed.uq[0]),
				  typnull);
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:	i = arg0; break;
			case 0x1ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x7f)),
						  typnull);
					break;
			case 0x2ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x3f)),
						  typnull);
					break;
			case 0x3ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x1f)),
						  typnull);
					break;
			case 0x4ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x0f)),
						  typnull);
					break;
			case 0x5ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x07)),
						  typnull);
					break;
			case 0x6ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x03)),
						  typnull);
					break;
			case 0x7ULL:	i = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x01)),
						  typnull);
					break;
			default:	i = immed64u((p64_t) 0ULL);
			}
			return(i);
		}
		error("shift right of unsigned 8-bit field values only "
		      "implemented for a constant shift");
		return(immed64u((p64_t) 0ULL));

	case PACKS2U: /* 8us */
		if (optcpu & CPU_MMX) {
			break;
		} else {
			bug("PACKS2U not implemented for non-MMX target");
			break;
		}

	case PACK: /* 8us */
		/* 16us -> 8us */
		/* PACK (with unsigned saturation) each 16-bit field value to
		   8 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		if (optcpu & CPU_AltiVec) {
			break;
		} else if (optcpu & CPU_MMX) {
			i = binop(AND,
				  arg0,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);
			j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
			j = binop(OR, arg0, j, typnull);
			i = binop(ANDN, i, j, typnull);

			j = binop(AND,
				  arg1,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			k = binop(OR, arg1, k, typnull);
			j = binop(ANDN, j, k, typnull);

			return(binop(PACKS2U, i, j, typ8us));
		} else {
			/* Calculate the saturated values, then pack as
			   unsaturated */

			/* arg0 */
			i = binop(SHR, arg0, immed64u((p64_t) 4ULL), typnull);
			i = binop(OR, arg0, i, typnull);
			j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x16uto8x16u(0x0100)),
				  typnull);
			j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
			i = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
			i = binop(OR, i, j, typnull);
			i = binop(OR, arg0, i, typnull);

			/* arg1 */
			k = binop(SHR, arg1, immed64u((p64_t) 4ULL), typnull);
			k = binop(OR, arg1, k, typnull);
			l = binop(SHR, k, immed64u((p64_t) 2ULL), typnull);
			k = binop(OR, k, l, typnull);
			l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(OR, k, l, typnull);
			k = binop(AND,
				  k,
				  immedu(cvt1x16uto8x16u(0x0100)),
				  typnull);
			l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
			k = binop(OR, k, l, typnull);
			l = binop(SHR, k, immed64u((p64_t) 2ULL), typnull);
			k = binop(OR, k, l, typnull);
			l = binop(SHR, k, immed64u((p64_t) 4ULL), typnull);
			k = binop(OR, k, l, typnull);
			k = binop(OR, arg1, k, typnull);

			/* Combine packed arg0 and arg1 in result */
			return(binop(PACK, i, k, typ8u));
		}

	case INTRLVLOW: /* 8us */
	case INTRLVHIGH: /* 8us */
		/* 4-bit to 8-bit interleave of 4-bit fields */
		/* These are the same as unsigned */
		return(binop(op, arg0, arg1, typ8u));

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop8us op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

/* Helper for binop8(): isolate the even (low) byte of every 16-bit field
   of arg and sign-extend it to 16 bits (mask, shift up by 8, then shift
   back down by 8 as a 16-bit signed field).  Returns the resulting tuple.
*/
static int
binop8_sext_even16(int arg)
{
	int t;

	t = binop(AND, arg, immedu(cvt1x16uto8x16u(0x00ff)), typnull);
	t = binop(SHL, t, immed64u((p64_t) 8ULL), typnull);
	return(binop(SHR, t, immed64u((p64_t) 8ULL), typ16));
}

/* Helper for binop8(): serialized DIV or MOD (selected by op) used for
   GenericIA32.  Each of four 8-bit elements is moved to the top of the
   word, shifted down by 24 (typ64 shift), combined with op as typ64,
   then moved back to its original position and ORed into the result.
*/
static int
binop8_serialized(int op, int arg0, int arg1)
{
	unsigned long long element;
	int acc, lhs, rhs;

	acc = immed64u((p64_t)0ULL);
	for (element=0ULL; element<4ULL; ++element) {
		lhs = binop(SHL,
			    arg0,
			    immed64u((p64_t)(8ULL*element)),
			    typnull);
		lhs = binop(SHR, lhs, immed64u((p64_t)24ULL), typ64);
		rhs = binop(SHL,
			    arg1,
			    immed64u((p64_t)(8ULL*element)),
			    typnull);
		rhs = binop(SHR, rhs, immed64u((p64_t)24ULL), typ64);
		lhs = binop(op, lhs, rhs, typ64);
		lhs = binop(SHL, lhs, immed64u((p64_t)24ULL), typnull);
		lhs = binop(SHR,
			    lhs,
			    immed64u((p64_t)(8ULL*element)),
			    typnull);
		acc = binop(OR, acc, lhs, typnull);
	}
	return(acc);
}

/* Helper for binop8(): DIV or MOD (selected by op) of 8-bit signed fields
   done with 16-bit signed operations.  Even fields are sign-extended in
   place; odd fields are brought down with a 16-bit signed shift, operated
   on, and shifted back up.  The two half-results are masked and ORed.
*/
static int
binop8_via16(int op, int arg0, int arg1)
{
	int even, odd, t;

	/* Even fields (sign-extended to 16 bits) */
	even = binop8_sext_even16(arg0);
	t = binop8_sext_even16(arg1);
	even = binop(op, even, t, typ16);
	even = binop(AND,
		     even,
		     immedu(cvt1x16uto8x16u(0x00ff)),
		     typnull);

	/* Odd fields */
	odd = binop(SHR, arg0, immed64u((p64_t) 8ULL), typ16);
	t = binop(SHR, arg1, immed64u((p64_t) 8ULL), typ16);
	odd = binop(op, odd, t, typ16);
	odd = binop(SHL, odd, immed64u((p64_t) 8ULL), typ16);
	odd = binop(AND,
		    odd,
		    immedu(cvt1x16uto8x16u(0xff00)),
		    typnull);

	/* Combine */
	return(binop(OR, even, odd, typnull));
}

/* Helper for binop8(): per-field signed shift right by one built from an
   untyped shift: keep the original sign bit (0x80) and OR in the shifted
   value masked to its low 7 bits.
*/
static int
binop8_asr1_masked(int arg)
{
	int sign, rest;

	sign = binop(AND, arg, immedu(cvt1x8uto16x8u(0x80)), typnull);
	rest = binop(SHR, arg, immed64u((p64_t) 1ULL), typnull);
	rest = binop(AND, rest, immedu(cvt1x8uto16x8u(0x7f)), typnull);
	return(binop(OR, sign, rest, typnull));
}

/* Helper for binop8(): per-field select driven by (arg0 GT arg1) as 8-bit
   signed.  Fields where the compare holds take if_gt, the others take
   if_not.  MMX/MAX targets use ANDN (~mask & if_not); other targets use
   an explicit NOT followed by AND.
*/
static int
binop8_select_gt(int arg0, int arg1, int if_gt, int if_not)
{
	int mask, pick;

	mask = binop(GT, arg0, arg1, typ8);
	pick = binop(AND, mask, if_gt, typnull);
	if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
		mask = binop(ANDN, mask, if_not, typnull);
	} else {
		mask = unop(NOT, mask, typnull);
		mask = binop(AND, mask, if_not, typnull);
	}
	return(binop(OR, mask, pick, typnull));
}

/* Helper for binop8(): signed shift right of 8-bit fields by the constant
   held in tuple arg1 (tup[arg1].op must be NUM).  An unsigned 8-bit shift
   is used and the vacated high bits are filled from a per-field sign mask
   (all ones where the field's 0x80 bit is set).  A count of 0 yields arg0;
   a count above 7 yields the sign mask itself.
*/
static int
binop8_sar_const(int arg0, int arg1)
{
	unsigned long long count;
	int sign, keep;

	/* Per-field sign mask; built before inspecting the count, exactly
	   as the code always has. */
	sign = binop(AND, arg0, immedu(cvt1x8uto16x8u(0x80)), typnull);
	sign = binop(EQ, sign, immedu(cvt1x8uto16x8u(0x80)), typ8u);

	count = tup[arg1].immed.q[0];
	if (count == 0x0ULL) return(arg0);
	if (count > 0x7ULL) return(sign);

	/* Bits that survive the unsigned shift: 0x7f, 0x3f, ... 0x01 */
	keep = 0xff >> (int) count;
	sign = binop(ANDN, immedu(cvt1x8uto16x8u(keep)), sign, typnull);
	return(binop(OR,
		     sign,
		     binop(SHR, arg0, arg1, typ8u),
		     typnull));
}

int
binop8(int op,
int arg0,
int arg1)
{
	/* 8-bit signed field binary ops */
	/* Convert the op into a tree of subops if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.

	   op is the operation token; arg0 and arg1 are operand tuples.
	   Returns the tuple for the generated tree.  The paths that `break`
	   out of the switch return -1 without building anything; presumably
	   the caller then emits the op directly for that target -- confirm
	   against the caller.
	*/

	int i, j, k;

	switch (op) {
	case ADD: /* 8s */
	case EQ: /* 8s */
	case MUL: /* 8s */
	case NE: /* 8s */
	case SHL: /* 8s */
	case LAND: /* 8s */
	case LOR: /* 8s */
	case SUB: /* 8s */
	case XOR: /* 8s */
	case OR: /* 8s */
	case ANDN: /* 8s */
	case AND: /* 8s */
	case PACK: /* 8s */
	case INTRLVLOW: /* 8s */
	case INTRLVHIGH: /* 4-bit to 8-bit interleave of 4-bit fields */
		/* These are the same thing as unsigned */
		return(binop(op, arg0, arg1, typ8u));

	case MULEVEN: /* 8s */
		if (!(optcpu & CPU_AltiVec)) {
			bug("MULEVEN8s not supported on this target");
		}
		/* Explicit break: if bug() returns we must not fall into
		   the lowering of an unrelated op. */
		break;

	case MULODD: /* 8s */
		if (!(optcpu & CPU_AltiVec)) {
			bug("MULODD8s not supported on this target");
		}
		break;

	case DIV: /* 8s */
	case MOD: /* 8s */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		}
		if (optcpu == GenericIA32) {
			/* Serialize */
			return(binop8_serialized(op, arg0, arg1));
		}
		/* use 16-bit divides / modulus */
		return(binop8_via16(op, arg0, arg1));

	case AVG: /* 8s */
		/* Average rounds up */
		if (optcpu & CPU_AltiVec) {
			break;
		}
		if (optcpu == GenericIA32) {
			i = binop8_asr1_masked(arg0);
			j = binop8_asr1_masked(arg1);
		} else {
			i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typ8);
			j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typ8);
		}
		i = binop(ADD, i, j, typ8);

		/* Calculate and add rounding bit */
		j = binop(OR, arg0, arg1, typnull);
		j = binop(AND,
			  j,
			  immedu(cvt1x8uto16x8u(0x01)),
			  typnull);
		return(binop(ADD, i, j, typ8));

	case MIN: /* 8s */
		if (optcpu & CPU_AltiVec) {
			break;
		}
		/* use GT: where arg0 > arg1 take arg1, else arg0 */
		return(binop8_select_gt(arg0, arg1, arg1, arg0));

	case MAX: /* 8s */
		if (optcpu & CPU_AltiVec) {
			break;
		}
		/* use GT: where arg0 > arg1 take arg0, else arg1 */
		return(binop8_select_gt(arg0, arg1, arg0, arg1));

	case GT: /* 8s */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		}
		if (optcpu == GenericIA32) {
			/* use 4-bit GT and EQ */
			i = binop(GT, arg0, arg1, typ4u);
			i = binop(SHL, i, immed64u((p64_t) 4ULL), typnull);
			j = binop(EQ, arg0, arg1, typ4);
			i = binop(AND, i, j, typnull);

			j = binop(GT, arg0, arg1, typ4);
			i = binop(OR, i, j, typnull);

			/* Keep the high-nibble verdict and copy it into
			   the low nibble */
			i = binop(AND,
				  i,
				  immedu(cvt1x8uto16x8u(0xf0)),
				  typnull);
			j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
			return(binop(OR, i, j, typnull));
		}

		/* use 16-bit GT */

		/* Compare even fields (sign-extended to 16 bits) */
		i = binop8_sext_even16(arg0);
		j = binop8_sext_even16(arg1);
		i = binop(GT, i, j, typ16);
		i = binop(AND,
			  i,
			  immedu(cvt1x16uto8x16u(0x00ff)),
			  typnull);

		/* Compare odd fields (left in the high byte) */
		j = binop(AND,
			  arg0,
			  immedu(cvt1x16uto8x16u(0xff00)),
			  typnull);
		k = binop(AND,
			  arg1,
			  immedu(cvt1x16uto8x16u(0xff00)),
			  typnull);
		j = binop(GT, j, k, typ16);
		j = binop(AND,
			  j,
			  immedu(cvt1x16uto8x16u(0xff00)),
			  typnull);

		/* Combine */
		return(binop(OR, i, j, typnull));

	case GE: /* 8s */
		/* the obvious hack is that x GE y is (x EQ y) OR (x GT y) */
		i = binop(GT, arg0, arg1, typ8);
		j = binop(EQ, arg0, arg1, typ8);
		return(binop(OR, i, j, typnull));

	case LT: /* 8s */
		return(binop(GT, arg1, arg0, typ8));

	case LE: /* 8s */
		return(binop(GE, arg1, arg0, typ8));

	case SHR: /* 8s */
		if (optcpu & CPU_AltiVec) {
			break;
		}
		/* MMX does not directly do 8-bit shifts...
		   use unsigned shift, but paste-in sign extension
		*/
		arg1 = shiftconst(arg1, typ8u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			return(binop8_sar_const(arg0, arg1));
		}
		error("shift right of signed 8-bit field values only "
		      "implemented for a constant shift");
		return(immed64u((p64_t) 0ULL));

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop8 op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
		break;
	}

	return(-1);
}

int
binop8ss(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 8-bit signed field binary ops with saturation */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i, j, k, l, m;

	switch (op) {
	case EQ: /* 8ss */
	case NE: /* 8ss */

	case LAND: /* 8ss */
	case LOR: /* 8ss */

	case AND: /* 8ss */
	case ANDN: /* 8ss */
	case OR: /* 8ss */
	case XOR: /* 8ss */
		/* These are all the same as unsigned unsaturated */
		return(binop(op, arg0, arg1, typ8u));

	case MIN: /* 8ss */
	case MAX: /* 8ss */
	case AVG: /* 8ss */

	case GT: /* 8ss */
	case GE: /* 8ss */
	case LT: /* 8ss */
	case LE: /* 8ss */
		/* These are all the same as signed unsaturated */
		return(binop(op, arg0, arg1, typ8));

	case ADD: /* 8ss */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* Do the signed add */
			i = binop(ADD, arg0, arg1, typ8);

			/* Correct for positive saturation */
			j = binop(OR, arg0, arg1, typnull);
			j = unop(NOT, j, typnull);	/* tX..X if both pos */
			j = binop(AND, j, i, typnull);	/* MSb(sum) = 1 */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);

			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);		/* T...T */

			/* Clobber the calculated value with PosSat if
			   arg1==0
			*/
			i = binop(OR, i, l, typnull);		/* T...T */
			j = unop(NOT, j, typnull);		/* f1..1 */
			i = binop(AND, i, j, typnull);


			/* Correct for negative saturation */
			j = binop(AND, arg0, arg1, typnull);
							/* tX..X if both neg */
			k = unop(NOT, i, typnull);
			j = binop(AND, j, k, typnull);	/* MSb(sum) = 0 */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);

			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);		/* T...T */

			/* Clobber the calculated value with NegSat if
			   arg1==0
			*/
			l = unop(NOT, l, typnull);		/* F...F */
			i = binop(AND, i, l, typnull);
			i = binop(OR, i, j, typnull);

			return(i);
		}
		
	case SUB: /* 8ss */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* Do the signed sub */
			i = binop(SUB, arg0, arg1, typ8);

			/* Correct for positive saturation */
			m = binop(XOR, arg0, arg1, typnull);
							/* tX..X if mixed */
			j = binop(AND, m, arg1, typnull);
						/* tX..X if arg0+ & arg1- */
			j = binop(AND, j, i, typnull);	/* MSb(diff) = 1 */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);

			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);		/* T...T */

			/* Clobber the calculated value with PosSat if
			   arg1==0
			*/
			i = binop(OR, i, l, typnull);		/* T...T */
			j = unop(NOT, j, typnull);		/* f1..1 */
			i = binop(AND, i, j, typnull);


			/* Correct for negative saturation */
			j = binop(AND, m, arg0, typnull);
						/* tX..X if arg0- & arg1+ */
			k = unop(NOT, i, typnull);
			j = binop(AND, j, k, typnull);	/* MSb(diff) = 0 */
			j = binop(AND,
				  j,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);

			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);		/* T...T */

			/* Clobber the calculated value with NegSat if
			   arg1==0
			*/
			l = unop(NOT, l, typnull);		/* F...F */
			i = binop(AND, i, l, typnull);
			i = binop(OR, i, j, typnull);

			return(i);
		}

	case MUL: /* 8ss */
	    if (optcpu == GenericIA32) {
		/* Serialize */
		unsigned long long element;
		i = immed64u((p64_t)0ULL);
		for (element=0ULL; element<4ULL; ++element) {
			j = binop(SHL,
				  arg0,
				  immed64u((p64_t)(8ULL*element)),
				  typnull);
			j = binop(SHR, j, immed64u((p64_t)24ULL), typ32);
			k = binop(SHL,
				  arg1,
				  immed64u((p64_t)(8ULL*element)),
				  typnull);
			k = binop(SHR, k, immed64u((p64_t)24ULL), typ32);
			j = binop(MUL, j, k, typ32);
			j = binop(MIN, j, immed64u((p64_t)0x7fULL), typ32);
			j = binop(MAX,
				  j,
				  immed32u((p32_t)0xffffff80U),
				  typ32);
			j = binop(AND, j, immed64u((p64_t)0xffULL), typnull);
			j = binop(SHL,
				  j,
				  immed64u((p64_t)(24ULL-8ULL*element)),
				  typnull);
			i = binop(OR, i, j, typnull);
		}
		/* Saturate to the declared precision */
		if (decl_bits != 8) {
			i = binop(MIN,
				  i,
				  immedu(cvt1x4uto32x4u((1 << decl_bits) - 1)),
				  typ8u);
		}
		return(i);
	    } else if (optcpu & CPU_AltiVec) {
		/* Only have 16-bit muls...
		   sign-extend halves, mul, pack with saturation sequence
		*/
		i = unop(UNPACKL, arg0, typ16);
		j = unop(UNPACKL, arg1, typ16);
		k = binop(MUL, i, j, typ16);

		i = unop(UNPACKH, arg0, typ16);
		j = unop(UNPACKH, arg1, typ16);
		i = binop(MUL, i, j, typ16);

		return(binop(PACK, k, i, typ8ss));

	    } else {
		/* Only have 16-bit muls...
		   interleave with sign bit, mul, pack with saturation sequence
		*/
		i = binop(SHR, arg0, immed64u((p64_t) 7ULL), typ8);
		j = binop(SHR, arg1, immed64u((p64_t) 7ULL), typ8);

		k = binop(INTRLVLOW, arg0, i, typ16u);
		l = binop(INTRLVLOW, arg1, j, typ16u);
		k = binop(MUL, k, l, typ16);

		i = binop(INTRLVHIGH, arg0, i, typ16u);
		j = binop(INTRLVHIGH, arg1, j, typ16u);
		i = binop(MUL, i, j, typ16);

		return(binop(PACK, k, i, typ8ss));
	    }

	case DIV: /* 8ss */
		/* This method will bomb on div by 0 instead of saturating,
		   which is, of course, the point of doing a sat DIV. */

		/* Modify 8-bit divides */
		i = binop(DIV, arg0, arg1, typ8);


		/* Correct for positive saturation */
		/* Get ~i~j~k~l~m~n~o~p */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);

		/* Get ijklmnop */
		k = binop(SHL, arg1, immed64u((p64_t) 1ULL), typnull);
						   /* j:k:l:m:n:o:p:X */
		k = binop(AND, arg1, k, typnull);  /* ij:jk:kl:lm:mn:no:op:X */
		l = binop(SHL, k, immed64u((p64_t) 2ULL), typnull);
						   /* kl:lm:mn:no:op:X... */
		k = binop(AND, k, l, typnull);     /* ijkl:X:X:X:mnop:X...*/
		l = binop(SHL, k, immed64u((p64_t) 4ULL), typnull);
						   /* mnop:X... */
		k = binop(AND, k, l, typnull);     /* ijklmnop:X... */

		/* Get a~b~c~d~e~f~g~h */
		l = unop(NOT, arg0, typnull);           /* ~a:~b...~g:~h */
		m = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* ~b:~c...~h:X */
		l = binop(AND, l, m, typnull);          /* X:~b~c:..~d~e:... */
		m = binop(SHL, l, immed64u((p64_t) 2ULL), typnull);
							/* X:~d~e:... */
		l = binop(AND, l, m, typnull);          /* X:~b~c~d~e:...:X */
		m = binop(SHL, l, immed64u((p64_t) 3ULL), typnull);
							/* X:~e~f~g~h:...:X */
		l = binop(AND, l, m, typnull);          /* X:~b~c~d~e~f~g~h.. */

		l = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
							/* ~b~c~d~e~f~g~h...  */

		l = binop(AND, l, arg0, typnull);       /* a~b~c~d~e~f~g~h... */

		l = binop(AND, k, l, typnull);	/* a~b~c~d~e~f~g~hijklmnop */

		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			k = binop(ANDN, arg0, j, typnull);
		} else {
			k = unop(NOT, arg0, typnull);
			k = binop(AND, k, j, typnull);
		} /* ~a(tuple j) */
		k = binop(OR, l, k, typnull); /* ~a~i...~p | a~b...~hi...p */
		k = binop(AND, k, immedu(cvt1x8uto16x8u(0x80)), typnull);
		/* k has pattern T0000000 */

		/* Apply the mask to the MSb */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			i = binop(ANDN, k, i, typnull);
		} else {
			l = unop(NOT, k, typnull);
			i = binop(AND, i, l, typnull);
		}

		/* Convert k's T0000000 pattern to a 0TTTTTTT pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
							/* 0T000000*/
		l = binop(OR, k, l, typnull);		/* TT000000*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
							/* 00TT0000*/
		l = binop(OR, l, m, typnull);		/* TTTT0000*/
		m = binop(SHR, l, immed64u((p64_t) 3ULL), typnull);
							/* 000TTTT0*/
		l = binop(OR, l, m, typnull);		/* TTTTTTT0*/
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
							/* 0TTTTTTT*/

		/* Apply the mask to the LSbs */
		i = binop(OR, i, l, typnull);


		/* Correct for negative saturation */
		k = binop(AND, arg0, j, typnull);	/* a(tuple j) */
		k = binop(AND, k, immedu(cvt1x8uto16x8u(0x80)), typnull);
		/* k has pattern T000 */

		/* Apply the mask to the MSb */
		i = binop(OR, i, k, typnull);

		/* Convert k's T0000000 pattern to a 1FFFFFFF pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* 0T000000*/
		l = binop(OR, k, l, typnull);			/* TT000000*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* 00TT0000*/
		l = binop(OR, k, l, typnull);			/* TTTT0000*/
		m = binop(SHR, l, immed64u((p64_t) 3ULL), typnull);
								/* 000TTTT0*/
		l = binop(OR, l, m, typnull);			/* TTTTTTT0*/
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
								/* 0TTTTTTT*/
		l = unop(NOT, l, typnull);			/* 1FFFFFFF*/

		/* Apply the mask to the LSbs */
		i = binop(AND, i, l, typnull);

		return(i);

	case MOD: /* 8ss */
		/* This method will bomb on mod by 0 instead of saturating,
		   which is, of course, the point of doing a sat MOD. */

		/* Modify 8-bit modulus */
		i = binop(MOD, arg0, arg1, typ8);

		/* Get ~i~j~k~l~m~n~o~p */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);

		/* Correct for positive saturation */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			k = binop(ANDN, arg0, j, typnull);   /* ~a(tuple j) */
		} else {
			k = unop(NOT, arg0, typnull);
			k = binop(AND, k, j, typnull);    /* ~a(tuple j) */
		}
		k = binop(AND, k, immedu(cvt1x8uto16x8u(0x80)), typnull);
		/* k has pattern T0000000 */

		/* Apply the mask to the MSb */
		l = unop(NOT, k, typnull);
		i = binop(AND, i, l, typnull);

		/* Convert k's T0000000 pattern to a 0TTTTTTT pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* 0T000000*/
		l = binop(OR, k, l, typnull);			/* TT000000*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* 00TT0000*/
		l = binop(OR, l, m, typnull);			/* TTTT0000*/
		m = binop(SHR, l, immed64u((p64_t) 3ULL), typnull);
								/* 000TTTT0*/
		l = binop(OR, l, m, typnull);			/* TTTTTTT0*/
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
								/* 0TTTTTTT*/

		/* Apply the mask to the LSbs */
		i = binop(OR, i, l, typnull);


		/* Correct for negative saturation */
		k = binop(AND, arg0, j, typnull);	/* a(tuple j) */
		k = binop(AND, k, immedu(cvt1x8uto16x8u(0x80)), typnull);
		/* k has pattern T0000000 */

		/* Apply the mask to the MSb */
		i = binop(OR, i, k, typnull);

		/* Convert k's T0000000 pattern to a 1FFFFFFF pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* 0T000000*/
		l = binop(OR, k, l, typnull);			/* TT000000*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* 00TT0000*/
		l = binop(OR, k, l, typnull);			/* TTTT0000*/
		m = binop(SHR, l, immed64u((p64_t) 3ULL), typnull);
								/* 000TTTT0*/
		l = binop(OR, l, m, typnull);			/* TTTTTTT0*/
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
								/* 0TTTTTTT*/
		l = unop(NOT, l, typnull);			/* 1FFFFFFF*/

		/* Apply the mask to the LSbs */
		i = binop(AND, i, l, typnull);

		return(i);

	case SHL: /* 8ss */
		/* Not true? */
		/* the same thing signed or unsigned */
		return(binop(op, arg0, arg1, typ8u));

	case SHR: /* 8ss */
		/* MMX does not directly do 8-bit shifts...
		   use unsigned shift, but paste-in sign extension
		*/
		arg1 = shiftconst(arg1, typ8u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x8uto16x8u(0x80)),
				  typnull);
			i = binop(EQ, i, immedu(cvt1x8uto16x8u(0x80)), typ8u);
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:	i = arg0;
					break;
			case 0x1ULL:	i = binop(ANDN,
						  immedu(cvt1x8uto16x8u(0x7f)),
						  i,
						  typnull);
					i = binop(OR,
						  i,
						  binop(SHR, arg0, arg1, typ8u),
						  typnull);
					break;
			case 0x2ULL:	i = binop(ANDN,
						  immedu(cvt1x8uto16x8u(0x3f)),
						  i,
						  typnull);
					i = binop(OR,
						  i,
						  binop(SHR, arg0, arg1, typ8u),
						  typnull);
					break;
			case 0x3ULL:	i = binop(ANDN,
						  immedu(cvt1x8uto16x8u(0x1f)),
						  i,
						  typnull);
					i = binop(OR,
						  i,
						  binop(SHR, arg0, arg1, typ8u),
						  typnull);
					break;
			case 0x4ULL:	i = binop(ANDN,
						  immedu(cvt1x8uto16x8u(0x0f)),
						  i,
						  typnull);
					i = binop(OR,
						  i,
						  binop(SHR, arg0, arg1, typ8u),
						  typnull);
					break;
			case 0x5ULL:	i = binop(ANDN,
						  immedu(cvt1x8uto16x8u(0x07)),
						  i,
						  typnull);
					i = binop(OR,
						  i,
						  binop(SHR, arg0, arg1, typ8u),
						  typnull);
					break;
			case 0x6ULL:	i = binop(ANDN,
						  immedu(cvt1x8uto16x8u(0x03)),
						  i,
						  typnull);
					i = binop(OR,
						  i,
						  binop(SHR, arg0, arg1, typ8u),
						  typnull);
					break;
			case 0x7ULL:	i = binop(ANDN,
						  immedu(cvt1x8uto16x8u(0x01)),
						  i,
						  typnull);
					i = binop(OR,
						  i,
						  binop(SHR, arg0, arg1, typ8u),
						  typnull);
					break;
			default:	break;
			}
			return(i);
		}
		error("shift right of unsigned 8-bit field values only "
		      "implemented for a constant shift");
		return(immed64u((p64_t) 0ULL));

	case PACK: /* 8ss */
		/* 16ss -> 8ss */
		/* PACK (with saturation) each 16-bit field value to
		   8 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* Saturate arg0 */
			i = binop(MIN,
				  arg0,
				  immedu(cvt1x16uto8x16u(0x007f)),
				  typ16);
			i = binop(MAX,
				  i,
				  immedu(cvt1x16uto8x16u(0xff80)),
				  typ16);

			/* Saturate arg1 */
			j = binop(MIN,
				  arg1,
				  immedu(cvt1x16uto8x16u(0x007f)),
				  typ16);
			j = binop(MAX,
				  j,
				  immedu(cvt1x16uto8x16u(0xff80)),
				  typ16);

			/* Pack as signed */
			return(binop(PACK, i, j, typ8));
		}

	case INTRLVLOW: /* 8ss */
	case INTRLVHIGH: /* 8ss */
		/* 4-bit to 8-bit interleave of 4-bit fields */
		/* These are the same thing as unsigned */
		return(binop(op, arg0, arg1, typ8u));

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop8ss op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}


int
binop16u(int op,
int arg0,
int arg1)
{
	/* 16-bit unsigned field binary ops */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i=-1, j, k, l;

	switch (op) {
	case ADD: /* 16u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* use implicit spacer technique */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x16uto8x16u(0x7fff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x16uto8x16u(0x7fff)),
				  typnull);
			i = binop(ADD, i, j, typ32u);
			j = binop(XOR, arg0, arg1, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);
			return(binop(XOR, i, j, typnull));
		}

	case SUB: /* 16u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* use implicit spacer technique */
			i = binop(OR,
				  arg0,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x16uto8x16u(0x7fff)),
				  typnull);
			i = binop(SUB, i, j, typ32u);
			j = binop(XOR, arg0, arg1, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);
			j = binop(XOR,
				  j,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);
			return(binop(XOR, i, j, typnull));
		}

	case MUL: /* 16u */
		if (optcpu & CPU_MMX) {
			break;
		} else if (optcpu == GenericIA32) {
			/* Use 32-bit MUL
			   unsigned interleave, mul, cast via modulation, pack
			   with saturation sequence
			*/
			i = binop(INTRLVLOW,
				  arg0,
				  immed64u((p64_t) 0ULL),
				  typ32u);
			j = binop(INTRLVLOW,
				  arg1,
				  immed64u((p64_t) 0ULL),
				  typ32u);
			k = binop(MUL, i, j, typ32u);

			k = binop(AND,
				  k,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);

			i = binop(INTRLVHIGH,
				  arg0,
				  immed64u((p64_t) 0ULL),
				  typ32u);
			j = binop(INTRLVHIGH,
				  arg1,
				  immed64u((p64_t) 0ULL),
				  typ32u);
			i = binop(MUL, i, j, typ32u);
			i = binop(AND,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);

			return(binop(PACK, k, i, typ16us));
		} else if (optcpu & CPU_AltiVec) {
			/* Have even and odd 16u->32u muls */
			i = binop(MULEVEN, arg0, arg1, typ16u);
			i = binop(AND,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t)0x0000ffff)),
				  typnull);

			j = binop(MULODD, arg0, arg1, typ16u);
			j = binop(SHL,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 16U)),
				  typ32u);
			return(binop(OR, i, j, typnull));
		} else {
			unsigned long long step;

			/* Perform a shift-add sequence */
			i = immed64u((p64_t) 0ULL);
			for (step=0ULL; step<16ULL; ++step)
			{
				j = binop(AND,
					  arg1,
					  immedu(cvt1x16uto8x16u(0x0001<<step)),
					  typnull);
				k = binop(NE,
					 j,
					 immed64u((p64_t) 0ULL),
					 typ16u);
				j = binop(AND, arg0, k, typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) step),
					  typ16u);
				i = binop(ADD, i, j, typ16u);
			}
			return(i);
		}

	case MULH: /* 16u */
		if (optcpu & CPU_athlon) {
			break;
		} else if (optcpu & CPU_AltiVec) {
			/* Have even and odd 16u->32u muls */
			i = binop(MULEVEN, arg0, arg1, typ16u);
			i = binop(SHR,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t) 16)),
				  typ32u);

			j = binop(MULODD, arg0, arg1, typ16u);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t)0xffff0000)),
				  typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* Emulate using two 32-bit unsigned MULs */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			k = binop(MUL, i, j, typ32u);
			k = binop(AND,
				  k,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);
			k = binop(SHR, k, immed64u((p64_t) 16ULL), typnull);

			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);
			i = binop(SHR, i, immed64u((p64_t) 16ULL), typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);
			j = binop(SHR, j, immed64u((p64_t) 16ULL), typnull);
			l = binop(MUL, i, j, typ32u);
			l = binop(AND,
				  l,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);

			return (binop(OR, k, l, typnull));
		}

	case MULEVEN: /* 16u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			bug("MULEVEN16u not supported on this target");
		}

	case MULODD: /* 16u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			bug("MULODD16u not supported on this target");
		}

	case DIV: /* 16u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		#ifdef NOTDEFD
		  } else if (optcpu & CPU_AltiVec) {
			/* use 32-bit divides */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			i = binop(DIV, i, j, typ32u);
			j = binop(SHR, arg0, immed64u((p64_t) 16ULL), typ32u);
			k = binop(SHR, arg1, immed64u((p64_t) 16ULL), typ32u);
			j = binop(DIV, j, k, typ32u);
			return(binop(INTRLVEVEN, i, j, typ16u));
		#endif
		} else {
			/* use 32-bit divides */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			i = binop(DIV, i, j, typ32u);
			j = binop(SHR, arg0, immed64u((p64_t) 16ULL), typ32u);
			k = binop(SHR, arg1, immed64u((p64_t) 16ULL), typ32u);
			j = binop(DIV, j, k, typ32u);
			j = binop(SHL, j, immed64u((p64_t) 16ULL), typ32u);
			return(binop(OR, i, j, typnull));
		}

	case MOD: /* 16u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		#ifdef NOTDEFD
		  } else if (optcpu & CPU_AltiVec) {
			/* use 32-bit modulus */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			i = binop(MOD, i, j, typ32u);
			j = binop(SHR, arg0, immed64u((p64_t) 16ULL), typ32u);
			k = binop(SHR, arg1, immed64u((p64_t) 16ULL), typ32u);
			j = binop(MOD, j, k, typ32u);
			return(binop(INTRLVEVEN, i, j, typ16u));
		#endif
		} else {
			/* use 32-bit modulus */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			i = binop(MOD, i, j, typ32u);
			j = binop(SHR, arg0, immed64u((p64_t) 16ULL), typ32u);
			k = binop(SHR, arg1, immed64u((p64_t) 16ULL), typ32u);
			j = binop(MOD, j, k, typ32u);
			j = binop(SHL, j, immed64u((p64_t) 16ULL), typ32u);
			return(binop(OR, i, j, typnull));
		}

	case AND: /* 16u */
	case OR: /* 16u */
	case XOR: /* 16u */
		break;

	case ANDN: /* 16u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			break;
		} else {
			i = unop(NOT, arg0, typnull);
			return(binop(AND, i, arg1, typnull));
		}

	case AVG: /* 16u */
		/* Average rounds up */
		if ((optcpu & CPU_MAX) || (optcpu & CPU_athlon) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		} else {
			i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typ16u);
			i = binop(AND,
				  i,
				  immedu(cvt1x16uto8x16u(0x7fff)),
				  typnull);
			j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typ16u);
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0x7fff)),
				  typnull);
			i = binop(ADD, i, j, typ16u);
			j = binop(OR, arg0, arg1, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0x0001)),
				  typnull);
			i = binop(ADD, i, j, typ32u);
			return(i);
		}

	case MIN: /* 16u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			/* use GT */
			i = binop(GT, arg0, arg1, typ16u);
			j = binop(AND, i, arg1, typnull);
			i = binop(ANDN, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* use GT */
			i = binop(GT, arg0, arg1, typ16u);
			j = binop(AND, i, arg1, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		}

	case MAX: /* 16u */
		if ((optcpu & CPU_XMMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			/* use GT */
			i = binop(GT, arg0, arg1, typ16u);
			j = binop(AND, i, arg0, typnull);
			i = binop(ANDN, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* use GT */
			i = binop(GT, arg0, arg1, typ16u);
			j = binop(AND, i, arg0, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		}

	case EQ: /* 16u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* use 32-bit EQ */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			i = binop(EQ, i, j, typ32u);
			i = binop(AND,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			j = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);
			k = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);
			j = binop(EQ, j, k, typ32u);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);
			return(binop(OR, i, j, typnull));
		}

	case NE: /* 16u */
		/* not EQ */
		return(unop(NOT, binop(EQ, arg0, arg1, typ16u), typnull));

	case GT: /* 16u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else if (optcpu & CPU_MMX) {
			/* Add offset and do signed GT */
			i = binop(ADD,
				  arg0,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typ16u);
			j = binop(ADD,
				  arg1,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typ16u);
			return(binop(GT, i, j, typ16));
		} else {
			/* use 32-bit GT */

			/* Compare even fields */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t)0x0000ffff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t)0x0000ffff)),
				  typnull);
			i = binop(GT, i, j, typ32u);
			i = binop(AND,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t)0x0000ffff)),
				  typnull);

			/* Compare odd fields */
			j = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t)0xffff0000)),
				  typnull);
			k = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t)0xffff0000)),
				  typnull);
			j = binop(GT, j, k, typ32u);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t)0xffff0000)),
				  typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		}

	case LT: /* 16u */
		return(binop(GT, arg1, arg0, typ16u));

	case LE: /* 16u */
		return(binop(GE, arg1, arg0, typ16u));

	case GE: /* 16u */
		/* the obvious hack is that x GE y is (x EQ y) OR (x GT y) */
		i = binop(GT, arg0, arg1, typ16u);
		j = binop(EQ, arg0, arg1, typ16u);
		return(binop(OR, i, j, typnull));

	case LAND: /* 16u */
		/* use 16-bit NE 0 to normalize fields before AND */
		i = binop(NE, arg0, immed64u((p64_t) 0x0ULL), typ16u);
		j = binop(NE, arg1, immed64u((p64_t) 0x0ULL), typ16u);
		return(binop(AND, i, j, typnull));

	case LOR: /* 16u */
		/* use 16-bit NE 0 to normalize fields after ORing */
		i = binop(OR, arg0, arg1, typnull);
		return(binop(NE, i, immed64u((p64_t) 0ULL), typ16u));

	case SHL: /* 16u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX) ||
		    (optcpu & CPU_AltiVec))
			break;

		arg1 = shiftconst(arg1, typ8u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			i = binop(SHL,
				  arg0,
				  immed64u((p64_t)tup[arg1].immed.uq[0]),
				  typnull);
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:	i = arg0; break;
			case 0x1ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xfffe)),
						typnull);
					break;
			case 0x2ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xfffc)),
						typnull);
					break;
			case 0x3ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xfff8)),
						typnull);
					break;
			case 0x4ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xfff0)),
						typnull);
					break;
			case 0x5ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xffe0)),
						typnull);
					break;
			case 0x6ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xffc0)),
						typnull);
					break;
			case 0x7ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xff80)),
						typnull);
					break;

			case 0x8ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xff00)),
						typnull);
					break;
			case 0x9ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xfe00)),
						typnull);
					break;
			case 0xaULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xfc00)),
						typnull);
					break;
			case 0xbULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xf800)),
						typnull);
					break;
			case 0xcULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xf000)),
						typnull);
					break;
			case 0xdULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xe000)),
						typnull);
					break;
			case 0xeULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0xc000)),
						typnull);
					break;
			case 0xfULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x8000)),
						typnull);
					break;

			default:	i = immed64u((p64_t) 0ULL);
			}
			return(i);
		}
		error("shift left of unsigned 16-bit field values only "
		      "implemented for a constant shift");
		return(immed64u((p64_t) 0ULL));

	case SHR: /* 16u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX) ||
		    (optcpu & CPU_AltiVec))
			break;

		arg1 = shiftconst(arg1, typ8u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			i = binop(SHR,
				  arg0,
				  immed64u((p64_t)tup[arg1].immed.uq[0]),
				  typnull);
			switch (tup[arg1].immed.q[0]) {
			case 0x0ULL:	i = arg0; break;
			case 0x1ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x7fff)),
						typnull);
					break;
			case 0x2ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x3fff)),
						typnull);
					break;
			case 0x3ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x1fff)),
						typnull);
					break;
			case 0x4ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x0fff)),
						typnull);
					break;
			case 0x5ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x07ff)),
						typnull);
					break;
			case 0x6ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x03ff)),
						typnull);
					break;
			case 0x7ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x01ff)),
						typnull);
					break;

			case 0x8ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x00ff)),
						typnull);
					break;
			case 0x9ULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x007f)),
						typnull);
					break;
			case 0xaULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x003f)),
						typnull);
					break;
			case 0xbULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x001f)),
						typnull);
					break;
			case 0xcULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x000f)),
						typnull);
					break;
			case 0xdULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x0007)),
						typnull);
					break;
			case 0xeULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x0003)),
						typnull);
					break;
			case 0xfULL:	i = binop(AND,
						i,
						immedu(cvt1x16uto8x16u(0x0001)),
						typnull);
					break;

			default:	i = immed64u((p64_t) 0ULL);
			}
			return(i);
		}
		error("shift right of unsigned 16-bit field values only "
		      "implemented for a constant shift");
		return(immed64u((p64_t) 0ULL));

	case PACK: /* 16u */
		/* 32u -> 16u */
		/* PACK (without saturation) each 32-bit field value to
		   16 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		if (optcpu & CPU_AltiVec) {
			/* vpkuwum */
			break;
		} else if (optcpu & CPU_MAX) {
			i = binop(INTRLVEVEN, arg0, arg1, typ16u);
			return(binop(PERM,
				     i,
				     immed64u((p64_t) 1302ULL),
				     typ16u));
		}

		if (bitsperfrag() > 32)
		{
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t)0x0000ffff)),
				  typnull);
			j = binop(SHR, i, immed64u((p64_t) 16ULL), typnull);
			i = binop(OR, i, j, typnull);

			k = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t)0x0000ffff)),
				  typnull);
			l = binop(SHR, k, immed64u((p64_t) 16ULL), typnull);
			k = binop(OR, k, l, typnull);

			/* Combine packed arg0 and arg1 in result */
			return(binop(PACK, i, k, typ32u));
		}
		else
		{
			/* Stop here if 32-bit target */

			/* Keep low 16 bits of arg0 in packed form in
			   low half (32-bit target) */
			i = binop(AND,
				  arg0,
				  immed32u((p32_t)0x0000ffff),
				  typnull);

			/* Move low 16 bits of arg1 to high half */
			k = binop(SHL, arg1, immed64u((p64_t) 16ULL), typnull);

			/* Combine packed arg0 and arg1 in result */
			return(binop(OR, i, k, typnull));
		}

	case INTRLVLOW: /* 16u */
	    /* 8-bit to 16-bit interleave of 8-bit fields */
	    if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
		break;		/* vmrglb for AltiVec */
	    } else {
			int bpf = bitsperfrag();

			/* high bit */
			j = arg1;
			switch (bitsperfrag()) {
			case 128:
				i = binop(AND,
					j,
					immed64u((p64_t) 0x00000000ffffffffULL),
					typnull);
				j = binop(AND,
					j,
					immed64u((p64_t) 0xffffffff00000000ULL),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 32ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 64:
				i = binop(AND,
					  j,
					  (bpf==64)?
						immed64u((p64_t)
							0x000000000000ffffULL):
					  immedu(cvt1x64uto2x64u(
						(p64_t)0x000000000000ffffULL)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==64)?
					      immed64u((p64_t)
							0x00000000ffff0000ULL):
					      immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffff0000ULL)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 16ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 32:
				i = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x000000ffULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x000000ff)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x0000ff00ULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x0000ff00)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 8ULL),
					  typnull);
				j = binop(OR, i, j, typnull);


				i = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x000f)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x00f0)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 4ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
			}
			k = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);


			/* low bit */
			j = arg0;
			switch (bitsperfrag()) {
			case 128:
				i = binop(AND,
					j,
					immed64u((p64_t) 0x00000000ffffffffULL),
					typnull);
				j = binop(AND,
					j,
					immed64u((p64_t) 0xffffffff00000000ULL),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 32ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 64:
				i = binop(AND,
					  j,
					  (bpf==64)?
						immed64u((p64_t)
							0x000000000000ffffULL):
					  immedu(cvt1x64uto2x64u(
						(p64_t)0x000000000000ffffULL)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==64)?
					      immed64u((p64_t)
							0x00000000ffff0000ULL):
					      immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffff0000ULL)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 16ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 32:
				i = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x000000ffULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x000000ff)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t) 0x0000ff00ULL):
					      immedu(cvt1x32uto4x32u(
						(p32_t)0x0000ff00)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 8ULL),
					  typnull);
				j = binop(OR, i, j, typnull);


				i = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x000f)),
					  typnull);
				j = binop(AND,
					  j,
					  immedu(cvt1x16uto8x16u(0x00f0)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 4ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
			}

		return(binop(OR, i, k, typnull));
	    }

	case INTRLVHIGH: /* 16u */
	    /* 8-bit to 16-bit interleave of 8-bit fields */
	    if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
		break;		/* vmrghb for AltiVec */
	    } else {
		/* sneaky way to reuse INTRLVLOW code... */
		unsigned long long bpf_2 =
			(unsigned long long) bitsperfrag()/2ULL;
		i = binop(SHR, arg0, immed64u((p64_t) bpf_2), typnull);
		j = binop(SHR, arg1, immed64u((p64_t) bpf_2), typnull);
		return(binop(INTRLVLOW, i, j, typ16u));
	    }

	case INTRLVODD: /* 16u */
		/* Interleave 16-bit fields */
		if (optcpu & CPU_MAX) {
			break;
		} else if (optcpu & CPU_AltiVec) {
#ifdef TRINOPS
			p128_t tmp;
			tmp.uq[1] = 0x1011000114150405ULL;
			tmp.uq[0] = 0x181908091C1D0C0DULL;
			return(trinop(TPERM,
				      arg0,
				      arg1,
				      immed128u(tmp),
				      typ16u));
#else
			/* For now, implement in target header file */
			break;
#endif
		}
		bug("INTRLVODD not available on this target");
		break;

	case INTRLVEVEN: /* 16u */
		/* Interleave 16-bit fields */
		if (optcpu & CPU_MAX) {
			break;
		} else if (optcpu & CPU_AltiVec) {
#ifdef TRINOPS
			p128_t tmp;
			tmp.uq[1] = 0x1213020316170607ULL;
			tmp.uq[0] = 0x1A1B0A0B1E1F0E0FULL;
			return(trinop(TPERM,
				      arg0,
				      arg1,
				      immed128u(tmp),
				      typ16u));
#else
			/* For now, implement in target header file */
			break;
#endif
		}
		bug("INTRLVEVEN not available on this target");
		break;

	case PERM: /* 16u */
		/* Arbitrary permutation of 16-bit fields */
		if ((optcpu & CPU_MAX) || (optcpu & CPU_athlon)) {
			if (tup[arg1].op == NUM)
				break;
			bug("PERM index restricted to constants");
			break;
		}

		bug("PERM not available on this target");
		break;

	case REPL: /* 16u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			bug("REPL16u not available on this target");
			break;
		}

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop16u op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}


int
binop16us(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 16-bit unsigned field binary ops with saturation */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i, j, k, l;

	switch (op) {
	case ADD: /* 16us */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* use implicit spacer technique */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x16uto8x16u(0x7fff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x16uto8x16u(0x7fff)),
				  typnull);
			i = binop(ADD, i, j, typ32u);
			j = binop(XOR, arg0, arg1, typnull);  /* "propagate" */
			k = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);
			i = binop(XOR, i, k, typnull);

			/* Calulate overflow */
			k = binop(AND, arg0, arg1, typnull);	/* generate */
			l = k;

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			k = binop(SHL, k, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, k, j, typnull);
			l = binop(OR, l, k, typnull);

			l = binop(AND,
				  l,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);

			/* ...and create a saturation mask */
			k = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, k, l, typnull);

			/* Clobber the calculated value with the max on
			   overflow
			*/
			return(binop(OR, l, i, typnull));
		}

	case SUB: /* 16us */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* Do subtract as usual, using implicit spacer
			   technique */
			i = binop(OR,
				  arg0,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x16uto8x16u(0x7fff)),
				  typnull);
			i = binop(SUB, i, j, typ32u);
			j = binop(XOR, arg0, arg1, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);
			j = binop(XOR,
				  j,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);
			i = binop(XOR, i, j, typnull);

			/* Create a saturation mask */
			j = binop(GT, arg1, arg0, typ16u);
			j = unop(NOT, j, typnull);

			/* Clobber the calculated value with 0 on negative
			   overflow */
			return(binop(AND, i, j, typnull));
		}

	case MUL: /* 16us */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			/* Do unsigned mul */
			i = binop(MUL, arg0, arg1, typ16u);

			/* Do high mul, and convert to saturation mask */
			j = binop(MULH, arg0, arg1, typ16u);
			k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
			j = binop(OR, j, k, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHR, j, immed64u((p64_t) 4ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHR, j, immed64u((p64_t) 8ULL), typnull);
			j = binop(OR, j, k, typnull);

			return(binop(OR, i, j, typnull));
		} else {
			/* Use 32-bit MULs...
			   unsigned interleave, mul, pack with saturation
			   sequence
			*/
			i = binop(INTRLVLOW,
				  arg0,
				  immed64u((p64_t) 0ULL),
				  typ32u);
			j = binop(INTRLVLOW,
				  arg1,
				  immed64u((p64_t) 0ULL),
				  typ32u);
			k = binop(MUL, i, j, typ32u);

			i = binop(INTRLVHIGH,
				  arg0,
				  immed64u((p64_t) 0ULL),
				  typ32u);
			j = binop(INTRLVHIGH,
				  arg1,
				  immed64u((p64_t) 0ULL),
				  typ32u);
			i = binop(MUL, i, j, typ32u);

			return(binop(PACK, k, i, typ16us));
		}

	case DIV: /* 16us */
		/* This method will bomb on div by 0 instead of saturating,
		   which is, of course, the point of doing a sat DIV. */

		/* Do divide as usual */
		i = binop(DIV, arg0, arg1, typ16u);

		/* Generate a saturation mask */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(AND, j, k, typnull);

		j = binop(AND, j, immedu(cvt1x16uto8x16u(0x8000)), typnull);

		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(OR, k, j, typnull);

		/* Clobber the calculated value with MAX if arg1==0 */
		return(binop(OR, i, j, typnull));

	case MOD: /* 16us */
		/* This method will bomb on mod by 0 instead of saturating,
		   which is, of course, the point of doing a sat MOD. */

		/* Do modulus as usual */
		i = binop(MOD, arg0, arg1, typ16u);

		/* Generate a saturation mask */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(AND, j, k, typnull);

		j = binop(AND, j, immedu(cvt1x16uto8x16u(0x8000)), typnull);

		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(OR, k, j, typnull);

		/* Clobber the calculated value with MAX if arg1==0 */
		return(binop(OR, i, j, typnull));

	case MIN: /* 16us */
	case MAX: /* 16us */
	case AVG: /* 16us */

	case AND: /* 16us */
	case ANDN: /* 16us */
	case OR: /* 16us */
	case XOR: /* 16us */

	case LAND: /* 16us */
	case LOR: /* 16us */

	case EQ: /* 16us */
	case NE: /* 16us */
	case GT: /* 16us */
	case LT: /* 16us */
	case LE: /* 16us */
	case GE: /* 16us */
		/* These are all the same as unsigned unsaturated */
		return(binop(op, arg0, arg1, typ16u));

	case SHL: /* 16us */
	case SHR: /* 16us */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			break;
		} else {
			info(0, "SHL16us not implemented");
			break;
		}

	case PACK: /* 16us */
		/* 32us -> 16us */
		/* PACK (with unsigned saturation) each 32-bit field value to
		   16 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			/* Saturate arg0 */
			i = binop(SHR, arg0, immed64u((p64_t) 8ULL), typnull);
			i = binop(OR, arg0, i, typnull);
			j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
			j = binop(OR, i, j, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0x00010000)),
				  typnull);
			j = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			i = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
			i = binop(OR, i, j, typnull);
			j = binop(SHR, i, immed64u((p64_t) 8ULL), typnull);
			i = binop(OR, i, j, typnull);
			i = binop(OR, arg0, i, typnull);

			/* Saturate arg1 */
			k = binop(SHR, arg1, immed64u((p64_t) 8ULL), typnull);
			k = binop(OR, arg1, k, typnull);
			l = binop(SHR, k, immed64u((p64_t) 4ULL), typnull);
			k = binop(OR, k, l, typnull);
			l = binop(SHR, k, immed64u((p64_t) 2ULL), typnull);
			k = binop(OR, k, l, typnull);
			l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, l, typnull);
			l = binop(AND,
				  l,
				  immedu(cvt1x32uto4x32u((p32_t) 0x00010000)),
				  typnull);
			l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
			k = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
			k = binop(OR, k, l, typnull);
			l = binop(SHR, k, immed64u((p64_t) 2ULL), typnull);
			k = binop(OR, k, l, typnull);
			l = binop(SHR, k, immed64u((p64_t) 4ULL), typnull);
			k = binop(OR, k, l, typnull);
			l = binop(SHR, k, immed64u((p64_t) 8ULL), typnull);
			k = binop(OR, k, l, typnull);
			k = binop(OR, arg1, k, typnull);

			/* Combine packed arg0 and arg1 in result */
			return(binop(PACK, i, k, typ16u));
		}

	case INTRLVLOW: /* 16us */
	case INTRLVHIGH: /* 16us */
		/* 8-bit to 16-bit interleave of 8-bit fields */
		/* The same as unsigned */
		return(binop(op, arg0, arg1, typ16u));

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop16us op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

int
binop16(int op,
int arg0,
int arg1)
{
	/* 16-bit signed field binary ops */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.

	   op   - operation token (ADD, MULH, SHR, ...)
	   arg0 - tuple index of the left operand
	   arg1 - tuple index of the right operand

	   Returns the tuple index of the generated replacement tree.  Cases
	   that "break" out of the switch return -1; these are the cases where
	   the selected target (optcpu) is tested and found not to need an
	   emulation sequence (presumably the caller then emits the op
	   directly -- confirm against binop()).

	   Conventions used below (all visible in this function):
	     - typnull ops are plain bitwise ops on the whole register.
	     - ANDN(a, b) is (NOT a) AND b; compare the ANDN and NOT/AND
	       variants of MIN and MAX.
	     - comparison results are used as per-field masks.
	*/

	register int i, j, k, l;

	switch (op) {
	case ADD: /* 16s */
	case SUB: /* 16s */
	case MUL: /* 16s */

	case EQ: /* 16s */
	case NE: /* 16s */

	case LAND: /* 16s */
	case LOR: /* 16s */

	case AND: /* 16s */
	case ANDN: /* 16s */
	case OR: /* 16s */
	case XOR: /* 16s */

	case SHL: /* 16s */
		/* the same thing signed or unsigned */
		return(binop(op, arg0, arg1, typ16u));

	case MULH: /* 16s */
		if (optcpu & CPU_MMX) {
			break;
		} else if (optcpu & CPU_AltiVec) {
			/* Have even and odd 16->32 muls */
			/* Even products: move the high half down */
			i = binop(MULEVEN, arg0, arg1, typ16);
			i = binop(SHR,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t)16)),
				  typ32u);

			/* Odd products: keep the high half in place */
			j = binop(MULODD, arg0, arg1, typ16);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t)0xffff0000)),
				  typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* Emulate using two 32-bit signed MULs */
			/* Even fields: sign-extend the low half of each
			   32-bit field, multiply, and move the high half of
			   the product down into the even 16-bit field. */
			i = binop(SHL, arg0, immed64u((p64_t) 16ULL), typnull);
			i = binop(SHR, i, immed64u((p64_t) 16ULL), typ32);
			j = binop(SHL, arg1, immed64u((p64_t) 16ULL), typnull);
			j = binop(SHR, j, immed64u((p64_t) 16ULL), typ32);
			k = binop(MUL, i, j, typ32);
			k = binop(AND,
				  k,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);
			k = binop(SHR, k, immed64u((p64_t) 16ULL), typnull);

			/* Odd fields */
			/* HEREHERE - This will be a problem...
			   For some reason, if I do the following as a single
			   SHR by 16, it gets converted to a logical shift, but
			   if I split it into two non-16 bit shifts, they both
			   stay arithmetic.  I'll figure this out later.
			   (Applies to both the arg0 and the arg1 shifts.)
			*/
			i = binop(SHR, arg0, immed64u((p64_t) 15ULL), typ32);
			i = binop(SHR, i, immed64u((p64_t) 1ULL), typ32);
			j = binop(SHR, arg1, immed64u((p64_t) 15ULL), typ32);
			j = binop(SHR, j, immed64u((p64_t) 1ULL), typ32);
			l = binop(MUL, i, j, typ32);
			l = binop(AND,
				  l,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);

			return (binop(OR, k, l, typnull));
		}

	case MULEVEN: /* 16s */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			bug("MULEVEN16 not supported on this target");
			/* Should bug() ever return, behave like the default
			   case instead of falling into MULODD/GT. */
			break;
		}

	case MULODD: /* 16s */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			bug("MULODD16 not supported on this target");
			/* Should bug() ever return, behave like the default
			   case instead of falling into GT. */
			break;
		}

	case GT: /* 16s */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* use 32-bit GT */
			/* Compare even fields (sign-extended to 32 bits) */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			i = binop(SHL, i, immed64u((p64_t) 16ULL), typnull);
			i = binop(SHR, i, immed64u((p64_t) 16ULL), typ32);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			j = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
			j = binop(SHR, j, immed64u((p64_t) 16ULL), typ32);
			i = binop(GT, i, j, typ32);
			i = binop(AND,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);

			/* Compare odd fields: with the even halves cleared,
			   the 32-bit compare is decided by the odd halves */
			j = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);
			k = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);
			j = binop(GT, j, k, typ32);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		}

	case DIV: /* 16s */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		#ifdef NOTDEFD
		  } else if (optcpu & CPU_AltiVec) {
			/* use 32-bit divides */

			/* Divide low fields (sign-extended to 32 bits) */
			i = unop(UNPACKL, arg0, typ32);
			j = unop(UNPACKL, arg1, typ32);
			i = binop(DIV, i, j, typ32);

			/* Divide high fields */
			j = unop(UNPACKH, arg0, typ32);
			k = unop(UNPACKH, arg1, typ32);
			j = binop(DIV, j, k, typ32);

			/* Combine */
			return(binop(PACK, i, j, typ16u));
		#endif
		} else {
			/* use 32-bit divides */

			/* Divide even fields (sign-extended to 32 bits) */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			i = binop(SHL, i, immed64u((p64_t) 16ULL), typnull);
			i = binop(SHR, i, immed64u((p64_t) 16ULL), typ32);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			j = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
			j = binop(SHR, j, immed64u((p64_t) 16ULL), typ32);
			i = binop(DIV, i, j, typ32);
			i = binop(AND,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);

			/* Divide odd fields */
			j = binop(SHR, arg0, immed64u((p64_t) 16ULL), typ32);
			k = binop(SHR, arg1, immed64u((p64_t) 16ULL), typ32);
			j = binop(DIV, j, k, typ32);
			j = binop(SHL, j, immed64u((p64_t) 16ULL), typ32);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		}

	case MOD: /* 16s */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		#ifdef NOTDEFD
		  } else if (optcpu & CPU_AltiVec) {
			/* use 32-bit modulus */

			/* Take mod of low fields (sign-extended to 32 bits) */
			i = unop(UNPACKL, arg0, typ32);
			j = unop(UNPACKL, arg1, typ32);
			i = binop(MOD, i, j, typ32);

			/* Take mod of high fields */
			j = unop(UNPACKH, arg0, typ32);
			k = unop(UNPACKH, arg1, typ32);
			j = binop(MOD, j, k, typ32);

			/* Combine */
			return(binop(PACK, i, j, typ16u));
		#endif
		} else {
			/* use 32-bit modulus */

			/* Take mod of even fields (sign-extended to 32 bits) */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			i = binop(SHL, i, immed64u((p64_t) 16ULL), typnull);
			i = binop(SHR, i, immed64u((p64_t) 16ULL), typ32);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);
			j = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
			j = binop(SHR, j, immed64u((p64_t) 16ULL), typ32);
			i = binop(MOD, i, j, typ32);
			i = binop(AND,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				  typnull);

			/* Take mod of odd fields */
			j = binop(SHR, arg0, immed64u((p64_t) 16ULL), typ32);
			k = binop(SHR, arg1, immed64u((p64_t) 16ULL), typ32);
			j = binop(MOD, j, k, typ32);
			j = binop(SHL, j, immed64u((p64_t) 16ULL), typ32);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff0000)),
				  typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		}

	case AVG: /* 16s */
		if ((optcpu & CPU_MAX) || (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* Average rounds up */
			/* Halve each operand first so the sum cannot
			   overflow the field */
			i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typ16);
			j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typ16);
			i = binop(ADD, i, j, typ16);

			/* Calculate and add rounding bit */
			j = binop(OR, arg0, arg1, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0x0001)),
				  typnull);
			i = binop(ADD, i, j, typ16);
			return(i);
		}

	case MIN: /* 16s */
		if ((optcpu & CPU_athlon) || (optcpu & CPU_AltiVec)) {
			break;
		} else if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			/* use GT: pick arg1 where arg0 > arg1, else arg0 */
			i = binop(GT, arg0, arg1, typ16);
			j = binop(AND, i, arg1, typnull);
			i = binop(ANDN, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* use GT; no ANDN here, so spell it as NOT + AND */
			i = binop(GT, arg0, arg1, typ16);
			j = binop(AND, i, arg1, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		}

	case MAX: /* 16s */
		if ((optcpu & CPU_athlon) || (optcpu & CPU_AltiVec)) {
			break;
		} else if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			/* use GT: pick arg0 where arg0 > arg1, else arg1 */
			i = binop(GT, arg0, arg1, typ16);
			j = binop(AND, i, arg0, typnull);
			i = binop(ANDN, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* use GT; no ANDN here, so spell it as NOT + AND */
			i = binop(GT, arg0, arg1, typ16);
			j = binop(AND, i, arg0, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		}

	case LT: /* 16s */
		/* x LT y is y GT x */
		return(binop(GT, arg1, arg0, typ16));

	case LE: /* 16s */
		/* x LE y is y GE x */
		return(binop(GE, arg1, arg0, typ16));

	case GE: /* 16s */
		/* the obvious hack is that x GE y is (x EQ y) OR (x GT y) */
		i = binop(GT, arg0, arg1, typ16);
		j = binop(EQ, arg0, arg1, typ16);
		return(binop(OR, i, j, typnull));

	case SHR: /* 16s */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		} else {
			unsigned long long count;

			arg1 = shiftconst(arg1, typ16u);
			if (tup[arg1].op == NUM) {
				/* Shift by a constant is easy: do the shift
				   unsigned, then OR copies of the sign bit
				   into the vacated high bits. */

				/* i = per-field sign mask (EQ result for
				   fields whose MSb is set) */
				i = binop(AND,
					  arg0,
					  immedu(cvt1x16uto8x16u(0x8000)),
					  typnull);
				i = binop(EQ,
					  i,
					  immedu(cvt1x16uto8x16u(0x8000)),
					  typ16u);

				count = tup[arg1].immed.q[0];
				if (count == 0ULL) {
					/* Nothing to shift */
					i = arg0;
				} else if (count < 16ULL) {
					/* (0xffff >> count) covers the bits
					   that the unsigned shift keeps; the
					   ANDN leaves the sign mask only in
					   the top "count" bits of each
					   field. */
					i = binop(ANDN,
						  immedu(cvt1x16uto8x16u(
							0xffff >> (int) count)),
						  i,
						  typnull);
					i = binop(OR,
						  i,
						  binop(SHR, arg0, arg1, typ16u),
						  typnull);
				}
				/* count >= 16: every bit is a copy of the
				   sign bit, which is the mask already in i */
				return(i);
			}
			error("shift right of signed 16-bit field values "
			      "only implemented for a constant shift");
			return(immed64u((p64_t) 0ULL));
		}

	case PACK: /* 16s */
	case INTRLVLOW: /* 16s */
	case INTRLVHIGH: /* 8-bit to 16-bit interleave of 8-bit fields */
	case INTRLVODD: /* 16s */
	case INTRLVEVEN: /* 16s */
	case PERM: /* 16s */
		/* The same as unsigned */
		return(binop(op, arg0, arg1, typ16u));

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop16 op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

int
binop16ss(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 16-bit signed field binary ops with saturation */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.

	   op        - operation token (ADD, SUB, MUL, DIV, ...)
	   arg0      - tuple index of the left operand
	   arg1      - tuple index of the right operand
	   decl_bits - not referenced anywhere in this function

	   Returns the tuple index of the generated replacement tree.  Cases
	   that "break" out of the switch return -1 (presumably meaning the
	   caller emits the op directly for this target -- confirm against
	   binop()).

	   Conventions used below:
	     - typnull ops are plain bitwise ops on the whole register.
	     - ANDN(a, b) is used interchangeably with NOT(a) AND b (see the
	       paired MMX/MAX and generic branches in MUL, DIV and MOD).
	     - In the pattern comments, T/t is a bit that is set when the
	       saturation condition holds, F/f its complement, X don't-care.
	     - PosSat is 0x7fff and NegSat is 0x8000 in each 16-bit field.
	*/

	register int i, j, k, l, m, n;

	switch (op) {
	case ADD: /* 16ss */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* Do the signed add */
			i = binop(ADD, arg0, arg1, typ16);

			/* Correct for positive saturation */
			/* j gets MSb set where both operands are non-negative
			   but the sum came out negative */
			j = binop(OR, arg0, arg1, typnull);
			j = unop(NOT, j, typnull);	/* tX..X if both pos */
			j = binop(AND, j, i, typnull);	/* MSb(sum) = 1 */
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);

			/* Smear the flag bit down over the 15 lower bits */
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, k, l, typnull);		/* T...T */

			/* Clobber the calculated value with PosSat in the
			   fields flagged in j: set every bit, then clear the
			   MSb */
			i = binop(OR, i, l, typnull);		/* T...T */
			j = unop(NOT, j, typnull);		/* f1..1 */
			i = binop(AND, i, j, typnull);


			/* Correct for negative saturation */
			/* j gets MSb set where both operands are negative but
			   the (already PosSat-corrected) sum is non-negative */
			j = binop(AND, arg0, arg1, typnull);
							/* tX..X if both neg */
			k = unop(NOT, i, typnull);
			j = binop(AND, j, k, typnull);	/* MSb(sum) = 0 */
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);

			/* Smear the flag bit down over the 15 lower bits */
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, k, l, typnull);		/* T...T */

			/* Clobber the calculated value with NegSat in the
			   fields flagged in j: clear every bit, then set the
			   MSb */
			l = unop(NOT, l, typnull);		/* F...F */
			i = binop(AND, i, l, typnull);
			i = binop(OR, i, j, typnull);

			return(i);
		}
		
	case SUB: /* 16ss */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* Do the signed sub */
			i = binop(SUB, arg0, arg1, typ16);

			/* Correct for positive saturation */
			/* Overflow is only possible when the operand signs
			   differ; m has its MSb set in exactly those fields */
			m = binop(XOR, arg0, arg1, typnull);
							/* tX..X if mixed */
			j = binop(AND, m, arg1, typnull);
						/* tX..X if arg0+ & arg1- */
			j = binop(AND, j, i, typnull);	/* MSb(diff) = 1 */
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);

			/* Smear the flag bit down over the 15 lower bits */
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, k, l, typnull);		/* T...T */

			/* Clobber the calculated value with PosSat in the
			   fields flagged in j: set every bit, then clear the
			   MSb */
			i = binop(OR, i, l, typnull);		/* T...T */
			j = unop(NOT, j, typnull);		/* f1..1 */
			i = binop(AND, i, j, typnull);


			/* Correct for negative saturation */
			j = binop(AND, m, arg0, typnull);
						/* tX..X if arg0- & arg1+ */
			k = unop(NOT, i, typnull);
			j = binop(AND, j, k, typnull);	/* MSb(diff) = 0 */
			j = binop(AND,
				  j,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);

			/* Smear the flag bit down over the 15 lower bits */
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, k, l, typnull);		/* T...T */

			/* Clobber the calculated value with NegSat in the
			   fields flagged in j: clear every bit, then set the
			   MSb */
			l = unop(NOT, l, typnull);		/* F...F */
			i = binop(AND, i, l, typnull);
			i = binop(OR, i, j, typnull);

			return(i);
		}

	case EQ: /* 16ss */
	case NE: /* 16ss */

	case LAND: /* 16ss */
	case LOR: /* 16ss */

	case AND: /* 16ss */
	case ANDN: /* 16ss */
	case OR: /* 16ss */
	case XOR: /* 16ss */
		/* These are the same as unsigned unsaturated */
		return(binop(op, arg0, arg1, typ16u));

	case AVG: /* 16ss */
	case MIN: /* 16ss */
	case MAX: /* 16ss */

	case LT: /* 16ss */
	case LE: /* 16ss */
	case GE: /* 16ss */
	case GT: /* 16ss */
		/* These are the same as signed unsaturated */
		return(binop(op, arg0, arg1, typ16));

	case MUL: /* 16ss */
		/* We want this (in the generic path below, i is the low-half
		   product from MUL and j is the high-half product from MULH):
			if (MSb of j)=1: -- Should be negative
			    if ((j != 0xffff) || (MSb of i)=0) return 0x8000;
			    else return low_word;
			else -- Should be positive
			    if ((j != 0x0000) || (MSb of i)=1) return 0x7fff;
			    else return low_word;
		*/

		if (optcpu & CPU_MAX) {
			/* Multiply and saturate even fields using 32 bit ops */
			/* Sign-extend the even (low) halves to 32 bits */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t)0x0000ffff)),
				  typnull);
			i = binop(SHL, i, immed64u((p64_t) 16ULL), typ32u);
			i = binop(SHR, i, immed64u((p64_t) 16ULL), typ32);

			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t)0x0000ffff)),
				  typnull);
			j = binop(SHL, j, immed64u((p64_t) 16ULL), typ32u);
			j = binop(SHR, j, immed64u((p64_t) 16ULL), typ32);

			/* Clamp the 32-bit product to [0xffff8000, 0x7fff] */
			i = binop(MUL, i, j, typ32);
			i = binop(MIN,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t)0x00007fff)),
				  typ32);
			i = binop(MAX,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t)0xffff8000)),
				  typ32);
			k = binop(AND,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t)0x0000ffff)),
				  typnull);


			/* Multiply and saturate odd fields using 32 bit ops */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t)0xffff0000)),
				  typnull);
			i = binop(SHR, i, immed64u((p64_t) 16ULL), typ32);

			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t)0xffff0000)),
				  typnull);
			j = binop(SHR, j, immed64u((p64_t) 16ULL), typ32);

			i = binop(MUL, i, j, typ32);
			i = binop(MIN,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t)0x00007fff)),
				  typ32);
			i = binop(MAX,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t)0xffff8000)),
				  typ32);
			/* Move the clamped result back up to the odd half */
			i = binop(SHL, i, immed64u((p64_t) 16ULL), typ32u);

			return (binop(OR, i, k, typnull));
		} else if (optcpu & CPU_AltiVec) {
			/* Only have 16-bit muls...
			   sign-extend halves, mul, pack with saturation
			   sequence
			*/
			i = unop(UNPACKL, arg0, typ32);
			j = unop(UNPACKL, arg1, typ32);
			k = binop(MUL, i, j, typ32);

			i = unop(UNPACKH, arg0, typ32);
			j = unop(UNPACKH, arg1, typ32);
			i = binop(MUL, i, j, typ32);

			return(binop(PACK, k, i, typ16ss));
		} else {
			/* NOTE: CPU_MAX was handled by the first branch
			   above, so the (optcpu & CPU_MAX) tests inside this
			   branch only matter when CPU_MMX is set. */

			/* Do signed mul */
			i = binop(MUL, arg0, arg1, typ16);

			/* Do high mul, and convert to saturation mask */
			j = binop(MULH, arg0, arg1, typ16);

			/* Make MSb(k)=1 if NegSat */
			/* AND-reduce j toward its MSb (SHL/AND by 8,4,2,1) */
			k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
			l = binop(AND, j, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(AND, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(AND, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, l, k, typnull);

			/* k = NOT(reduced AND i) AND j, in both variants */
			if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
				k = binop(AND, k, i, typnull);
				k = binop(ANDN, k, j, typnull);
			} else {
				k = binop(AND, i, k, typnull);
				k = unop(NOT, k, typnull);
				k = binop(AND, k, j, typnull);
			}

			/* Keep only the MSb flag of each field */
			k = binop(AND,
				  k,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);

			/* Form and apply a mask */
			/* Smear the flag over the field, clear the flagged
			   fields of the product, then OR in the flag bit */
			l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
			m = binop(OR, k, l, typnull);
			l = binop(SHR, m, immed64u((p64_t) 2ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 4ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, l, m, typnull);

			if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
				n = binop(ANDN, l, i, typnull);
			} else {
				l = unop(NOT, l, typnull);
				n = binop(AND, l, i, typnull);
			}
			n = binop(OR, k, n, typnull);


			/* Make MSb(k)=1 if PosSat */
			/* OR-reduce j toward its MSb (SHL/OR by 8,4,2,1) */
			k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, j, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
			k = binop(OR, l, k, typnull);

			/* k = NOT(j) AND (reduced OR i), in both variants.
			   The generic variant overwrites j with NOT(j); j is
			   not read again after this point. */
			k = binop(OR, k, i, typnull);
			if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
				k = binop(ANDN, j, k, typnull);
			} else {
				j = unop(NOT, j, typnull);
				k = binop(AND, j, k, typnull);
			}

			/* Keep only the MSb flag of each field */
			k = binop(AND,
				  k,
				  immedu(cvt1x16uto8x16u(0x8000)),
				  typnull);

			/* Form and apply a mask */
			/* Smear the flag over the field, set every bit of
			   the flagged fields, then clear their MSb */
			l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
			m = binop(OR, k, l, typnull);
			l = binop(SHR, m, immed64u((p64_t) 2ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 4ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, l, m, typnull);

			i = binop(OR, l, n, typnull);
			if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
				i = binop(ANDN, k, i, typnull);
			} else {
				k = unop(NOT, k, typnull);
				i = binop(AND, k, i, typnull);
			}
			return(i);
		}

	case DIV: /* 16ss */
		/* This method will bomb on div by 0 instead of saturating,
		   which is, of course, the point of doing a sat DIV. */
		/* NOTE: no target test here; this sequence is generated for
		   every optcpu. */

		/* Modify 16-bit divides */
		i = binop(DIV, arg0, arg1, typ16);

		/* Correct for positive saturation */
		/* Get reduce-NOT of arg1 */
		/* AND-reduction of NOT(arg1): MSb(j) is set iff every bit of
		   the arg1 field is 0 */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(AND, j, k, typnull);

		/* Get reduce-AND of arg1.  (This was labelled "reduceANY",
		   but the combining op is AND: MSb(k) is set iff every bit
		   of the arg1 field is 1, i.e. arg1 == -1.) */
		k = binop(SHL, arg1, immed64u((p64_t) 1ULL), typnull);
		k = binop(AND, arg1, k, typnull);
		l = binop(SHL, k, immed64u((p64_t) 2ULL), typnull);
		k = binop(AND, k, l, typnull); 
		l = binop(SHL, k, immed64u((p64_t) 4ULL), typnull);
		k = binop(AND, k, l, typnull); 
		l = binop(SHL, k, immed64u((p64_t) 8ULL), typnull);
		k = binop(AND, k, l, typnull);

		/* Get a~b~c~d~e~f~g~h... (i.e. arg0==10000000...) */
		/* AND-reduce NOT(arg0) over the low 15 bits (shifts of
		   1,2,4,7), move the result up to the MSb, then require
		   arg0's own MSb to be set */
		l = unop(NOT, arg0, typnull);
		m = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
		l = binop(AND, l, m, typnull);
		m = binop(SHL, l, immed64u((p64_t) 2ULL), typnull);
		l = binop(AND, l, m, typnull);
		m = binop(SHL, l, immed64u((p64_t) 4ULL), typnull);
		l = binop(AND, l, m, typnull);
		m = binop(SHL, l, immed64u((p64_t) 7ULL), typnull);
		l = binop(AND, l, m, typnull);
		l = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
		l = binop(AND, l, arg0, typnull);

		/* MSb(l): arg0 == 0x8000 and arg1 == -1 (quotient overflows) */
		l = binop(AND, k, l, typnull);

		/* MSb(k): arg0 non-negative and arg1 == 0 */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			k = binop(ANDN, arg0, j, typnull);
		} else {
			k = unop(NOT, arg0, typnull);
			k = binop(AND, k, j, typnull);
		} /* ~a(tuple j) */
		k = binop(OR, l, k, typnull);
		k = binop(AND, k, immedu(cvt1x16uto8x16u(0x8000)), typnull);
		/* k has pattern T0000000,00000000 */

		/* Apply the mask to the MSb */
		l = unop(NOT, k, typnull);
		i = binop(AND, i, l, typnull);

		/* Convert k's T0000000,00000000 pattern to a 0TTTTTTT,TTTTTTTT
		   pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* 0T000000*/
		l = binop(OR, k, l, typnull);			/* TT000000*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* 00TT0000*/
		l = binop(OR, l, m, typnull);			/* TTTT0000*/
		m = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
								/* 0000TTTT*/
		l = binop(OR, l, m, typnull);			/* TTTTTTTT*/
		m = binop(SHR, l, immed64u((p64_t) 7ULL), typnull);
							/* 0000000T,TTTTTTT0 */
		l = binop(OR, l, m, typnull);		/* TTTTTTTT,TTTTTTT0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
							/* 0TTTTTTT,TTTTTTTT */

		/* Apply the mask to the LSbs */
		i = binop(OR, i, l, typnull);


		/* Correct for negative saturation */
		/* MSb(k): arg0 negative and arg1 == 0 */
		k = binop(AND, arg0, j, typnull);	/* a(tuple j) */
		k = binop(AND, k, immedu(cvt1x16uto8x16u(0x8000)), typnull);
		/* k has pattern T0000000,00000000 */

		/* Apply the mask to the MSb */
		i = binop(OR, i, k, typnull);

		/* Convert k's T0000000,00000000 pattern to a 1FFFFFFF,FFFFFFFF
		   pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* 0T000000*/
		l = binop(OR, k, l, typnull);			/* TT000000*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* 00TT0000*/
		l = binop(OR, l, m, typnull);			/* TTTT0000*/
		m = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
								/* 0000TTTT*/
		l = binop(OR, l, m, typnull);			/* TTTTTTTT*/
		m = binop(SHR, l, immed64u((p64_t) 7ULL), typnull);
							/* 0000000T,TTTTTTT0 */
		l = binop(OR, l, m, typnull);		/* TTTTTTTT,TTTTTTT0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
							/* 0TTTTTTT,TTTTTTTT */
		l = unop(NOT, l, typnull);		/* 1FFFFFFF,FFFFFFFF */

		/* Apply the mask to the LSbs */
		i = binop(AND, i, l, typnull);

		return(i);

	case MOD: /* 16ss */
		/* This method will bomb on mod by 0 instead of saturating,
		   which is, of course, the point of doing a sat MOD. */
		/* NOTE: no target test here; this sequence is generated for
		   every optcpu. */

		/* Modify 16-bit modulus */
		i = binop(MOD, arg0, arg1, typ16);

		/* Get reduce-NOT of arg1 */
		/* AND-reduction of NOT(arg1): MSb(j) is set iff every bit of
		   the arg1 field is 0 */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(AND, j, k, typnull);

		/* Correct for positive saturation */
		/* MSb(k): arg0 non-negative and arg1 == 0 */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			k = binop(ANDN, arg0, j, typnull);
		} else {
			k = unop(NOT, arg0, typnull);
			k = binop(AND, k, j, typnull);
		} /* ~a(tuple j) */

		k = binop(AND, k, immedu(cvt1x16uto8x16u(0x8000)), typnull);
		/* k has pattern T0000000,00000000 */

		/* Apply the mask to the MSb */
		l = unop(NOT, k, typnull);
		i = binop(AND, i, l, typnull);

		/* Convert k's T0000000,00000000 pattern to a 0TTTTTTT,TTTTTTTT
		   pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* 0T000000*/
		l = binop(OR, k, l, typnull);			/* TT000000*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* 00TT0000*/
		l = binop(OR, l, m, typnull);			/* TTTT0000*/
		m = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
								/* 0000TTTT*/
		l = binop(OR, l, m, typnull);			/* TTTTTTTT*/
		m = binop(SHR, l, immed64u((p64_t) 7ULL), typnull);
							/* 0000000T,TTTTTTT0 */
		l = binop(OR, l, m, typnull);		/* TTTTTTTT,TTTTTTT0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
							/* 0TTTTTTT,TTTTTTTT */

		/* Apply the mask to the LSbs */
		i = binop(OR, i, l, typnull);


		/* Correct for negative saturation */
		/* MSb(k): arg0 negative and arg1 == 0 */
		k = binop(AND, arg0, j, typnull);	/* a(tuple j) */
		k = binop(AND, k, immedu(cvt1x16uto8x16u(0x8000)), typnull);
		/* k has pattern T0000000,00000000 */

		/* Apply the mask to the MSb */
		i = binop(OR, i, k, typnull);

		/* Convert k's T0000000,00000000 pattern to a 1FFFFFFF,FFFFFFFF
		   pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* 0T000000*/
		l = binop(OR, k, l, typnull);			/* TT000000*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* 00TT0000*/
		l = binop(OR, l, m, typnull);			/* TTTT0000*/
		m = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
								/* 0000TTTT*/
		l = binop(OR, l, m, typnull);			/* TTTTTTTT*/
		m = binop(SHR, l, immed64u((p64_t) 7ULL), typnull);
							/* 0000000T,TTTTTTT0 */
		l = binop(OR, l, m, typnull);		/* TTTTTTTT,TTTTTTT0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
							/* 0TTTTTTT,TTTTTTTT */
		l = unop(NOT, l, typnull);		/* 1FFFFFFF,FFFFFFFF */

		/* Apply the mask to the LSbs */
		i = binop(AND, i, l, typnull);

		return(i);

	case SHL: /* 16ss */
		/* Not true? ... */
		/* the same thing signed or unsigned */
		/* NOTE(review): no saturation is applied to the shifted
		   result here, which is what the "Not true?" remark above
		   appears to question. */
		return(binop(op, arg0, arg1, typ16u));

	case SHR: /* 16ss */
		/* done in the obvious way */
		break;

	case INTRLVLOW: /* 16ss */
	case INTRLVHIGH: /* 16ss */
		/* 8-bit to 16-bit interleave of 8-bit fields */
		/* The same as unsigned */
		return(binop(op, arg0, arg1, typ16u));

	case PACK: /* 16ss */
		/* 32ss -> 16ss */
		/* PACK (with saturation) each 32-bit field value to
		   16 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* Saturate arg0 */
			/* Clamp each 32-bit field to [0xffff8000, 0x7fff] */
			i = binop(MIN,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0x00007fff)),
				  typ32);
			i = binop(MAX,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff8000)),
				  typ32);

			/* Saturate arg1 */
			j = binop(MIN,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x00007fff)),
				  typ32);
			j = binop(MAX,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0xffff8000)),
				  typ32);

			/* Pack as signed */
			return(binop(PACK, i, j, typ16));
		}

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop16ss op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}


int
binop32u(int op,
int arg0,
int arg1)
{
	/* 32-bit unsigned field binary ops */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i, j, k, l, m, n;

	switch (op) {
	case ADD: /* 32u */
		if ((optcpu & CPU_MMX) || (optcpu == GenericIA32) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* use implicit spacer technique */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0x7fffffff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x7fffffff)),
				  typnull);
			i = binop(ADD, i, j, typ64u);
			j = binop(XOR, arg0, arg1, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);
			return(binop(XOR, i, j, typnull));
		}

	case ADDH: /* 32u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			bug("ADDH32u not available on this target");
		}

	case SUB: /* 32u */
		if ((optcpu & CPU_MMX) || (optcpu == GenericIA32) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* use implicit spacer technique */
			i = binop(OR,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x7fffffff)),
				  typnull);
			i = binop(SUB, i, j, typ64u);
			j = binop(XOR, arg0, arg1, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);
			j = binop(XOR,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);
			return(binop(XOR, i, j, typnull));
		}

	case MUL: /* 32u */
		if (optcpu & CPU_AltiVec) {
			/* Use 16-bit unsigned MULs */
			/* Clearing the odd fields of arg0 will save us one
			   masking operation. */
			i = binop(AND,
				arg0,
				immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				typnull);
			j = binop(MUL, i, arg1, typ16u);
			k = binop(MULH, i, arg1, typ16u);
			k = binop(SHL, k, immed64u((p64_t) 16ULL), typnull);

			i = binop(SHL, i, immed64u((p64_t) 16ULL), typnull);
			i = binop(MUL, i, arg1, typ16u);
			k = binop(ADD, k, i, typ16u);

			/* Clearing the odd fields of arg1 will save us one
			   masking operation. */
			i = binop(AND,
				arg1,
				immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				typnull);
			i = binop(SHL, i, immed64u((p64_t) 16ULL), typnull);
			i = binop(MUL, arg0, i, typ16u);
			k = binop(ADD, k, i, typ16u);

			return(binop(OR, k, j, typnull));
		} else if ((optcpu & CPU_MMX) || (optcpu == GenericIA32)) {
			break;
		} else {
#ifdef NOTDEFD
			/* Use 64-bit MUL
			   unsigned interleave, mul, cast via modulation, pack
			   with saturation sequence
			*/
			i = binop(INTRLVLOW,
				  arg0,
				  immed64u((p64_t) 0ULL),
				  typ64u);
			j = binop(INTRLVLOW,
				  arg1,
				  immed64u((p64_t) 0ULL),
				  typ64u);
			k = binop(MUL, i, j, typ64u);
			k = binop(AND,
				k,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);

			i = binop(INTRLVHIGH,
				  arg0,
				  immed64u((p64_t) 0ULL),
				  typ64u);
			j = binop(INTRLVHIGH,
				  arg1,
				  immed64u((p64_t) 0ULL),
				  typ64u);
			i = binop(MUL, i, j, typ64u);
			i = binop(AND,
				i,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);

			return(binop(PACK, k, i, typ32us));
#else
			unsigned long long step;

			/* Perform a shift-add sequence */
			i = immed64u((p64_t) 0ULL);
			for (step=0ULL; step<32ULL; ++step)
			{
				j = binop(AND,
					  arg0,
					  immed64u((p64_t) (1ULL<<step)),
					  typnull);
				k = binop(NE,
					  j,
					  immed64u((p64_t) 0ULL),
					  typ32u);
				j = binop(AND, arg0, k, typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) step),
					  typ32u);
				i = binop(ADD, i, j, typ32u);
			}
			return(i);
#endif
		}

	case MULH: /* 32u */
		if (optcpu & CPU_MMX) {
#ifdef NOTDEFD
			/* I think this was correct, but it is too long */
			i = binop(SHR, arg0, immed64u((p64_t) 16ULL), typ32u);
			j = binop(SHR, arg1, immed64u((p64_t) 16ULL), typ32u);
			k = binop(AND,
				arg0,
				immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				typnull);
			l = binop(AND,
				arg1,
				immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				typnull);
			m = binop(MULH, k, l, typ16u);
			n = binop(MUL, i, l, typ16u);
			m = binop(ADD, m, n, typ32u);

			n = binop(MUL, k, j, typ16u);
			m = binop(ADD, m, n, typ32u);
			m = binop(SHR, m, immed64u((p64_t) 16ULL), typ32u);

			n = binop(MULH, i, l, typ16u);
			m = binop(ADD, m, n, typ32u);

			n = binop(MULH, k, j, typ16u);
			m = binop(ADD, m, n, typ32u);

			n = binop(MUL, i, j, typ32u);
			m = binop(ADD, m, n, typ32u);

			return(m);
#else
			i = binop(AND,
				arg0,
				immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				typnull);
			j = binop(AND,
				arg1,
				immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				typnull);
			k = binop(MUL, i, j, typ32u);
			k = binop(SHR, k, immed64u((p64_t) 16ULL), typ32u);

			l = binop(SHR, arg0, immed64u((p64_t) 16ULL), typ32u);
			m = binop(MUL, j, l, typ32u);
			k = binop(ADD, k, m, typ32u);

			m = binop(SHR, arg1, immed64u((p64_t) 16ULL), typ32u);
			n = binop(MUL, i, m, typ32u);
			k = binop(ADD, k, n, typ32u);
			k = binop(SHR, k, immed64u((p64_t) 16ULL), typ32u);

			l = binop(MUL, l, m, typ32u);
			i = binop(ADD, k, l, typ32u);

			return(i);
#endif
		} else if (optcpu & CPU_AltiVec) {
			i = binop(MULEVEN, arg0, arg1, typ16u);

			j = binop(SHR, arg0, immed64u((p64_t) 16ULL), typ32u);
			k = binop(SHR, arg1, immed64u((p64_t) 16ULL), typ32u);

			j = binop(MULEVEN, arg1, j, typ16u);
			k = binop(MULEVEN, arg0, k, typ16u);

			i = binop(SHR, i, immed64u((p64_t) 16ULL), typ32u);

			l = binop(ADD, i, j, typ32u);
			j = binop(ADDH, i, j, typ32u);

			i = binop(ADD, l, k, typ32u);
			k = binop(ADDH, l, k, typ32u);

			i = binop(SHR, i, immed64u((p64_t) 16ULL), typ32u);
			i = binop(ADD, i, j, typ32u);
			i = binop(ADD, i, k, typ32u);

			return(i);
		} else {
			/* Emulate using two 64-bit unsigned MULs */
			i = binop(AND,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			j = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			k = binop(MUL, i, j, typ64u);
			k = binop(AND,
				k,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);
			k = binop(SHR, k, immed64u((p64_t) 32ULL), typnull);

			i = binop(AND,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);
			i = binop(SHR, i, immed64u((p64_t) 32ULL), typnull);
			j = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);
			j = binop(SHR, j, immed64u((p64_t) 32ULL), typnull);
			l = binop(MUL, i, j, typ64u);
			l = binop(AND,
				l,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);

			return (binop(OR, k, l, typnull));
		}

	case DIV: /* 32u */
		if ((optcpu & CPU_MMX) || (optcpu == GenericIA32) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		#ifdef NOTDEFD
		  } else if (optcpu & CPU_AltiVec) {
			/* use 64-bit divides */
			i = binop(AND,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			j = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			i = binop(DIV, i, j, typ64u);
			j = binop(SHR, arg0, immed64u((p64_t) 32ULL), typ64u);
			k = binop(SHR, arg1, immed64u((p64_t) 32ULL), typ64u);
			j = binop(DIV, j, k, typ64u);
			return(binop(INTRLVEVEN, i, j, typ32u));
		#endif
		} else {
			/* use 64-bit divides */
			i = binop(AND,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			j = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			i = binop(DIV, i, j, typ64u);
			j = binop(SHR, arg0, immed64u((p64_t) 32ULL), typ64u);
			k = binop(SHR, arg1, immed64u((p64_t) 32ULL), typ64u);
			j = binop(DIV, j, k, typ64u);
			j = binop(SHL, j, immed64u((p64_t) 32ULL), typ64u);
			return(binop(OR, i, j, typnull));
		}

	case MOD: /* 32u */
		if ((optcpu & CPU_MMX) || (optcpu == GenericIA32) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		#ifdef NOTDEFD
		  } else if (optcpu & CPU_AltiVec) {
			/* use 64-bit modulus */
			i = binop(AND,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			j = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			i = binop(MOD, i, j, typ64u);
			j = binop(SHR, arg0, immed64u((p64_t) 32ULL), typ64u);
			k = binop(SHR, arg1, immed64u((p64_t) 32ULL), typ64u);
			j = binop(MOD, j, k, typ64u);
			return(binop(INTRLVEVEN, i, j, typ32u));
		#endif
		} else {
			/* use 64-bit modulus */
			i = binop(AND,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			j = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			i = binop(MOD, i, j, typ64u);
			j = binop(SHR, arg0, immed64u((p64_t) 32ULL), typ64u);
			k = binop(SHR, arg1, immed64u((p64_t) 32ULL), typ64u);
			j = binop(MOD, j, k, typ64u);
			j = binop(SHL, j, immed64u((p64_t) 32ULL), typ64u);
			return(binop(OR, i, j, typnull));
		}

	case MIN: /* 32u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else if (optcpu & CPU_MMX) {
			/* use GT */
			i = binop(GT, arg0, arg1, typ32u);
			j = binop(AND, i, arg1, typnull);
			i = binop(ANDN, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* use GT */
			i = binop(GT, arg0, arg1, typ32u);
			j = binop(AND, i, arg1, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		}

	case MAX: /* 32u */
		/* use GT */
		if (optcpu & CPU_AltiVec) {
			break;
		} else if (optcpu & CPU_MMX) {
			i = binop(GT, arg0, arg1, typ32u);
			j = binop(AND, i, arg0, typnull);
			i = binop(ANDN, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			i = binop(GT, arg0, arg1, typ32u);
			j = binop(AND, i, arg0, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		}

	case AVG: /* 32u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			/* Average rounds up */
			i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typ32u);
			j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typ32u);
			i = binop(ADD, i, j, typ32u);
			j = binop(OR, arg0, arg1, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t)0x00000001)),
				  typnull);
			i = binop(ADD, i, j, typ32u);
			return(i);
		}

	case AND: /* 32u */
		if (optcpu == GenericIA32) {
			break;
		} else {
			return(binop(AND, arg0, arg1, typnull));
		}

	case ANDN: /* 32u */
		if (optcpu & CPU_MMX) {
			return(binop(ANDN, arg0, arg1, typnull));
		} else {
		    return(binop(AND, unop(NOT,arg0,typnull), arg1, typnull));
		}
		break;

	case OR: /* 32u */
	case XOR: /* 32u */
		if (optcpu == GenericIA32) {
			break;
		} else {
			return(binop(op, arg0, arg1, typnull));
		}

	case EQ_C: /* 32u */
		break;

	case EQ: /* 32u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else if (optcpu == GenericIA32) {
			i = binop(EQ_C, arg0, arg1, typ32u);
			return(unop(NEG, i, typ32));
		} else {
			/* use 64-bit EQ */
			i = binop(AND,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			j = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			i = binop(EQ, i, j, typ64u);
			i = binop(AND,
				i,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			j = binop(AND,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);
			k = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);
			j = binop(EQ, j, k, typ64u);
			j = binop(AND,
				j,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);
			return(binop(OR, i, j, typnull));
		}

	case NE: /* 32u */
		return(unop(NOT, binop(EQ, arg0, arg1, typ32u), typnull));

	case LT: /* 32u */
		return(binop(GT, arg1, arg0, typ32u));

	case LE: /* 32u */
		return(binop(GE, arg1, arg0, typ32u));

	case GT_C: /* 32u */
		break;

	case GT: /* 32u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else if (optcpu & CPU_MMX) {
			/* Add offset and do signed GT */
			i = binop(ADD,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t)0x80000000)),
				  typ32u);
			j = binop(ADD,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t)0x80000000)),
				  typ32u);
			return(binop(GT, i, j, typ32));
		} else if (optcpu == GenericIA32) {
			i = binop(GT_C, arg0, arg1, typ32u);
			return(unop(NEG, i, typ32));
		} else {
			/* use 64-bit GT */

			/* Compare even fields */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x64uto2x64u(
					(p64_t)0x00000000ffffffffULL)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x64uto2x64u(
					(p64_t)0x00000000ffffffffULL)),
				  typnull);
			i = binop(GT, i, j, typ64u);
			i = binop(AND,
				  i,
				  immedu(cvt1x64uto2x64u(
					(p64_t)0x00000000ffffffffULL)),
				  typnull);

			/* Compare odd fields */
			j = binop(AND,
				  arg0,
				  immedu(cvt1x64uto2x64u(
					(p64_t)0xffffffff00000000ULL)),
				  typnull);
			k = binop(AND,
				  arg1,
				  immedu(cvt1x64uto2x64u(
					(p64_t)0xffffffff00000000ULL)),
				  typnull);
			j = binop(GT, j, k, typ64u);
			j = binop(AND,
				  j,
				  immedu(cvt1x64uto2x64u(
					(p64_t)0xffffffff00000000ULL)),
				  typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		}

	case GE: /* 32u */
		/* the obvious hack is that x GE y is (x EQ y) OR (x GT y) */
		i = binop(GT, arg0, arg1, typ32u);
		j = binop(EQ, arg0, arg1, typ32u);
		return(binop(OR, i, j, typnull));

	case LAND: /* 32u */
		/* use 32-bit NE 0 to normalize fields before AND */
		i = binop(NE, arg0, immed64u((p64_t) 0x0ULL), typ32u);
		j = binop(NE, arg1, immed64u((p64_t) 0x0ULL), typ32u);
		return(binop(AND, i, j, typnull));

	case LOR: /* 32u */
		/* use 32-bit NE 0 to normalize fields after ORing */
		i = binop(OR, arg0, arg1, typnull);
		return(binop(NE, i, immed64u((p64_t) 0ULL), typ32u));

	case SHL: /* 32u */
		if ((optcpu & CPU_MMX) || (optcpu == GenericIA32) ||
		    (optcpu & CPU_AltiVec))
			break;

		arg1 = shiftconst(arg1, typ8u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			i = binop(SHL,
				  arg0,
				  immed64u((p64_t)tup[arg1].immed.uq[0]),
				  typnull);

			if (tup[arg1].immed.q[0] > 31ULL)
			    return (immed64u((p64_t) 0ULL));
			else
			    return (binop(AND,
					i,
					immedu(cvt1x32uto4x32u((p32_t)
					 (0xffffffff<<tup[arg1].immed.ud[0]))),
					typnull));
		}
		error("shift left of unsigned 32-bit field values only "
		      "implemented for a constant shift");
		return(immed64u((p64_t) 0ULL));

	case SHR: /* 32u */
		if ((optcpu & CPU_MMX) || (optcpu == GenericIA32) ||
		    (optcpu & CPU_AltiVec))
			break;

		arg1 = shiftconst(arg1, typ8u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			i = binop(SHL,
				  arg0,
				  immed64u((p64_t)tup[arg1].immed.uq[0]),
				  typnull);

			if (tup[arg1].immed.q[0] > 31ULL)
			    return (immed64u((p64_t) 0ULL));
			else
			    return (binop(AND,
					i,
					immedu(cvt1x32uto4x32u((p32_t)
					 (0xffffffff>>tup[arg1].immed.ud[0]))),
					typnull));
		}
		error("shift right of unsigned 32-bit field values only "
		      "implemented for a constant shift");
		return(immed64u((p64_t) 0ULL));

	case PACK: /* 32u */
		/* 64u -> 32u */
		/* PACK (without saturation) each 64-bit field value to
		   32 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		/* Fail for 32-bit target */
		if (bitsperfrag() < 64) {
			char buf[64];
			snprintf(buf,
				 64,
				 "PACK32u failed for %d-bit target",
				 bitsperfrag());
			info(0, buf);
		}

		if (optcpu & CPU_MAX) {
			return(binop(INTRLVEVEN, arg1, arg0, typ32u));
		}

		if (bitsperfrag() > 64) {
			i = binop(AND,
				  arg0,
				  immedu(cvt1x64uto2x64u((p64_t)0xffffffffULL)),
				  typnull);
			j = binop(SHR, i, immed64u((p64_t) 32ULL), typnull);
			i = binop(OR, i, j, typnull);

			k = binop(AND,
				  arg1,
				  immedu(cvt1x64uto2x64u((p64_t)0xffffffffULL)),
				  typnull);
			l = binop(SHR, k, immed64u((p64_t) 32ULL), typnull);
			k = binop(OR, k, l, typnull);

			/* Combine packed arg0 and arg1 in result */
			return(binop(PACK, i, k, typ64u));
		} else {
			/* Stop here if 64-bit target */

			/* Keep low 32 bits of arg0 in packed form in
			   low half (64-bit target) */
			i = binop(AND,
				  arg0,
				  immed32u((p32_t)0xffffffff),
				  typnull);

			/* Move low 32 bits of arg1 in packed form to
			   high half */
			k = binop(SHL, arg1, immed64u((p64_t) 32ULL), typnull);

			/* Combine packed arg0 and arg1 in result */
			return(binop(OR, i, k, typnull));
		}

	case INTRLVLOW: /* 32u */
	    /* 16-bit to 32-bit interleave of 16-bit fields */
	    if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
		break;		/* vmrglh for AltiVec */
	    } else if (optcpu & CPU_MAX) {
		i = binop(PERM, arg0, immed64u((p64_t) 2031ULL), typ16u);
		j = binop(PERM, arg1, immed64u((p64_t) 2031ULL), typ16u);
		return(binop(INTRLVODD, j, i, typ16u));
	    } else {
			int bpf = bitsperfrag();
			p128_t tmp;

			/* high half */
			j = arg1;
			switch (bitsperfrag()) {
			case 128:
				i = binop(AND,
					j,
					immed64u((p64_t) 0xffffffffffffffffULL),
					typnull);
				tmp.uq[1] = 0xffffffffffffffffULL;
				tmp.uq[0] = 0x0000000000000000ULL;
				j = binop(AND,
					j,
					immed128u(tmp),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 64ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 64:
				i = binop(AND,
					j,
					immed64u((p64_t) 0x00000000ffffffffULL),
					typnull);
				j = binop(AND,
					j,
					immed64u((p64_t) 0xffffffff00000000ULL),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 32ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 32:
				i = binop(AND,
					  j,
					  (bpf==32)?
						immed64u((p64_t)
							0x000000000000ffffULL):
					  immedu(cvt1x64uto2x64u(
						(p64_t)0x000000000000ffffULL)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==32)?
					      immed64u((p64_t)
							0x00000000ffff0000ULL):
					      immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffff0000ULL)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 16ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
				break;
			default:
				bug("INTRLVLOW 32u reports bad bitsperfrag()");
				i = arg0;
			}
			k = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);


			/* low half */
			j = arg0;
			switch (bitsperfrag()) {
			case 128:
				i = binop(AND,
					j,
					immed64u((p64_t) 0xffffffffffffffffULL),
					typnull);
				tmp.uq[1] = 0xffffffffffffffffULL;
				tmp.uq[0] = 0x0000000000000000ULL;
				j = binop(AND,
					j,
					immed128u(tmp),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 64ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 64:
				i = binop(AND,
					j,
					immed64u((p64_t) 0x00000000ffffffffULL),
					typnull);
				j = binop(AND,
					j,
					immed64u((p64_t) 0xffffffff00000000ULL),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 32ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 32:
				i = binop(AND,
					  j,
					  (bpf==64)?
						immed64u((p64_t)
							0x000000000000ffffULL):
					  immedu(cvt1x64uto2x64u(
						(p64_t)0x000000000000ffffULL)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==64)?
					      immed64u((p64_t)
							0x00000000ffff0000ULL):
					      immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffff0000ULL)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 16ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
				break;
			default:
				bug("INTRLVLOW 32u reports bad bitsperfrag()");
				i = arg0;
			}

			return(binop(OR, i, k, typnull));
	    }

	case INTRLVHIGH: /* 32u */
	    /* 16-bit to 32-bit interleave of 16-bit fields */
	    if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
		break;		/* vmrghh for AltiVec */
	    } else if (optcpu & CPU_MAX) {
		i = binop(PERM, arg0, immed64u((p64_t) 2031ULL), typ16u);
		j = binop(PERM, arg1, immed64u((p64_t) 2031ULL), typ16u);
		return(binop(INTRLVEVEN, j, i, typ16u));
	    } else {
		/* sneaky way to reuse INTRLVLOW code... */
		unsigned long long bpf_2 =
			(unsigned long long) bitsperfrag()/2ULL;
		i = binop(SHR, arg0, immed64u((p64_t) bpf_2), typnull);
		j = binop(SHR, arg1, immed64u((p64_t) bpf_2), typnull);
		return(binop(INTRLVLOW, i, j, typ32u));
	    }

	case INTRLVODD: /* 32u */
		/* Interleave 32-bit fields */
		if (optcpu & CPU_MAX) {
			break;
		} else if (optcpu & CPU_AltiVec) {
#ifdef TRINOPS
			p128_t tmp;
			tmp.uq[1] = 0x1011121300010203ULL;
			tmp.uq[0] = 0x18191A1B08090A0BULL;
			return(trinop(TPERM,
				      arg0,
				      arg1,
				      immed128u(tmp),
				      typ32u));
#else
			/* For now, implement in target header file */
			break;
#endif
		}
		bug("INTRLVODD not available on this target");
		break;

	case INTRLVEVEN: /* 32u */
		/* Interleave 32-bit fields */
		if (optcpu & CPU_MAX) {
			break;
		} else if (optcpu & CPU_AltiVec) {
#ifdef TRINOPS
			p128_t tmp;
			tmp.uq[1] = 0x1415161704050607ULL;
			tmp.uq[0] = 0x1C1D1E1F0C0D0E0FULL;
			return(trinop(TPERM,
				      arg0,
				      arg1,
				      immed128u(tmp),
				      typ32u));
#else
			/* For now, implement in target header file */
			break;
#endif
		}
		bug("INTRLVEVEN not available on this target");
		break;

	case PERM: /* 32u */
		/* Arbitrary permutation of 32-bit fields */
		if (optcpu & CPU_MAX) {
			if (tup[arg1].op == NUM) {
				switch(tup[arg1].immed.q[0]) {
				case 00: i = immed64u((p64_t) 101ULL); break;
				case 01: i = immed64u((p64_t) 123ULL); break;
				case 10: i = immed64u((p64_t) 2310ULL); break;
				case 11: i = immed64u((p64_t) 2323ULL); break;
				default:
				    {
					char buf[64];
					snprintf(buf,
					    64,
					    "Illegal permutation index %llu",
					    tup[arg1].immed.uq[0]);
					warn(buf);
					i = immed64u((p64_t) 123ULL); break;
				    }
				}
				return(binop(PERM, arg0, i, typ16u));
			}
			bug("PERM index restricted to constants");
			break;
		}

		bug("PERM not available on this target");
		break;

	case REPL: /* 32u */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			bug("REPL32u not available on this target");
			break;
		}

	default: /* 32u */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop32u op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

int
binop32us(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 32-bit unsigned field binary ops with saturation */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i, j, k, l;

	switch (op) {
	case MIN:
	case MAX:
	case AVG:

	case EQ:
	case NE:

	case AND:
	case ANDN:
	case OR:
	case XOR:

	case LAND:
	case LOR:

	case GT:
	case GE:
	case LT:
	case LE:
		/* These are the same as unsigned unsaturated */
		return(binop(op, arg0, arg1, typ32u));

	case ADD: /* 32us */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			/* Do 32u add of LSbs */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x32uto4x32u((p32_t) 0x7fffffff)),
				  typnull);
			j = binop(AND,
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x7fffffff)),
				  typnull);
			k = binop(ADD, i, j, typ32u);
				/* sum(LSbs). Cin = MSb(k) */

			/* Find propagate and generate for MSb */
			i = binop(XOR, arg0, arg1, typnull);	/* prop */
			j = binop(AND, arg0, arg1, typnull);	/* gen */

			/* If no OF, then sum = prop(MSb) + sum(LSbs) */
			l = binop(AND,
				  i,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);
			l = binop(ADD, k, l, typ32u);		/* SUM */

			/* overflow = (prop & cin) | gen */
			i = binop(AND, i, k, typnull);
			i = binop(OR, i, j, typnull);
			i = binop(AND,			/* MSb(i) = OF */
				  arg1,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);

			/* ...and create a saturation mask */
			j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
			i = binop(OR, j, i, typnull);
			j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
			i = binop(OR, j, i, typnull);
			j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
			i = binop(OR, j, i, typnull);
			j = binop(SHR, i, immed64u((p64_t) 8ULL), typnull);
			i = binop(OR, j, i, typnull);
			j = binop(SHR, i, immed64u((p64_t) 16ULL), typnull);
			i = binop(OR, j, i, typnull);

			/* Clobber the calculated value with the max on
			   overflow */
			return(binop(OR, l, i, typnull));
		}

	case SUB: /* 32us */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			/* Do unsaturated 32u sub */
			i = binop(SUB, arg0, arg1, typ32u);

			/* NegSat mask is (arg0 < arg1) -> want NOT of this */
			j = binop(GE, arg0, arg1, typ32u);

			/* Clobber the calculated value with zero on overflow */
			return(binop(AND, j, i, typnull));
		}

	case MUL: /* 32us */
		if (optcpu & CPU_MAX) {
			/* Use 64-bit MULs...
			   unsigned interleave, mul, pack with saturation
			   sequence
			*/
			i = binop(INTRLVLOW,
				  arg0,
				  immed64u((p64_t) 0ULL),
				  typ64u);
			j = binop(INTRLVLOW,
				  arg1,
				  immed64u((p64_t) 0ULL),
				  typ64u);
			k = binop(MUL, i, j, typ64u);

			i = binop(INTRLVHIGH,
				  arg0,
				  immed64u((p64_t) 0ULL),
				  typ64u);
			j = binop(INTRLVHIGH,
				  arg1,
				  immed64u((p64_t) 0ULL),
				  typ64u);
			i = binop(MUL, i, j, typ64u);

			return(binop(PACK, k, i, typ32us));

		} else if (optcpu & CPU_AltiVec) {
			/* Do unsigned mul */
			i = binop(MUL, arg0, arg1, typ32u);

			/* Do high mul, and convert to saturation mask */
			j = binop(MULH, arg0, arg1, typ32u);
			k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
			j = binop(OR, j, k, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);
			j = binop(SHR, j, immed64u((p64_t) 31ULL), typ32);

			i = binop(OR, i, j, typnull);
			return(i);
		} else {
			/* Do unsigned mul */
			i = binop(MUL, arg0, arg1, typ32u);

			/* Do high mul, and convert to saturation mask */
			j = binop(MULH, arg0, arg1, typ32u);
			k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
			j = binop(OR, j, k, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);
#ifdef NOTDEFD
			/* Do this if a SHR32s is not available */
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHR, j, immed64u((p64_t) 4ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHR, j, immed64u((p64_t) 8ULL), typnull);
			j = binop(OR, j, k, typnull);
			k = binop(SHR, j, immed64u((p64_t) 16ULL), typnull);
			j = binop(OR, j, k, typnull);
#else
			/* MMX has SHR32s so use it */
			j = binop(SHR, j, immed64u((p64_t) 31ULL), typ32);
#endif
			i = binop(OR, i, j, typnull);
			return(i);
		}

	case DIV: /* 32us */
		/* This method will bomb on div by 0 instead of saturating,
		   which is, of course, the point of doing a sat DIV. */

		/* Do divide as usual */
		i = binop(DIV, arg0, arg1, typ32u);

		/* Generate a saturation mask */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(AND, j, k, typnull);

		j = binop(AND,
			  j,
			  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
			  typnull);

		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(OR, k, j, typnull);

		/* Clobber the calculated value with MAX if arg1==0 */
		return(binop(OR, i, j, typnull));

	case MOD: /* 32us */
		/* This method will bomb on mod by 0 instead of saturating,
		   which is, of course, the point of doing a sat MOD. */

		/* Do modulus as usual */
		i = binop(MOD, arg0, arg1, typ32u);

		/* Generate a saturation mask */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(AND, j, k, typnull);

		j = binop(AND,
			  j,
			  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
			  typnull);

		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(OR, k, j, typnull);

		/* Clobber the calculated value with MAX if arg1==0 */
		return(binop(OR, i, j, typnull));

	case SHL: /* 32us */
	case SHR: /* 32us */
		/* done in the obvious way */
		break;

	case PACK: /* 32us */
		/* 64us -> 32us */
		/* PACK (with unsigned saturation) each 64-bit field value to
		   32 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		/* Fail for 32-bit target */
		if (bitsperfrag() < 64) {
			char buf[64];
			snprintf(buf,
				64,
				"PACK32us failed for %d-bit target",
				bitsperfrag());
			info(0, buf);
		}


		/* Saturate arg0 */
		i = binop(SHR, arg0, immed64u((p64_t) 16ULL), typnull);
		i = binop(OR, arg0, i, typnull);
		j = binop(SHR, i, immed64u((p64_t) 8ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, i, j, typnull);
		j = binop(AND,
			  j,
			  immedu(cvt1x64uto2x64u((p64_t)0x0000000100000000ULL)),
			  typnull);
		j = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		i = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 8ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 16ULL), typnull);
		i = binop(OR, i, j, typnull);
		i = binop(OR, arg0, i, typnull);

		/* Saturate arg1 */
		k = binop(SHR, arg1, immed64u((p64_t) 16ULL), typnull);
		k = binop(OR, arg1, k, typnull);
		l = binop(SHR, k, immed64u((p64_t) 8ULL), typnull);
		k = binop(OR, k, l, typnull);
		l = binop(SHR, k, immed64u((p64_t) 4ULL), typnull);
		k = binop(OR, k, l, typnull);
		l = binop(SHR, k, immed64u((p64_t) 2ULL), typnull);
		k = binop(OR, k, l, typnull);
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
		l = binop(OR, k, l, typnull);
		l = binop(AND,
			  l,
			  immedu(cvt1x64uto2x64u((p64_t)0x0000000100000000ULL)),
			  typnull);
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
		k = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
		k = binop(OR, k, l, typnull);
		l = binop(SHR, k, immed64u((p64_t) 2ULL), typnull);
		k = binop(OR, k, l, typnull);
		l = binop(SHR, k, immed64u((p64_t) 4ULL), typnull);
		k = binop(OR, k, l, typnull);
		l = binop(SHR, k, immed64u((p64_t) 8ULL), typnull);
		k = binop(OR, k, l, typnull);
		l = binop(SHR, k, immed64u((p64_t) 16ULL), typnull);
		k = binop(OR, k, l, typnull);
		k = binop(OR, arg1, k, typnull);

		/* Combine packed arg0 and arg1 in result */
		return(binop(PACK, i, k, typ32u));

	case INTRLVLOW: /* 32us */
	case INTRLVHIGH: /* 32us */
		/* 16-bit to 32-bit interleave of 16-bit fields */
		/* These are the same as unsigned */
		return(binop(op, arg0, arg1, typ32u));

	default: /* 32us */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop32us op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

int
binop32(int op,
int arg0,
int arg1)
{
	/* 32-bit signed field binary ops */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i, j, k, l, m;

	switch (op) {
	case DIV: /* 32s */
	case MOD:
		if (optcpu == GenericIA32) {
			break;
		} else {
			return(binop(op, arg0, arg1, typ32u));
		}

	case ADD:
	case SUB:
	case MUL:

	case EQ:
	case NE:

	case AND:
	case ANDN:
	case OR:
	case XOR:

	case LAND:
	case LOR:

	case SHL: /* 32s */

	case PACK:

	case INTRLVLOW:
	case INTRLVHIGH: /* 16-bit to 32-bit interleave of 16-bit fields */
	case INTRLVEVEN:
	case INTRLVODD:
	case PERM:
		/* These are the same as unsigned */
		return(binop(op, arg0, arg1, typ32u));

	case MULH: /* 32s */
		if (optcpu & CPU_MMX) {
			i = binop(SHR, arg0, immed64u((p64_t) 16ULL), typ32);
			j = binop(SHR, arg1, immed64u((p64_t) 16ULL), typ32);
			k = binop(MUL, i, j, typ32);

			l = binop(AND,
				arg0,
				immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				typnull);
			l = binop(MULH, l, j, typ16);

			m = binop(AND,
				arg1,
				immedu(cvt1x32uto4x32u((p32_t) 0x0000ffff)),
				typnull);
			m = binop(MULH, m, i, typ16);

			k = binop(ADD, k, l, typ32);

			return (binop(ADD, k, m, typ32));
		} else if (optcpu & CPU_AltiVec) {
			i = binop(MULEVEN, arg0, arg1, typ16u);
			i = binop(SHR, i, immed64u((p64_t)16ULL), typ32u);

			j = binop(SHR, arg0, immed64u((p64_t)16ULL), typ32u);
			j = binop(MULEVEN, j, arg1, typ16u);

			k = binop(SHR, arg1, immed64u((p64_t)16ULL), typ32u);
			k = binop(MULEVEN, k, arg0, typ16u);

			l = binop(ADD, i, j, typ32u);
			j = binop(ADDH, i, j, typ32u);
			l = binop(ADD, l, k, typ32u);
			k = binop(ADDH, l, k, typ32u);

			l = binop(SHR, l, immed64u((p64_t)16ULL), typ32u);
			l = binop(ADD, l, j, typ32u);
			l = binop(ADD, l, k, typ32u);

			i = binop(MULODD, arg0, arg1, typ16u);
			return(binop(ADD, i, l, typ32));
		} else {
			/* Emulate using two 64-bit signed MULs */
			i = binop(AND,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			j = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			k = binop(MUL, i, j, typ64);
			k = binop(AND,
				k,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);
			k = binop(SHR, k, immed64u((p64_t) 32ULL), typnull);

			i = binop(AND,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);
			i = binop(SHR, i, immed64u((p64_t) 32ULL), typnull);
			j = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);
			j = binop(SHR, j, immed64u((p64_t) 32ULL), typnull);
			l = binop(MUL, i, j, typ64);
			l = binop(AND,
				l,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);

			return (binop(OR, k, l, typnull));
		}

	case MIN: /* 32s */
		if (optcpu & CPU_AltiVec) {
			break;
		} else if (optcpu & CPU_MMX) {
			/* use GT */
			i = binop(GT, arg0, arg1, typ32);
			j = binop(AND, i, arg1, typnull);
			i = binop(ANDN, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* use GT */
			i = binop(GT, arg0, arg1, typ32);
			j = binop(AND, i, arg1, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		}

	case MAX: /* 32s */
		if (optcpu & CPU_AltiVec) {
			break;
		} else if (optcpu & CPU_MMX) {
			/* use GT */
			i = binop(GT, arg0, arg1, typ32);
			j = binop(AND, i, arg0, typnull);
			i = binop(ANDN, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* use GT */
			i = binop(GT, arg0, arg1, typ32);
			j = binop(AND, i, arg0, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		}

	case AVG: /* 32s */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			/* Average rounds up */
			i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typ32);
			j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typ32);
			i = binop(ADD, i, j, typ32);

			/* Calculate and add rounding bit */
			j = binop(OR, arg0, arg1, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t)0x00000001)),
				  typnull);
			i = binop(ADD, i, j, typ32);
			return(i);
		}

	case LT: /* 32s */
		return(binop(GT, arg1, arg0, typ32));

	case LE: /* 32s */
		return(binop(GE, arg1, arg0, typ32));

	case GT_C: /* 32s */
		break;

	case GT: /* 32s */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			break;
		} else if (optcpu == GenericIA32) {
			i = binop(GT_C, arg0, arg1, typ32);
			return(unop(NEG, i, typ32));
		} else {
			/* use 64-bit GT */
			/* Compare even fields (sign-extended to 64 bits) */
			i = binop(AND,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			i = binop(SHL, i, immed64u((p64_t) 32ULL), typnull);
			i = binop(SHR, i, immed64u((p64_t) 32ULL), typ64);
			j = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);
			j = binop(SHL, j, immed64u((p64_t) 32ULL), typnull);
			j = binop(SHR, j, immed64u((p64_t) 32ULL), typ64);
			i = binop(GT, i, j, typ64);
			i = binop(AND,
				i,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x00000000ffffffffULL)),
				typnull);

			/* Compare odd fields */
			j = binop(AND,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);
			k = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);
			j = binop(GT, j, k, typ64);
			j = binop(AND,
				j,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0xffffffff00000000ULL)),
				typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		}

	case GE: /* 32s */
		/* the obvious hack is that x GE y is (x EQ y) OR (x GT y) */
		i = binop(GT, arg0, arg1, typ32);
		j = binop(EQ, arg0, arg1, typ32);
		return(binop(OR, i, j, typnull));

	case SHR: /* 32s */
		break;

	default: /* 32s */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop32 op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

int
binop32ss(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 32-bit signed field binary ops with saturation */
	/* Convert the op into a tree of subops if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/
	/* Parameters:
	     op          operation token selecting one of the cases below
	     arg0, arg1  operands, passed straight through to binop()/unop()
	     decl_bits   not referenced anywhere in this function

	   Returns the value of the last binop() of the chosen expansion.
	   Cases that leave the switch with "break" (ADD and SUB on AltiVec,
	   SHR) and the default case (after bug()) return -1.

	   i..n are scratch values that are reused heavily; the statement
	   order inside each case therefore matters.
	*/

	register int i, j, k, l, m, n;

	switch (op) {
	case EQ: /* 32ss */
	case NE:

	case AND:
	case ANDN:
	case OR:
	case XOR:

	case LAND:
	case LOR:
		/* These are the same as unsigned unsaturated */
		return(binop(op, arg0, arg1, typ32u));

	case AVG:
	case MIN:
	case MAX:

	case GT:
	case LT:
	case LE:
	case GE: /* 32ss */
		/* These are the same as signed unsaturated */
		return(binop(op, arg0, arg1, typ32));

	case ADD: /* 32ss */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			/* Do the signed add */
			i = binop(ADD, arg0, arg1, typ32);

			/* Correct for positive saturation */
			j = binop(OR, arg0, arg1, typnull);
			j = unop(NOT, j, typnull);	/* tX..X if both pos */
			j = binop(AND, j, i, typnull);	/* MSb(sum) = 1 */
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);
			/* j now holds only the MSb of each 32-bit field */

			/* Smear that MSb over the field: the shift counts
			   1+2+4+8+16 add up to 31 */
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 16ULL), typnull);
			l = binop(OR, k, l, typnull);		/* T...T */

			/* Clobber the calculated value with PosSat in the
			   fields flagged by j: OR in the smeared mask, then
			   clear the MSb again.  (This comment used to read
			   "if arg1==0", which does not describe this code.) */
			i = binop(OR, i, l, typnull);		/* T...T */
			j = unop(NOT, j, typnull);		/* f1..1 */
			i = binop(AND, i, j, typnull);


			/* Correct for negative saturation */
			j = binop(AND, arg0, arg1, typnull);
							/* tX..X if both neg */
			k = unop(NOT, i, typnull);
			j = binop(AND, j, k, typnull);	/* MSb(sum) = 0 */
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);

			/* Same MSb smear as above */
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 16ULL), typnull);
			l = binop(OR, k, l, typnull);		/* T...T */

			/* Clobber the calculated value with NegSat in the
			   fields flagged by j: clear the whole field, then
			   OR the MSb back in */
			l = unop(NOT, l, typnull);		/* F...F */
			i = binop(AND, i, l, typnull);
			i = binop(OR, i, j, typnull);

			return(i);
		}
		
	case SUB: /* 32ss */
		if (optcpu & CPU_AltiVec) {
			break;
		} else {
			/* Do the signed sub */
			i = binop(SUB, arg0, arg1, typ32);

			/* Correct for positive saturation */
			m = binop(XOR, arg0, arg1, typnull);
						/* tX..X if mixed */
			j = binop(AND, m, arg1, typnull);
						/* tX..X if arg0+ & arg1- */
			j = binop(AND, j, i, typnull);	/* MSb(diff) = 1 */
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);

			/* Smear the MSb flag over the field (1+2+4+8+16 = 31
			   bits of shift) */
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 16ULL), typnull);
			l = binop(OR, k, l, typnull);		/* T...T */

			/* Clobber the calculated value with PosSat in the
			   fields flagged by j.  (This comment used to read
			   "if arg1==0", which does not describe this code.) */
			i = binop(OR, i, l, typnull);		/* T...T */
			j = unop(NOT, j, typnull);		/* f1..1 */
			i = binop(AND, i, j, typnull);


			/* Correct for negative saturation */
			j = binop(AND, m, arg0, typnull);
						/* tX..X if arg0- & arg1+ */
			k = unop(NOT, i, typnull);
			j = binop(AND, j, k, typnull);	/* MSb(diff) = 0 */
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);

			/* Same MSb smear as above */
			k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
			l = binop(OR, k, j, typnull);
			k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, k, l, typnull);
			k = binop(SHR, l, immed64u((p64_t) 16ULL), typnull);
			l = binop(OR, k, l, typnull);		/* T...T */

			/* Clobber the calculated value with NegSat in the
			   fields flagged by j */
			l = unop(NOT, l, typnull);		/* F...F */
			i = binop(AND, i, l, typnull);
			i = binop(OR, i, j, typnull);

			return(i);
		}

	case MUL: /* 32ss */
		/* We want this (i is the low word from MUL, j the high word
		   from MULH):
			if (MSb of j)=1: -- Should be negative
			    if ((j != 0xf..f) || (MSb of i)=0) return 0x8...0;
			    else return low_word;
			else -- Should be positive
			    if ((j != 0x0..0) || (MSb of i)=1) return 0x7f..f;
			    else return low_word;
		*/

		if (optcpu & CPU_MAX) {
			/* Multiply and saturate even fields using 64 bit ops */
			/* Isolate the low field, then SHL/SHR by 32 (the SHR
			   being a signed typ64 op) to sign-extend it */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffffffffULL)),
				  typnull);
			i = binop(SHL, i, immed64u((p64_t) 32ULL), typ64u);
			i = binop(SHR, i, immed64u((p64_t) 32ULL), typ64);

			j = binop(AND,
				  arg1,
				  immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffffffffULL)),
				  typnull);
			j = binop(SHL, j, immed64u((p64_t) 32ULL), typ64u);
			j = binop(SHR, j, immed64u((p64_t) 32ULL), typ64);

			/* 64-bit product clamped to the 32-bit signed range
			   0x80000000..0x7fffffff by MIN then MAX */
			i = binop(MUL, i, j, typ64);
			i = binop(MIN,
				  i,
				  immedu(cvt1x64uto2x64u(
						(p64_t)0x000000007fffffffULL)),
				  typ64);
			i = binop(MAX,
				  i,
				  immedu(cvt1x64uto2x64u(
						(p64_t)0xffffffff80000000ULL)),
				  typ64);
			/* k keeps the even-field result for the final OR */
			k = binop(AND,
				  i,
				  immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffffffffULL)),
				  typnull);


			/* Multiply and saturate odd fields using 64 bit ops */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x64uto2x64u(
						(p64_t)0xffffffff00000000ULL)),
				  typnull);
			i = binop(SHR, i, immed64u((p64_t) 32ULL), typ64);

			j = binop(AND,
				  arg1,
				  immedu(cvt1x64uto2x64u(
						(p64_t)0xffffffff00000000ULL)),
				  typnull);
			j = binop(SHR, j, immed64u((p64_t) 32ULL), typ64);

			i = binop(MUL, i, j, typ64);
			i = binop(MIN,
				  i,
				  immedu(cvt1x64uto2x64u(
						(p64_t)0x000000007fffffffULL)),
				  typ64);
			i = binop(MAX,
				  i,
				  immedu(cvt1x64uto2x64u(
						(p64_t)0xffffffff80000000ULL)),
				  typ64);
			/* Move the clamped result back up into the odd field */
			i = binop(SHL, i, immed64u((p64_t) 32ULL), typ64u);

			return (binop(OR, i, k, typnull));

		} else if (optcpu & CPU_AltiVec) {
			/* This branch issues the same statements as the
			   non-MMX paths of the final else branch below */

			/* Do signed mul */
			i = binop(MUL, arg0, arg1, typ32);

			/* Do high mul, and convert to saturation mask */
			j = binop(MULH, arg0, arg1, typ32);

			/* Make MSb(k)=1 if NegSat */
			/* AND-reduce j towards the MSb (shifts 16+8+4+2+1);
			   only the MSb of the result is used, via the
			   0x80000000 mask below */
			k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
			l = binop(AND, j, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 8ULL), typnull);
			l = binop(AND, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(AND, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(AND, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, l, k, typnull);

			/* k = j AND NOT(i AND reduced j) */
			k = binop(AND, i, k, typnull);
			k = unop(NOT, k, typnull);
			k = binop(AND, k, j, typnull);

			k = binop(AND,
				  k,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);

			/* Form and apply a mask */
			/* Smear the MSb of k over the field, clear the
			   flagged fields of i and OR the MSb back in; the
			   result is parked in n */
			l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
			m = binop(OR, k, l, typnull);
			l = binop(SHR, m, immed64u((p64_t) 2ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 4ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 8ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 16ULL), typnull);
			l = binop(OR, l, m, typnull);
			l = unop(NOT, l, typnull);
			n = binop(AND, l, i, typnull);
			n = binop(OR, k, n, typnull);


			/* Make MSb(k)=1 if PosSat */
			/* OR-reduce j towards the MSb (shifts 16+8+4+2+1) */
			k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
			l = binop(OR, j, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
			k = binop(OR, l, k, typnull);

			/* k = NOT(j) AND (reduced j OR i) */
			k = binop(OR, k, i, typnull);
			j = unop(NOT, j, typnull);
			k = binop(AND, j, k, typnull);

			k = binop(AND,
				  k,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);

			/* Form and apply a mask */
			l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
			m = binop(OR, k, l, typnull);
			l = binop(SHR, m, immed64u((p64_t) 2ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 4ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 8ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 16ULL), typnull);
			l = binop(OR, l, m, typnull);

			/* Set the flagged fields of n to all ones, then clear
			   their MSb */
			i = binop(OR, l, n, typnull);
			k = unop(NOT, k, typnull);
			i = binop(AND, k, i, typnull);

			return(i);
		} else {
			/* Same sequence as the AltiVec branch, except that
			   on MMX each NOT-then-AND pair is replaced by a
			   single ANDN */

			/* Do signed mul */
			i = binop(MUL, arg0, arg1, typ32);

			/* Do high mul, and convert to saturation mask */
			j = binop(MULH, arg0, arg1, typ32);

			/* Make MSb(k)=1 if NegSat */
			k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
			l = binop(AND, j, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 8ULL), typnull);
			l = binop(AND, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(AND, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(AND, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
			k = binop(AND, l, k, typnull);

			if (optcpu & CPU_MMX) {
				k = binop(AND, k, i, typnull);
				k = binop(ANDN, k, j, typnull);
			} else {
				k = binop(AND, i, k, typnull);
				k = unop(NOT, k, typnull);
				k = binop(AND, k, j, typnull);
			}

			k = binop(AND,
				  k,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);

			/* Form and apply a mask */
			l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
			m = binop(OR, k, l, typnull);
			l = binop(SHR, m, immed64u((p64_t) 2ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 4ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 8ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 16ULL), typnull);
			l = binop(OR, l, m, typnull);
			if (optcpu & CPU_MMX) {
				n = binop(ANDN, l, i, typnull);
			} else {
				l = unop(NOT, l, typnull);
				n = binop(AND, l, i, typnull);
			}
			n = binop(OR, k, n, typnull);


			/* Make MSb(k)=1 if PosSat */
			k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
			l = binop(OR, j, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 8ULL), typnull);
			l = binop(OR, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 4ULL), typnull);
			l = binop(OR, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 2ULL), typnull);
			l = binop(OR, l, k, typnull);
			k = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
			k = binop(OR, l, k, typnull);

			k = binop(OR, k, i, typnull);
			if (optcpu & CPU_MMX) {
				k = binop(ANDN, j, k, typnull);
			} else {
				j = unop(NOT, j, typnull);
				k = binop(AND, j, k, typnull);
			}

			k = binop(AND,
				  k,
				  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
				  typnull);

			/* Form and apply a mask */
			l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
			m = binop(OR, k, l, typnull);
			l = binop(SHR, m, immed64u((p64_t) 2ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 4ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 8ULL), typnull);
			m = binop(OR, l, m, typnull);
			l = binop(SHR, m, immed64u((p64_t) 16ULL), typnull);
			l = binop(OR, l, m, typnull);

			i = binop(OR, l, n, typnull);
			if (optcpu & CPU_MMX) {
				i = binop(ANDN, k, i, typnull);
			} else {
				k = unop(NOT, k, typnull);
				i = binop(AND, k, i, typnull);
			}

			return(i);
		}

	case DIV: /* 32ss */
		/* This method will bomb on div by 0 instead of saturating,
		   which is, of course, the point of doing a sat DIV. */
		/* The plain DIV is issued first; the code after it only
		   patches the fields whose divisor is zero. */

		/* Modify 32-bit divides */
		i = binop(DIV, arg0, arg1, typ32);

		/* Get reduce-NOT of arg1 */
		/* AND-reduce NOT(arg1) towards the MSb (shifts 1+2+4+8+16):
		   the MSb of j ends up set only where every bit of the arg1
		   field is 0 */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(AND, j, k, typnull);

		/* Correct for positive saturation */
		if (optcpu & CPU_MMX) {
			k = binop(ANDN, arg0, j, typnull);  /* ~a(tuple j) */
		} else {
			k = unop(NOT, arg0, typnull);
			k = binop(AND, k, j, typnull);  /* ~a(tuple j) */
		}
		k = binop(AND,
			  k,
			  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
			  typnull);
		/* k has pattern tooooooo,0,0,0 */

		/* Apply the mask to the MSb */
		l = unop(NOT, k, typnull);
		i = binop(AND, i, l, typnull);

		/* Convert k's tooooooo,0,0,0 pattern to a 0ttttttt,T,T,T
		   pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* otoooooo*/
		l = binop(OR, k, l, typnull);			/* ttoooooo*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* oottoooo*/
		l = binop(OR, l, m, typnull);			/* ttttoooo*/
		m = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
								/* ooootttt*/
		l = binop(OR, l, m, typnull);			/* T,0,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
								/* 0,T,0,0 */
		l = binop(OR, l, m, typnull);			/* T,T,0,0 */
		/* 15 rather than 16 here: together with the final shift by 1
		   this fills bits 30..0 and leaves the MSb clear */
		m = binop(SHR, l, immed64u((p64_t) 15ULL), typnull);
						/* 0,0000000t,T,ttttttt0 */
		l = binop(OR, l, m, typnull);		/* T,T,T,ttttttt0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
							/* 0ttttttt,T,T,T */

		/* Apply the mask to the LSbs */
		i = binop(OR, i, l, typnull);


		/* Correct for negative saturation */
		k = binop(AND, arg0, j, typnull);	/* a(tuple j) */
		k = binop(AND,
			  k,
			  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
			  typnull);
		/* k has pattern tooooooo,0,0,0 */

		/* Apply the mask to the MSb */
		i = binop(OR, i, k, typnull);

		/* Convert k's tooooooo,0,0,0 pattern to a 1fffffff,F,F,F
		   pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* otoooooo*/
		l = binop(OR, k, l, typnull);			/* ttoooooo*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* oottoooo*/
		l = binop(OR, l, m, typnull);			/* ttttoooo*/
		m = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
								/* ooootttt*/
		l = binop(OR, l, m, typnull);			/* T,0,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
								/* 0,T,0,0 */
		l = binop(OR, l, m, typnull);			/* T,T,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 15ULL), typnull);
						/* 0,ooooooot,T,ttttttto */
		l = binop(OR, l, m, typnull);		/* T,T,T,ttttttto */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
							/* ottttttt,T,T,T */
		l = unop(NOT, l, typnull);		/* 1fffffff,F,F,F */

		/* Apply the mask to the LSbs */
		i = binop(AND, i, l, typnull);

		return(i);

	case MOD: /* 32ss */
		/* This method will bomb on mod by 0 instead of saturating,
		   which is, of course, the point of doing a sat MOD. */
		/* Apart from the first binop() being MOD, this case repeats
		   the DIV case above statement for statement. */

		/* Modify 32-bit modulus */
		i = binop(MOD, arg0, arg1, typ32);

		/* Get reduce-NOT of arg1 */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(AND, j, k, typnull);

		/* Correct for positive saturation */
		if (optcpu & CPU_MMX) {
			k = binop(ANDN, arg0, j, typnull);  /* ~a(tuple j) */
		} else {
			k = unop(NOT, arg0, typnull);
			k = binop(AND, k, j, typnull);  /* ~a(tuple j) */
		}
		k = binop(AND,
			  k,
			  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
			  typnull);
		/* k has pattern tooooooo,0,0,0 */

		/* Apply the mask to the MSb */
		l = unop(NOT, k, typnull);
		i = binop(AND, i, l, typnull);

		/* Convert k's tooooooo,0,0,0 pattern to a 0ttttttt,T,T,T
		   pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* otoooooo*/
		l = binop(OR, k, l, typnull);			/* ttoooooo*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* oottoooo*/
		l = binop(OR, l, m, typnull);			/* ttttoooo*/
		m = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
								/* ooootttt*/
		l = binop(OR, l, m, typnull);			/* T,0,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
								/* 0,T,0,0 */
		l = binop(OR, l, m, typnull);			/* T,T,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 15ULL), typnull);
						/* 0,0000000t,T,ttttttt0 */
		l = binop(OR, l, m, typnull);		/* T,T,T,ttttttt0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
							/* 0ttttttt,T,T,T */

		/* Apply the mask to the LSbs */
		i = binop(OR, i, l, typnull);


		/* Correct for negative saturation */
		k = binop(AND, arg0, j, typnull);	/* a(tuple j) */
		k = binop(AND,
			  k,
			  immedu(cvt1x32uto4x32u((p32_t) 0x80000000)),
			  typnull);
		/* k has pattern tooooooo,0,0,0 */

		/* Apply the mask to the MSb */
		i = binop(OR, i, k, typnull);

		/* Convert k's tooooooo,0,0,0 pattern to a 1fffffff,F,F,F
		   pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* otoooooo*/
		l = binop(OR, k, l, typnull);			/* ttoooooo*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* oottoooo*/
		l = binop(OR, l, m, typnull);			/* ttttoooo*/
		m = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
								/* ooootttt*/
		l = binop(OR, l, m, typnull);			/* T,0,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
								/* 0,T,0,0 */
		l = binop(OR, l, m, typnull);			/* T,T,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 15ULL), typnull);
						/* 0,ooooooot,T,ttttttto */
		l = binop(OR, l, m, typnull);		/* T,T,T,ttttttto */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
							/* ottttttt,T,T,T */
		l = unop(NOT, l, typnull);		/* 1fffffff,F,F,F */

		/* Apply the mask to the LSbs */
		i = binop(AND, i, l, typnull);

		return(i);

	case SHL: /* 32ss */
		/* the same thing signed or unsigned */
		return(binop(op, arg0, arg1, typ32u));

	case SHR: /* 32ss */
		/* Not expanded here: falls out to the return(-1) below */
		break;

	case INTRLVLOW: /* 32ss */
	case INTRLVHIGH: /* 32ss */
		/* 16-bit to 32-bit interleave of 16-bit fields */
		/* These are the same thing as unsigned */
		return(binop(op, arg0, arg1, typ32u));

	case PACK: /* 32ss */
		/* 64ss -> 32ss */
		/* PACK (with saturation) each 64-bit field value to
		   32 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		/* Saturate arg0 */
		/* Clamp to 0x80000000..0x7fffffff with signed 64-bit MIN
		   then MAX */
		i = binop(MIN,
			  arg0,
			  immedu(cvt1x64uto2x64u((p64_t)0x000000007fffffffULL)),
			  typ64);
		i = binop(MAX,
			  i,
			  immedu(cvt1x64uto2x64u((p64_t)0xffffffff80000000ULL)),
			  typ64);

		/* Saturate arg1 */
		j = binop(MIN,
			  arg1,
			  immedu(cvt1x64uto2x64u((p64_t)0x000000007fffffffULL)),
			  typ64);
		j = binop(MAX,
			  j,
			  immedu(cvt1x64uto2x64u((p64_t)0xffffffff80000000ULL)),
			  typ64);

		/* Pack as signed */
		return(binop(PACK, i, j, typ32));

	default: /* 32ss */
		{
			/* Any op without a case above is reported through
			   bug() */
			char buf[64];
			snprintf(buf,
				64,
				"binop32ss op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

int
binop32f(int op,
int arg0,
int arg1)
{
	/* 32-bit float field binary ops */
	/* Convert the op into a tree of subops if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/
	/* Parameters:
	     op          operation token selecting one of the cases below
	     arg0, arg1  operands; arg1 is also used to index tup[] when
	                 checking for a NUM constant
	   Returns the value of the last binop()/unop() of the chosen
	   expansion, a zero immediate after a reported user error, or -1
	   when the switch is left with "break" or after bug().
	*/

	register int i, j, k;

	switch (op) {
	case ADD: /* 32f */
	case SUB:
	case MUL: /* This will have to generate vmaddfp with (arg0*arg1)+0 */
	case MIN:
	case MAX:
	case EQ:
	case GT:
	case GE:
	case RCP:
		break;

	case RCP1:
	case RCP2:
		if (!(optcpu & (CPU_3DNow | CPU_athlon)))
			bug("RCP1/2 not used for non-3DNow!/Athlon targets");
		break;
	case AVG: /* 32f */
		/* add and multiply by 0.5 */
		i = binop(ADD, arg0, arg1, typ32f);
		{
			p64_t t;
			t.sf[0] = 0.5;
			t.sf[1] = 0.5;
			j = immed64u(t);
		}
		return(binop(MUL, i, j, typ32f));

	case SHL: /* 32f */
		/* multiply by appropriate constant */
		/* NOTE(review): 1 << count is evaluated in int, so a count
		   outside 0..30 is undefined behaviour -- confirm callers
		   never pass such a constant. */
		if (tup[arg1].op == NUM) {
			p64_t t = (p64_t) tup[arg1].immed.q[0];
			t.sf[0] = 1.0 * (1 << t.d[0]);
			t.sf[1] = 1.0 * (1 << t.d[1]);
			return(binop(MUL, arg0, immed64u(t), typ32f));
		}
		/* shift by a vector is NYI */
		error("shift left of float values only implemented for a "
		      "constant shift");
		return(immed64u((p64_t) 0ULL));

	case SHR: /* 32f */
		/* multiply by appropriate constant */
		/* NOTE(review): same 1 << count range caveat as for SHL */
		if (tup[arg1].op == NUM) {
			p64_t t = (p64_t) tup[arg1].immed.q[0];
			t.sf[0] = 1.0 / (1 << t.d[0]);
			t.sf[1] = 1.0 / (1 << t.d[1]);
			return(binop(MUL, arg0, immed64u(t), typ32f));
		}
		/* shift by a vector is NYI */
		error("shift right of float values only implemented for a "
		      "constant shift");
		return(immed64u((p64_t) 0ULL));

	case DIV: /* 32f */
		/* Constant divisor: multiply by its reciprocal, computed
		   here */
		if (tup[arg1].op == NUM) {
			p64_t t = (p64_t) tup[arg1].immed.q[0];
			t.sf[0] = 1.0 / t.sf[0];
			t.sf[1] = 1.0 / t.sf[1];
			return(binop(MUL, arg0, immed64u(t), typ32f));
		}


		/* divide by a vector using reciprocal */
		if (optcpu & CPU_3DNow) {
			/* 3DNow's reciprocal is not a partitioned operation,
			   so the low and high fields are handled separately
			   and merged with AND masks and an OR */
			i = unop(RCP, arg1, typ32f);

		    #define NOTYET
		    #ifdef NOTYET
			/* The only test I did on this, the iterations just got
			   worse.  Until I figure out why, I'll just take the
			   14-bit approx.
			*/
			/* NOTE(review): NOTYET is defined just above, so the
			   RCP1/RCP2 steps below ARE compiled in, which
			   contradicts the comment above -- confirm which
			   variant is intended. */
			j = binop(INTRLVLOW, arg1, arg1, typ64u);
			j = binop(RCP1, j, i, typ32f);
			k = binop(RCP2, i, j, typ32f);
		    #else
			k = i;
		    #endif
			/* Keep the low field of the low-field reciprocal */
			k = binop(AND,
				  k,
				  immedu(cvt1x64uto2x64u((p64_t)0xffffffffULL)),
				  typnull);

			/* Bring the high field down and repeat */
			i = binop(SHR, arg1, immed64u((p64_t) 32ULL), typnull);
			i = unop(RCP, i, typ32f);
		    #ifdef NOTYET
			j = binop(INTRLVHIGH, arg1, arg1, typ64u);
			j = binop(RCP1, j, i, typ32f);
			j = binop(RCP2, i, j, typ32f);
		    #else
			j = i;
		    #endif
		    #undef NOTYET
			/* Keep the high field of the high-field reciprocal */
			j = binop(AND,
			  j,
			  immedu(cvt1x64uto2x64u((p64_t)0xffffffff00000000ULL)),
			  typnull);
	
			j = binop(OR, j, k, typnull);
			return(binop(MUL, arg0, j, typ32f));
		} else {
			/* AltiVec's reciprocal is a partitioned operation */
			i = unop(RCP, arg1, typ32f);
			return(binop(MUL, arg0, i, typ32f));
		}

	case XOR: /* 32f */
	case OR: /* 32f */
	case ANDN: /* 32f */
	case AND: /* 32f */
		/* the same thing as signed integer */
		return(binop(op, arg0, arg1, typ32));

	case NE: /* 32f */
		return(unop(NOT, binop(EQ, arg0, arg1, typ32f), typnull));
	case LT: /* 32f */
		return(binop(GT, arg1, arg0, typ32f));
	case LE: /* 32f */
		return(binop(GE, arg1, arg0, typ32f));

	case LAND: /* 32f */
	case LOR: /* 32f */
		/* the same thing as signed integer */
		return(binop(op, arg0, arg1, typ32));

	case PACK: /* 32f */
	case INTRLVLOW: /* 32f */
	case INTRLVHIGH: /* 32f */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop32f op=%s (un)packs should never be "
				"floating-point operations",
				opname(op));
			bug(buf);
		}
		/* Stop here: without this break, a returning bug() would
		   fall into the MOD and default cases and report two more
		   unrelated diagnostics. */
		break;

	case MOD: /* 32f */
		error("invalid operands to binary %");
		/* A user error, not an internal one: return a dummy zero
		   immediate as the SHL/SHR error paths do, instead of
		   falling into the default case's bug(). */
		return(immed64u((p64_t) 0ULL));

	default: /* 32f */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop32f op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}


int
binop64u(int op,
int arg0,
int arg1)
{
	/* 64-bit unsigned field binary ops */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i=-1, j, k;

	if (optcpu == GenericIA32) {
		bug("Field sizes greater than a fragment are not supported");
	}

/* Left off here adding CPU_MAX */
	switch (op) {
	case ADD: /* 64u */
		if (optcpu & CPU_MAX) {
			break;
		} else if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			i = binop(ADD, arg0, arg1, typ32u);
			j = binop(LT, i, arg0, typ32u);
			k = binop(LT, i, arg1, typ32u);
			j = binop(OR, j, k, typnull);
			j = binop(AND,
				  j,
				  immedu(cvt1x32uto4x32u(
					(p32_t) 0x80000000U)),
				  typnull);
			j = binop(SHL, j, immed64u((p64_t)1ULL), typnull);
			return (binop(ADD, i, j, typ32u));
		} else {
			/* use implicit spacer technique */
			i = binop(AND,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x7fffffffffffffffULL)),
				typnull);
			j = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x7fffffffffffffffULL)),
				typnull);
			i = binop(ADD, i, j, typ128u);
			j = binop(XOR, arg0, arg1, typnull);
			j = binop(AND,
				j,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
				typnull);
			return(binop(XOR, i, j, typnull));
		}

	case SUB: /* 64u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			break;
		} else {
			/* use implicit spacer technique */
			i = binop(OR,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
				typnull);
			j = binop(AND,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x7fffffffffffffffULL)),
				typnull);
			i = binop(SUB, i, j, typ128u);
			j = binop(XOR, arg0, arg1, typnull);
			j = binop(AND,
				j,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
				typnull);
			j = binop(XOR,
				j,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
				typnull);
			return(binop(XOR, i, j, typnull));
		}

	case MUL: /* 64u */
		if (optcpu & CPU_MMX) {
			break;
		} else {
#ifdef NOTDEFD
			/* Use 128-bit MUL
			   unsigned interleave, mul, cast via modulation, pack
			   with saturation sequence
			*/
			i = binop(INTRLVLOW,
				  arg0,
				  immed64u((p64_t) 0ULL),
				  typ128u);
			j = binop(INTRLVLOW,
				  arg1,
				  immed64u((p64_t) 0ULL),
				  typ128u);
			k = binop(MUL, i, j, typ128u);
			k = binop(AND,
				  k,
				  immed128((p128_t)
					   {0xffffffffffffffffULL, 0ULL}),
				  typnull);

			i = binop(INTRLVHIGH,
				  arg0,
				  immed64u((p64_t) 0ULL),
				  typ128u);
			j = binop(INTRLVHIGH,
				  arg1,
				  immed64u((p64_t) 0ULL),
				  typ128u);
			i = binop(MUL, i, j, typ128u);
			i = binop(AND,
				i,
				immed128((p128_t)
					 {0xffffffffffffffffULL, 0ULL}),
				typnull);

			return(binop(PACK, k, i, typ64us));
#else
			unsigned long long step;

			/* Perform a shift-add sequence */
			i = immed64u((p64_t) 0ULL);
			for (step=0ULL; step<64ULL; ++step)
			{
				j = binop(AND,
					  arg0,
					  immed64u((p64_t) (1ULL<<step)),
					  typnull);
				k = binop(NE,
					  j,
					  immed64u((p64_t) 0ULL),
					  typ64u);
				j = binop(AND, arg0, k, typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) step),
					  typ64u);
				i = binop(ADD, i, j, typ64u);
			}
			return(i);
#endif
		}

	case DIV: /* 64u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* use 128-bit divides */
			i = binop(AND,
				arg0,
				immed128((p128_t)
					 {{0xffffffffffffffffULL, 0ULL}}),
				typnull);
			j = binop(AND,
				arg1,
				immed128((p128_t)
					 {{0xffffffffffffffffULL, 0ULL}}),
				typnull);
			i = binop(DIV, i, j, typ128u);
			j = binop(SHR, arg0, immed64u((p64_t) 64ULL), typ128u);
			k = binop(SHR, arg1, immed64u((p64_t) 64ULL), typ128u);
			j = binop(DIV, j, k, typ128u);
			j = binop(SHL, j, immed64u((p64_t) 64ULL), typ128u);
			return(binop(OR, i, j, typnull));
		}

	case MOD: /* 64u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX) ||
		    (optcpu & CPU_AltiVec)) {
			break;
		} else {
			/* use 128-bit modulus */
			i = binop(AND,
				arg0,
				immed128((p128_t)
					 {{0xffffffffffffffffULL, 0ULL}}),
				typnull);
			j = binop(AND,
				arg1,
				immed128((p128_t)
					 {{0xffffffffffffffffULL, 0ULL}}),
				typnull);
			i = binop(MOD, i, j, typ128u);
			j = binop(SHR, arg0, immed64u((p64_t) 64ULL), typ128u);
			k = binop(SHR, arg1, immed64u((p64_t) 64ULL), typ128u);
			j = binop(MOD, j, k, typ128u);
			j = binop(SHL, j, immed64u((p64_t) 64ULL), typ128u);
			return(binop(OR, i, j, typnull));
		}

	case MIN: /* 64u */
		if (optcpu & CPU_MMX) {
			break;
		} else {
			/* use GT */
			i = binop(GT, arg0, arg1, typ64u);
			j = binop(AND, i, arg1, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		}

	case MAX: /* 64u */
		if (optcpu & CPU_MMX) {
			break;
		} else {
			/* use GT */
			i = binop(GT, arg0, arg1, typ64u);
			j = binop(AND, i, arg0, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		}

	case AVG: /* 64u */
		if (optcpu & CPU_MMX) {
			break;			/* Is this really here? */
		} else {
			/* Average rounds up */
			i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typ64u);
			j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typ64u);
			i = binop(ADD, i, j, typ64u);
			j = binop(OR, arg0, arg1, typnull);
			j = binop(AND,
				j,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x0000000000000001ULL)),
				typnull);
			i = binop(ADD, i, j, typ64u);
			return(i);
		}

	case AND: /* 64u */
		break;

	case ANDN: /* 64u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			break;
		} else {
			i = unop(NOT, arg0, typnull);
			return(binop(AND, i, arg1, typnull));
		}

	case OR: /* 64u */
	case XOR: /* 64u */
		break;

	case EQ: /* 64u */
		if (optcpu & CPU_MAX) {
			break;
		} else if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
			/* use 32-bit EQ */
			i = binop(EQ, arg0, arg1, typ32u);
			j = binop(SHL, i, 32ULL, typnull);
			i = binop(AND, i, j, typnull);
			j = binop(SHR, i, 32ULL, typ64u);
			return(binop(OR, i, j, typnull));
		} else {
			/* use 128-bit EQ */
			i = binop(AND,
				arg0,
				immed128((p128_t)
					 {{0xffffffffffffffffULL, 0ULL}}),
				typnull);
			j = binop(AND,
				arg1,
				immed128((p128_t)
					 {{0xffffffffffffffffULL, 0ULL}}),
				typnull);
			i = binop(EQ, i, j, typ128u);
			i = binop(AND,
				i,
				immed128((p128_t)
					 {{0xffffffffffffffffULL, 0ULL}}),
				typnull);
			j = binop(AND,
				arg0,
				immed128((p128_t)
					 {{0ULL, 0xffffffffffffffffULL}}),
				typnull);
			k = binop(AND,
				arg1,
				immed128((p128_t)
					 {{0ULL, 0xffffffffffffffffULL}}),
				typnull);
			j = binop(EQ, j, k, typ128u);
			j = binop(AND,
				j,
				immed128((p128_t)
					 {{0ULL, 0xffffffffffffffffULL}}),
				typnull);
			return(binop(OR, i, j, typnull));
		}

	case NE: /* 64u */
		if (optcpu & CPU_MMX) {
			break;
		} else {
			i = binop(EQ, arg0, arg1, typ64u);
			return(unop(NOT, i, typnull));
		}

	case GT: /* 64u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			break;
		} else if (0) {
			/* If there is a C GT64u available, do this... */
			i = binop(GT_C, arg0, arg1, typ64u);
			return(unop(NEG, i, typ64));
		} else if (optcpu & CPU_AltiVec) {
			/* If GT32u and EQ32u are available, do this... */
			i = binop(GT, arg0, arg1, typ32u);
			j = binop(EQ, arg0, arg1, typ32u);
			k = binop(SHL, i, immed64u((p64_t)32ULL), typnull);
			j = binop(AND, j, k, typnull);
			i = binop(OR, i, j, typnull);
			i = binop(AND,
				  i,
				  immedu(cvt1x64uto2x64u(
						(p64_t)0xffffffff00000000ULL)),
				  typnull);
			j = binop(SHR, i, immed64u((p64_t)32ULL), typnull);
			return(binop(OR, i, j, typnull));
		} else {
			/* Add offset and do signed GT */
			i = binop(ADD,
				arg0,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
				typ64u);
			j = binop(ADD,
				arg1,
				immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
				typ64u);
			return(binop(GT, i, j, typ64));
		}

	case GT_C: /* 64u */
		if (optcpu & CPU_MAX) {
			break;
		} else {
			bug("GT_C not available for this target");
		}

	case GE: /* 64u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			break;
		} else {
			/* x GE y is (x EQ y) OR (x GT y) */
			i = binop(GT, arg0, arg1, typ64u);
			j = binop(EQ, arg0, arg1, typ64u);
			return(binop(OR, i, j, typnull));
		}

	case LT: /* 64u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			break;
		} else {
			return(binop(GT, arg1, arg0, typ64u));
		}

	case LE: /* 64u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX)) {
			break;
		} else {
			return(binop(GE, arg1, arg0, typ64u));
		}

	case LAND: /* 64u */
		if (optcpu & CPU_MMX) {
			break;
		} else {
			/* use 64-bit NE 0 to normalize fields before AND */
			i = binop(NE, arg0, immed64u((p64_t) 0x0ULL), typ64u);
			j = binop(NE, arg1, immed64u((p64_t) 0x0ULL), typ64u);
			return(binop(AND, i, j, typnull));
		}

	case LOR: /* 64u */
		if (optcpu & CPU_MMX) {
			break;
		} else {
			/* use 64-bit NE 0 to normalize fields after ORing */
			i = binop(OR, arg0, arg1, typnull);
			return(binop(NE, i, immed64u((p64_t) 0ULL), typ64u));
		}

	case SHL: /* 64u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX))
			break;

		if (bitsperfrag() == 64) {
			bug("Can't emulate frag-sized SHL");
			break;
		}

		arg1 = shiftconst(arg1, typ8u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			i = binop(SHL,
				  arg0,
				  immed64u((p64_t)tup[arg1].immed.uq[0]),
				  typnull);

			if (tup[arg1].immed.q[0] > 63)
			    return (immed64u((p64_t) 0ULL));
			else
			    return (binop(AND,
					  i,
					  immedu(cvt1x64uto2x64u((p64_t)
						(0xffffffffffffffffULL <<
						 tup[arg1].immed.uq[0]))),
					  typnull));
		}
		error("shift left of unsigned 64-bit field values only "
		      "implemented for a constant shift");
		return(immed64u((p64_t) 0ULL));

	case SHR: /* 64u */
		if ((optcpu & CPU_MMX) || (optcpu & CPU_MAX))
			break;

		if (bitsperfrag() == 64) {
			bug("Can't emulate frag-sized SHR");
			break;
		}

		arg1 = shiftconst(arg1, typ8u);
		if (tup[arg1].op == NUM) {
			/* Shift by a constant is easy */
			i = binop(SHL,
				  arg0,
				  immed64u((p64_t)tup[arg1].immed.uq[0]),
				  typnull);

			if (tup[arg1].immed.q[0] > 63ULL)
			    return (immed64u((p64_t) 0ULL));
			else
			    return (binop(AND,
					  i,
					  immedu(cvt1x64uto2x64u((p64_t)
						(0xffffffffffffffffULL >>
						 tup[arg1].immed.uq[0]))),
					  typnull));
		}
		error("shift right of unsigned 64-bit field values only "
		      "implemented for a constant shift");
		return(immed64u((p64_t) 0ULL));

	case PACK: /* 64u */
		/* 128u -> 64u */
		/* PACK (without saturation) each 128-bit field value to
		   64 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		/* Fail for narrow targets */
		if (bitsperfrag() < 128) {
			char buf[64];
			snprintf(buf,
				 64,
				 "PACK64u failed for %d-bit target",
				 bitsperfrag());
			info(0, buf);
		}

		/* Keep low 64 bits of arg0 in packed form in
		   low half */
		i = binop(AND,
			  arg0,
			  immed64u((p64_t)0xffffffffffffffffULL),
			  typnull);

		/* Move low 64 bits of arg1 in packed form to
		   high half */
		k = binop(SHL, arg1, immed64u((p64_t) 64ULL), typnull);

		/* Combine packed arg0 and arg1 in result */
		return(binop(OR, i, k, typnull));

	case INTRLVLOW: /* 64u */
	    /* 32-bit to 64-bit interleave of 32-bit fields */
	    if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
		break;		/* vmrglw for AltiVec */
	    } else {
			int bpf = bitsperfrag();

			if (bpf < 64)
			{
				bug("Field sizes greater than a fragment are "
				    "not supported");
				break;
			}

			/* high bit */
			j = arg1;
			switch (bitsperfrag()) {
			case 128:
				i = binop(AND,
					j,
					immed64u((p64_t) 0x00000000ffffffffULL),
					typnull);
				j = binop(AND,
					j,
					immed64u((p64_t) 0xffffffff00000000ULL),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 32ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 64:
				i = binop(AND,
					  j,
					  (bpf==64)?
						immed64u((p64_t)
							0x000000000000ffffULL):
					  immedu(cvt1x64uto2x64u(
						(p64_t)0x000000000000ffffULL)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==64)?
					      immed64u((p64_t)
							0x00000000ffff0000ULL):
					      immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffff0000ULL)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 16ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
			}
			k = binop(SHL, i, immed64u((p64_t) 1ULL), typnull);


			/* low bit */
			j = arg0;
			switch (bitsperfrag()) {
			case 128:
				i = binop(AND,
					j,
					immed64u((p64_t) 0x00000000ffffffffULL),
					typnull);
				j = binop(AND,
					j,
					immed64u((p64_t) 0xffffffff00000000ULL),
					typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 32ULL),
					  typnull);
				j = binop(OR, i, j, typnull);

			case 64:
				i = binop(AND,
					  j,
					  (bpf==64)?
						immed64u((p64_t)
							0x000000000000ffffULL):
					  immedu(cvt1x64uto2x64u(
						(p64_t)0x000000000000ffffULL)),
					  typnull);
				j = binop(AND,
					  j,
					  (bpf==64)?
					      immed64u((p64_t)
							0x00000000ffff0000ULL):
					      immedu(cvt1x64uto2x64u(
						(p64_t)0x00000000ffff0000ULL)),
					  typnull);
				j = binop(SHL,
					  j,
					  immed64u((p64_t) 16ULL),
					  typnull);
				i = binop(OR, i, j, typnull);
			}

		return(binop(OR, i, k, typnull));
	    }

	case INTRLVHIGH: /* 64u */
	    /* 32-bit to 64-bit interleave of 32-bit fields */
	    if ((optcpu & CPU_MMX) || (optcpu & CPU_AltiVec)) {
		break;		/* vmrghw for AltiVec */
	    } else {
		/* sneaky way to reuse INTRLVLOW code... */
		unsigned long long bpf_2 =
			(unsigned long long) bitsperfrag()/2ULL;
		i = binop(SHR, arg0, immed64u((p64_t) bpf_2), typnull);
		j = binop(SHR, arg1, immed64u((p64_t) bpf_2), typnull);
		return(binop(INTRLVLOW, i, j, typ64u));
	    }

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop64u op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

int
binop64us(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 64-bit unsigned field binary ops with saturation.

	   Convert the op into a tree of subops if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.

	   op        - operation token (EQ, ADD, PACK, ...)
	   arg0/arg1 - operand tuples; handed straight to binop()/unop()
	   decl_bits - not referenced anywhere in this function

	   Returns whatever the last binop() of the expansion returns.
	   SHL and SHR are not expanded here; they leave the switch and the
	   function returns -1.  An unrecognized op is reported through bug()
	   and, if bug() returns, also yields -1.
	*/

	register int i, j, k, l;

	switch (op) {
	case EQ: /* 64us */
	case NE: /* 64us */

	case GT: /* 64us */
	case GE: /* 64us */
	case LT: /* 64us */
	case LE: /* 64us */

	case MIN: /* 64us */
	case MAX: /* 64us */
	case AVG: /* 64us */

	case AND: /* 64us */
	case ANDN: /* 64us */
	case OR: /* 64us */
	case XOR: /* 64us */

	case LAND: /* 64us */
	case LOR: /* 64us */
		/* These are the same as unsigned unsaturated */
		return (binop(op, arg0, arg1, typ64u));

	case ADD: /* 64us */
		/* Do 64u add of LSbs.  With the MSb of both operands cleared
		   the add cannot carry out of the field, so the MSb of the
		   result is exactly the carry into the MSb position. */
		i = binop(AND,
			  arg0,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x7fffffffffffffffULL)),
			  typnull);
		j = binop(AND,
			  arg1,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x7fffffffffffffffULL)),
			  typnull);
		k = binop(ADD, i, j, typ64u);	/* sum(LSbs). Cin = MSb(k) */

		/* Find propagate and generate for MSb */
		i = binop(XOR, arg0, arg1, typnull);		/* prop */
		j = binop(AND, arg0, arg1, typnull);		/* gen */

		/* If no OF, then sum = prop(MSb) + sum(LSbs) */
		l = binop(AND,
			  i,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);
		l = binop(ADD, k, l, typ64u);			/* SUM */

		/* overflow = (prop & cin) | gen */
		i = binop(AND, i, k, typnull);
		i = binop(OR, i, j, typnull);
		/* Keep only the MSb of the overflow term just computed.
		   (This used to mask arg1 instead of i, which discarded the
		   overflow term and saturated on MSb(arg1) alone.) */
		i = binop(AND,				/* MSb(i) = OF */
			  i,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);

		/* ...and create a saturation mask by smearing the OF bit
		   down through the rest of the field */
		j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
		i = binop(OR, j, i, typnull);
		j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
		i = binop(OR, j, i, typnull);
		j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
		i = binop(OR, j, i, typnull);
		j = binop(SHR, i, immed64u((p64_t) 8ULL), typnull);
		i = binop(OR, j, i, typnull);
		j = binop(SHR, i, immed64u((p64_t) 16ULL), typnull);
		i = binop(OR, j, i, typnull);
		j = binop(SHR, i, immed64u((p64_t) 32ULL), typnull);
		i = binop(OR, j, i, typnull);

		/* Clobber the calculated value with the max on overflow */
		return(binop(OR, l, i, typnull));

	case SUB: /* 64us */
		/* Do unsaturated 64u sub */
		i = binop(SUB, arg0, arg1, typ64u);

		/* NegSat mask is (arg0 < arg1) -> want NOT of this */
		j = binop(GE, arg0, arg1, typ64u);

		/* Clobber the calculated value with zero on overflow */
		return(binop(AND, j, i, typnull));

	case MUL: /* 64us */
		/* Do unsigned mul */
		i = binop(MUL, arg0, arg1, typ64u);

		/* Do high mul, and convert to saturation mask.
		   First OR every bit of the high product up into the MSb... */
		j = binop(MULH, arg0, arg1, typ64u);
		k = binop(SHL, j, immed64u((p64_t) 32ULL), typnull);
		j = binop(OR, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(OR, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(OR, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(OR, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(OR, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, j, k, typnull);
		/* ...isolate that MSb... */
		j = binop(AND,
			  j,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);
		/* ...and smear it back down across the field */
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, j, k, typnull);
		k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(OR, j, k, typnull);
		k = binop(SHR, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(OR, j, k, typnull);
		k = binop(SHR, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(OR, j, k, typnull);
		k = binop(SHR, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(OR, j, k, typnull);
		k = binop(SHR, j, immed64u((p64_t) 32ULL), typnull);
		j = binop(OR, j, k, typnull);

		/* Clobber the low product with the max where the high
		   product was nonzero */
		return(binop(OR, i, j, typnull));

	case DIV: /* 64us */
		/* This method will bomb on div by 0 instead of saturating,
		   which is, of course, the point of doing a sat DIV. */

		/* Do divide as usual */
		i = binop(DIV, arg0, arg1, typ64u);

		/* Generate a saturation mask: AND every bit of ~arg1 up into
		   the MSb, so the MSb is set only when the divisor field is
		   all zeros */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 32ULL), typnull);
		j = binop(AND, j, k, typnull);

		/* Isolate the MSb... */
		j = binop(AND,
			  j,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);

		/* ...and smear it down across the field */
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 32ULL), typnull);
		j = binop(OR, k, j, typnull);

		/* Clobber the calculated value with MAX if arg1==0 */
		return(binop(OR, i, j, typnull));

	case MOD: /* 64us */
		/* This method will bomb on mod by 0 instead of saturating,
		   which is, of course, the point of doing a sat MOD. */

		/* Do modulus as usual */
		i = binop(MOD, arg0, arg1, typ64u);

		/* Generate a saturation mask: same zero-divisor detection as
		   the DIV case above */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 32ULL), typnull);
		j = binop(AND, j, k, typnull);

		j = binop(AND,
			  j,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);

		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(OR, k, j, typnull);
		k = binop(SHR, j, immed64u((p64_t) 32ULL), typnull);
		j = binop(OR, k, j, typnull);

		/* Clobber the calculated value with MAX if arg1==0 */
		return(binop(OR, i, j, typnull));

	case SHL: /* 64us */
	case SHR: /* 64us */
		/* done in the obvious way */
		/* Nothing is expanded here; falls out to the return(-1) */
		break;

	case PACK: /* 64us */
		/* 128us -> 64us */
		/* PACK (with unsigned saturation) each 128-bit field value to
		   64 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		/* Fail for narrow targets */
		if (bitsperfrag() < 128) {
			char buf[64];
			snprintf(buf,
				64,
				"PACK64us failed for %d-bit target",
				bitsperfrag());
			bug(buf);
		}


		/* Calculate the saturated values, then pack as unsaturated */
		/* OR-reduce arg0 downward, pick out one detector bit, then
		   smear that bit down and OR it over arg0. */
		i = binop(SHR, arg0, immed64u((p64_t) 32ULL), typnull);
		i = binop(OR, arg0, i, typnull);
		j = binop(SHR, i, immed64u((p64_t) 16ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 8ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, i, j, typnull);
		/* NOTE(review): this constant selects bit 32 of the second
		   64-bit element.  It looks carried over from a narrower
		   PACK and may not detect every set bit in the upper half
		   of a 128-bit field -- confirm against p128_t layout. */
		j = binop(AND,
			  j,
			  immed128((p128_t) {{0ULL, 0x0000000100000000ULL}}),
			  typnull);
		j = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		i = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 8ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 16ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 32ULL), typnull);
		i = binop(OR, i, j, typnull);
		i = binop(OR, arg0, i, typnull);


		/* Keep low 64 bits of arg0 in packed form in
		   low half */
		k = binop(AND,
			  i,
			  immed64u((p64_t)0xffffffffffffffffULL),
			  typnull);


		/* Calculate the saturated values, then pack as unsaturated */
		/* Same sequence as above, this time applied to arg1 */
		i = binop(SHR, arg1, immed64u((p64_t) 32ULL), typnull);
		i = binop(OR, arg1, i, typnull);
		j = binop(SHR, i, immed64u((p64_t) 16ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 8ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 1ULL), typnull);
		j = binop(OR, i, j, typnull);
		j = binop(AND,
			  j,
			  immed128((p128_t) {{0ULL, 0x0000000100000000ULL}}),
			  typnull);
		j = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		i = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 2ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 4ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 8ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 16ULL), typnull);
		i = binop(OR, i, j, typnull);
		j = binop(SHR, i, immed64u((p64_t) 32ULL), typnull);
		i = binop(OR, i, j, typnull);
		i = binop(OR, arg1, i, typnull);

		/* Move low 64 bits of arg1 in packed form to high half */
		i = binop(SHL,
			  i,
			  immed64u((p64_t) 64ULL),
			  typnull);

		/* Combine packed arg0 and arg1 in result */
		return(binop(OR, i, k, typnull));

	case INTRLVLOW: /* 64us */
	case INTRLVHIGH: /* 64us */
		/* 32-bit to 64-bit interleave of 32-bit fields */
		/* These are the same as unsigned */
		return(binop(op, arg0, arg1, typ64u));

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop64us op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	/* Reached for SHL/SHR, and for an unknown op if bug() returns */
	return(-1);
}

int
binop64(int op,
int arg0,
int arg1)
{
	/* 64-bit signed field binary ops.

	   Convert the op into a tree of subops if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.

	   op        - operation token (ADD, GT, SHR, ...)
	   arg0/arg1 - operand tuples; handed to binop()/unop().  For SHR,
	               arg1 is first passed through shiftconst() and then
	               inspected through tup[].

	   Returns whatever the last binop()/unop() of the expansion returns.
	   Returns -1 when the op is left for the target (GT on MMX, MAX or
	   GenericIA32; SHR on MAX) and, if bug() returns, for an unknown op.
	   A signed SHR by a non-constant count is reported via error() and a
	   zero immediate is returned.
	*/

	register int i, j, k;

	switch (op) {
	case ADD: /* 64s */
		/* I'm not sure this is true for the way I emulate MMX ADD64u */
	case SUB: /* 64s */
	case MUL: /* 64s */
	case DIV: /* 64s */
	case MOD: /* 64s */

	case AND: /* 64s */
	case ANDN: /* 64s */
	case OR: /* 64s */
	case XOR: /* 64s */

	case EQ: /* 64s */
	case NE: /* 64s */

	case LAND: /* 64s */
	case LOR: /* 64s */

	case PACK: /* 64s */
	case INTRLVLOW: /* 64s */
	case INTRLVHIGH: /* 32-bit to 64-bit interleave of 32-bit fields */
		/* These are the same as unsigned */
		return(binop(op, arg0, arg1, typ64u));

	case MIN: /* 64s */
		/* use GT: select arg1 where arg0 > arg1, else arg0 */
		if (optcpu & CPU_MMX) {
			/* MMX branch uses ANDN in place of NOT followed by
			   AND */
			i = binop(GT, arg0, arg1, typ64);
			j = binop(AND, i, arg1, typnull);
			i = binop(ANDN, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			i = binop(GT, arg0, arg1, typ64);
			j = binop(AND, i, arg1, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg0, typnull);
			return(binop(OR, i, j, typnull));
		}

	case MAX: /* 64s */
		/* use GT: select arg0 where arg0 > arg1, else arg1 */
		if (optcpu & CPU_MMX) {
			i = binop(GT, arg0, arg1, typ64);
			j = binop(AND, i, arg0, typnull);
			i = binop(ANDN, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		} else {
			i = binop(GT, arg0, arg1, typ64);
			j = binop(AND, i, arg0, typnull);
			i = unop(NOT, i, typnull);
			i = binop(AND, i, arg1, typnull);
			return(binop(OR, i, j, typnull));
		}

	case AVG: /* 64s */
		/* Average rounds up */
		/* Halve each operand first, then add the halves */
		i = binop(SHR, arg0, immed64u((p64_t) 1ULL), typ64);
		j = binop(SHR, arg1, immed64u((p64_t) 1ULL), typ64);
		i = binop(ADD, i, j, typ64);

		/* Calculate and add rounding bit */
		j = binop(OR, arg0, arg1, typnull);
		j = binop(AND,
			  j,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x0000000000000001ULL)),
			  typnull);
		i = binop(ADD, i, j, typ64);
		return(i);

	case LT: /* 64s */
		/* x LT y is y GT x */
		return(binop(GT, arg1, arg0, typ64));

	case LE: /* 64s */
		/* x LE y is y GE x */
		return(binop(GE, arg1, arg0, typ64));

	case GT: /* 64s */
		if ((optcpu & CPU_MMX) ||
		    (optcpu & CPU_MAX) ||
		    (optcpu == GenericIA32)) {
			break;
		} else {
			/* use 128-bit GT */
			/* Compare even fields (sign-extended to 128 bits) */
			/* The SHL by 64 followed by a signed 128-bit SHR by
			   64 is what performs the sign extension */
			i = binop(AND,
				arg0,
				immed128((p128_t)
					 {{0xffffffffffffffffULL, 0ULL}}),
				typnull);
			i = binop(SHL, i, immed64u((p64_t) 64ULL), typnull);
			i = binop(SHR, i, immed64u((p64_t) 64ULL), typ128);
			j = binop(AND,
				arg1,
				immed128((p128_t)
					 {{0xffffffffffffffffULL, 0ULL}}),
				typnull);
			j = binop(SHL, j, immed64u((p64_t) 64ULL), typnull);
			j = binop(SHR, j, immed64u((p64_t) 64ULL), typ128);
			i = binop(GT, i, j, typ128);
			i = binop(AND,
				i,
				immed128((p128_t)
					 {{0xffffffffffffffffULL, 0ULL}}),
				typnull);

			/* Compare odd fields */
			j = binop(AND,
				arg0,
				immed128((p128_t)
					 {{0ULL, 0xffffffffffffffffULL}}),
				typnull);
			k = binop(AND,
				arg1,
				immed128((p128_t)
					 {{0ULL, 0xffffffffffffffffULL}}),
				typnull);
			j = binop(GT, j, k, typ128);
			j = binop(AND,
				j,
				immed128((p128_t)
					 {{0ULL, 0xffffffffffffffffULL}}),
				typnull);

			/* Combine */
			return(binop(OR, i, j, typnull));
		}

#ifdef NOTDEFD
		/* If there is a C GT64 available, do this... */
			i = binop(GT_C, arg0, arg1, typ64);
			return(unop(NEG, i, typ64));

	case GT_C: /* 64s */
		if (optcpu & CPU_MAX)
			break;
#endif

	case GE: /* 64s */
		/* the obvious hack is that x GE y is (x EQ y) OR (x GT y) */
		i = binop(GT, arg0, arg1, typ64);
		j = binop(EQ, arg0, arg1, typ64);
		return(binop(OR, i, j, typnull));

	case SHL: /* 64s */
		/* the same thing signed or unsigned */
		return(binop(op, arg0, arg1, typ64u));

	case SHR: /* 64s */
		arg1 = shiftconst(arg1, typ8u);

		if (optcpu & CPU_MAX) {
			/* "SHRD,S,cond r,sa,t" See page 7-48 of Kane */
			break;
		}

		/* MMX does not directly do 64-bit arithmetic shifts...
		   use unsigned shift, but paste-in sign extension
		*/
		if (tup[arg1].op == NUM) {
			/* Done if shift count is 0 */
			if (tup[arg1].immed.q[0] == 0x0ULL) return(arg0);

			/* Shift by a constant is easy */
			/* Build a per-field sign mask: compare the isolated
			   MSb against the MSb constant */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x64uto2x64u(
						(p64_t)0x8000000000000000ULL)),
				  typnull);
			i = binop(EQ,
				  i,
				  immedu(cvt1x64uto2x64u(
						(p64_t)0x8000000000000000ULL)),
				  typ64u);

			/* A count past 63 shifts every value bit out, so the
			   result is the sign mask alone.  Returning here also
			   keeps the host-side ">>" below from being evaluated
			   with a count >= 64, which is undefined in C. */
			if (tup[arg1].immed.uq[0] > 63ULL) return(i);

			/* Keep the sign mask only in the bit positions the
			   unsigned shift vacates... */
			i = binop(ANDN,
				  immedu(cvt1x64uto2x64u(
						(p64_t)
						(0xffffffffffffffffULL >>
						tup[arg1].immed.uq[0]    ) )),
				  i,
				  typnull);
			/* ...and paste it over the unsigned shift result */
			i = binop(OR,
				  i,
				  binop(SHR, arg0, arg1, typ64u),
				  typnull);
			return(i);
		}
		error("shift right of signed 64-bit field values only "
		      "implemented for a constant shift");
		return(immed64u((p64_t) 0ULL));

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop64 op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	/* Reached when the op was left for the target, or for an unknown op
	   if bug() returns */
	return(-1);
}

int
binop64ss(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 64-bit signed field binary ops with saturation */
	/* Convert the op into a tree of subops if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.

	   op         operator token being lowered
	   arg0,arg1  tuple indices of the left and right operands
	   decl_bits  not referenced anywhere in this function

	   Returns the tuple index of the generated tree.  Cases that leave
	   the switch via "break" (SHL, and the default case after it has
	   reported a bug) return -1 instead.
	*/

	register int i, j, k, l, m;

	switch (op) {
	case AND: /* 64ss */
	case ANDN: /* 64ss */
	case OR: /* 64ss */
	case XOR: /* 64ss */

	case EQ: /* 64ss */
	case NE: /* 64ss */

	case LAND: /* 64ss */
	case LOR: /* 64ss */
		/* These are the same as unsigned unsaturated */
		return (binop(op, arg0, arg1, typ64u));

	case MIN: /* 64ss */
	case MAX: /* 64ss */
	case AVG: /* 64ss */

	case GT: /* 64ss */
	case LT: /* 64ss */
	case LE: /* 64ss */
	case GE: /* 64ss */
		/* These are the same as signed unsaturated */
		return (binop(op, arg0, arg1, typ64));

	case ADD: /* 64ss */
		/* Do the signed add */
		i = binop(ADD, arg0, arg1, typ64);

		/* Correct for positive saturation */
		j = binop(OR, arg0, arg1, typnull);
		j = unop(NOT, j, typnull);		/* tX..X if both pos */
		j = binop(AND, j, i, typnull);		/* MSb(sum) = 1 */
		j = binop(AND,
			  j,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);

		/* Smear the flag bit in j rightward over the lower bits */
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		l = binop(OR, k, j, typnull);
		k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 16ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 32ULL), typnull);
		l = binop(OR, k, l, typnull);			/* T...T */

		/* Where j flagged positive overflow, replace the sum with
		   PosSat: OR in the smeared mask, then clear the MSb */
		i = binop(OR, i, l, typnull);			/* T...T */
		j = unop(NOT, j, typnull);			/* f1..1 */
		i = binop(AND, i, j, typnull);


		/* Correct for negative saturation */
		j = binop(AND, arg0, arg1, typnull);	/* tX..X if both neg */
		k = unop(NOT, i, typnull);
		j = binop(AND, j, k, typnull);		/* MSb(sum) = 0 */
		j = binop(AND,
			  j,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);

		/* Smear the flag bit in j rightward over the lower bits */
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		l = binop(OR, k, j, typnull);
		k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 16ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 32ULL), typnull);
		l = binop(OR, k, l, typnull);			/* T...T */

		/* Where j flagged negative overflow, replace the sum with
		   NegSat: clear the masked bits, then set only the MSb */
		l = unop(NOT, l, typnull);			/* F...F */
		i = binop(AND, i, l, typnull);
		i = binop(OR, i, j, typnull);

		return(i);
		
	case SUB: /* 64ss */
		/* Do the signed sub */
		i = binop(SUB, arg0, arg1, typ64);

		/* Correct for positive saturation */
		m = binop(XOR, arg0, arg1, typnull);	/* tX..X if mixed */
		j = binop(AND, m, arg1, typnull); /* tX..X if arg0+ & arg1- */
		j = binop(AND, j, i, typnull);		/* MSb(diff) = 1 */
		j = binop(AND,
			  j,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);

		/* Smear the flag bit in j rightward over the lower bits */
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		l = binop(OR, k, j, typnull);
		k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 16ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 32ULL), typnull);
		l = binop(OR, k, l, typnull);			/* T...T */

		/* Where j flagged positive overflow, replace the difference
		   with PosSat: OR in the smeared mask, then clear the MSb */
		i = binop(OR, i, l, typnull);			/* T...T */
		j = unop(NOT, j, typnull);			/* f1..1 */
		i = binop(AND, i, j, typnull);


		/* Correct for negative saturation */
		j = binop(AND, m, arg0, typnull); /* tX..X if arg0- & arg1+ */
		k = unop(NOT, i, typnull);
		j = binop(AND, j, k, typnull);		/* MSb(diff) = 0 */
		j = binop(AND,
			  j,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);

		/* Smear the flag bit in j rightward over the lower bits */
		k = binop(SHR, j, immed64u((p64_t) 1ULL), typnull);
		l = binop(OR, k, j, typnull);
		k = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 16ULL), typnull);
		l = binop(OR, k, l, typnull);
		k = binop(SHR, l, immed64u((p64_t) 32ULL), typnull);
		l = binop(OR, k, l, typnull);			/* T...T */

		/* Where j flagged negative overflow, replace the difference
		   with NegSat: clear the masked bits, then set only the MSb */
		l = unop(NOT, l, typnull);			/* F...F */
		i = binop(AND, i, l, typnull);
		i = binop(OR, i, j, typnull);

		return(i);

	case MUL: /* 64ss */
		/* We want this (the constants below are written at 16-bit
		   width for brevity; i is the low product, j the high one):
			if (MSb of j)=1: -- Should be negative
			    if ((j != 0xffff) || (MSb of i)=0) return 0x8000;
			    else return low_word;
			else -- Should be positive
			    if ((j != 0x0000) || (MSb of i)=1) return 0x7fff;
			    else return low_word;
		*/
		/* Do signed mul */
		i = binop(MUL, arg0, arg1, typ64);

		/* Do high mul, and convert to saturation mask */
		j = binop(MULH, arg0, arg1, typ64);

		/* Make MSb(k)=1 if NegSat */
		k = binop(SHL, j, immed64u((p64_t) 32ULL), typnull);
		l = binop(AND, j, k, typnull);
		k = binop(SHL, l, immed64u((p64_t) 16ULL), typnull);
		l = binop(AND, l, k, typnull);
		k = binop(SHL, l, immed64u((p64_t) 8ULL), typnull);
		l = binop(AND, l, k, typnull);
		k = binop(SHL, l, immed64u((p64_t) 4ULL), typnull);
		l = binop(AND, l, k, typnull);
		k = binop(SHL, l, immed64u((p64_t) 2ULL), typnull);
		l = binop(AND, l, k, typnull);
		k = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
		k = binop(AND, l, k, typnull);
		k = binop(AND, k, i, typnull);
		k = unop(NOT, k, typnull);
		k = binop(AND,
			  k,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);

		/* Form and apply a mask */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
		m = binop(OR, k, l, typnull);
		l = binop(SHR, m, immed64u((p64_t) 2ULL), typnull);
		m = binop(OR, l, m, typnull);
		l = binop(SHR, m, immed64u((p64_t) 4ULL), typnull);
		m = binop(OR, l, m, typnull);
		l = binop(SHR, m, immed64u((p64_t) 8ULL), typnull);
		m = binop(OR, l, m, typnull);
		l = binop(SHR, m, immed64u((p64_t) 16ULL), typnull);
		m = binop(OR, l, m, typnull);
		l = binop(SHR, m, immed64u((p64_t) 32ULL), typnull);
		l = binop(OR, l, m, typnull);
		if (optcpu & CPU_MMX) {
			i = binop(ANDN, l, i, typnull);
		} else {
			m = unop(NOT, l, typnull);
			i = binop(AND, m, i, typnull);
		}
		i = binop(OR, k, i, typnull);


		/* Make MSb(k)=1 if PosSat */
		k = binop(SHL, j, immed64u((p64_t) 32ULL), typnull);
		l = binop(OR, j, k, typnull);
		k = binop(SHL, l, immed64u((p64_t) 16ULL), typnull);
		l = binop(OR, l, k, typnull);
		k = binop(SHL, l, immed64u((p64_t) 8ULL), typnull);
		l = binop(OR, l, k, typnull);
		k = binop(SHL, l, immed64u((p64_t) 4ULL), typnull);
		l = binop(OR, l, k, typnull);
		k = binop(SHL, l, immed64u((p64_t) 2ULL), typnull);
		l = binop(OR, l, k, typnull);
		k = binop(SHL, l, immed64u((p64_t) 1ULL), typnull);
		k = binop(OR, l, k, typnull);
		k = binop(OR, k, i, typnull);
		k = binop(AND,
			  k,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);

		/* Form and apply a mask */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
		m = binop(OR, k, l, typnull);
		l = binop(SHR, m, immed64u((p64_t) 2ULL), typnull);
		m = binop(OR, l, m, typnull);
		l = binop(SHR, m, immed64u((p64_t) 4ULL), typnull);
		m = binop(OR, l, m, typnull);
		l = binop(SHR, m, immed64u((p64_t) 8ULL), typnull);
		m = binop(OR, l, m, typnull);
		l = binop(SHR, m, immed64u((p64_t) 16ULL), typnull);
		m = binop(OR, l, m, typnull);
		l = binop(SHR, m, immed64u((p64_t) 32ULL), typnull);
		l = binop(OR, l, m, typnull);
		i = binop(OR, l, i, typnull);
		if (optcpu & CPU_MMX) {
			i = binop(ANDN, k, i, typnull);
		} else {
			m = unop(NOT, k, typnull);
			i = binop(AND, m, i, typnull);
		}

		return(i);

	case DIV: /* 64ss */
		/* This method will bomb on div by 0 instead of saturating,
		   which is, of course, the point of doing a sat DIV. */

		/* Modify 64-bit divides */
		i = binop(DIV, arg0, arg1, typ64);

		/* Get reduce-NOT of arg1 */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 32ULL), typnull);
		j = binop(AND, j, k, typnull);

		/* Correct for positive saturation */
		if (optcpu & CPU_MMX) {
			k = binop(ANDN, arg0, j, typnull);  /* ~a(tuple j) */
		} else {
			k = unop(NOT, arg0, typnull);	/* ~a */
			k = binop(AND, k, j, typnull);	/* ~a(tuple j) */
		}
		k = binop(AND,
			  k,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);
		/* k has pattern tooooooo,0,0,0 */

		/* Apply the mask to the MSb */
		if (optcpu & CPU_MMX) {
			i = binop(ANDN, k, i, typnull);
		} else {
			l = unop(NOT, k, typnull);
			i = binop(AND, i, l, typnull);
		}

		/* Convert k's tooooooo,0,0,0 pattern to a 0ttttttt,T,T,T
		   pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* otoooooo*/
		l = binop(OR, k, l, typnull);			/* ttoooooo*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* oottoooo*/
		l = binop(OR, l, m, typnull);			/* ttttoooo*/
		m = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
								/* ooootttt*/
		l = binop(OR, l, m, typnull);			/* T,0,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
								/* 0,T,0,0 */
		l = binop(OR, l, m, typnull);			/* T,T,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 16ULL), typnull);
								/* 0,T,0,0 */
		l = binop(OR, l, m, typnull);			/* T,T,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 31ULL), typnull);
						/* 0,0000000t,T,ttttttt0 */
		l = binop(OR, l, m, typnull);		/* T,T,T,ttttttt0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
							/* 0ttttttt,T,T,T */

		/* Apply the mask to the LSbs */
		i = binop(OR, i, l, typnull);


		/* Correct for negative saturation */
		k = binop(AND, arg0, j, typnull);	/* a(tuple j) */
		k = binop(AND,
			  k,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);
		/* k has pattern tooooooo,0,0,0 */

		/* Apply the mask to the MSb */
		i = binop(OR, i, k, typnull);

		/* Convert k's tooooooo,0,0,0 pattern to a 1fffffff,F,F,F
		   pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* otoooooo*/
		l = binop(OR, k, l, typnull);			/* ttoooooo*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* oottoooo*/
		l = binop(OR, l, m, typnull);			/* ttttoooo*/
		m = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
								/* ooootttt*/
		l = binop(OR, l, m, typnull);			/* T,0,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
								/* 0,T,0,0 */
		l = binop(OR, l, m, typnull);			/* T,T,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 16ULL), typnull);
								/* 0,T,0,0 */
		l = binop(OR, l, m, typnull);			/* T,T,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 31ULL), typnull);
						/* 0,ooooooot,T,ttttttto */
		l = binop(OR, l, m, typnull);		/* T,T,T,ttttttto */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
							/* ottttttt,T,T,T */
		l = unop(NOT, l, typnull);		/* 1fffffff,F,F,F */

		/* Apply the mask to the LSbs */
		i = binop(AND, i, l, typnull);

		/* Hand back the corrected quotient.  Without this return the
		   case fell through into MOD, which overwrote i, and the tree
		   built above was discarded. */
		return(i);

	case MOD: /* 64ss */
		/* This method will bomb on mod by 0 instead of saturating,
		   which is, of course, the point of doing a sat MOD. */

		/* Modify 64-bit modulus */
		i = binop(MOD, arg0, arg1, typ64);

		/* Get reduce-NOT of arg1 */
		j = unop(NOT, arg1, typnull);
		k = binop(SHL, j, immed64u((p64_t) 1ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 2ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 4ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 8ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 16ULL), typnull);
		j = binop(AND, j, k, typnull);
		k = binop(SHL, j, immed64u((p64_t) 32ULL), typnull);
		j = binop(AND, j, k, typnull);

		/* Correct for positive saturation */
		if (optcpu & CPU_MMX) {
			k = binop(ANDN, arg0, j, typnull);  /* ~a(tuple j) */
		} else {
			k = unop(NOT, arg0, typnull);	/* ~a */
			k = binop(AND, k, j, typnull);	/* ~a(tuple j) */
		}
		k = binop(AND,
			  k,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);
		/* k has pattern tooooooo,0,0,0 */

		/* Apply the mask to the MSb */
		l = unop(NOT, k, typnull);
		i = binop(AND, i, l, typnull);

		/* Convert k's tooooooo,0,0,0 pattern to a 0ttttttt,T,T,T
		   pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* otoooooo*/
		l = binop(OR, k, l, typnull);			/* ttoooooo*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* oottoooo*/
		l = binop(OR, l, m, typnull);			/* ttttoooo*/
		m = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
								/* ooootttt*/
		l = binop(OR, l, m, typnull);			/* T,0,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
								/* 0,T,0,0 */
		l = binop(OR, l, m, typnull);			/* T,T,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 16ULL), typnull);
								/* 0,T,0,0 */
		l = binop(OR, l, m, typnull);			/* T,T,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 31ULL), typnull);
						/* 0,0000000t,T,ttttttt0 */
		l = binop(OR, l, m, typnull);		/* T,T,T,ttttttt0 */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
							/* 0ttttttt,T,T,T */

		/* Apply the mask to the LSbs */
		i = binop(OR, i, l, typnull);


		/* Correct for negative saturation */
		k = binop(AND, arg0, j, typnull);	/* a(tuple j) */
		k = binop(AND,
			  k,
			  immedu(cvt1x64uto2x64u(
					(p64_t) 0x8000000000000000ULL)),
			  typnull);
		/* k has pattern tooooooo,0,0,0 */

		/* Apply the mask to the MSb */
		i = binop(OR, i, k, typnull);

		/* Convert k's tooooooo,0,0,0 pattern to a 1fffffff,F,F,F
		   pattern */
		l = binop(SHR, k, immed64u((p64_t) 1ULL), typnull);
								/* otoooooo*/
		l = binop(OR, k, l, typnull);			/* ttoooooo*/
		m = binop(SHR, l, immed64u((p64_t) 2ULL), typnull);
								/* oottoooo*/
		l = binop(OR, l, m, typnull);			/* ttttoooo*/
		m = binop(SHR, l, immed64u((p64_t) 4ULL), typnull);
								/* ooootttt*/
		l = binop(OR, l, m, typnull);			/* T,0,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 8ULL), typnull);
								/* 0,T,0,0 */
		l = binop(OR, l, m, typnull);			/* T,T,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 16ULL), typnull);
								/* 0,T,0,0 */
		l = binop(OR, l, m, typnull);			/* T,T,0,0 */
		m = binop(SHR, l, immed64u((p64_t) 31ULL), typnull);
						/* 0,ooooooot,T,ttttttto */
		l = binop(OR, l, m, typnull);		/* T,T,T,ttttttto */
		l = binop(SHR, l, immed64u((p64_t) 1ULL), typnull);
							/* ottttttt,T,T,T */
		l = unop(NOT, l, typnull);		/* 1fffffff,F,F,F */

		/* Apply the mask to the LSbs */
		i = binop(AND, i, l, typnull);

		/* Hand back the corrected remainder.  Without this return the
		   case fell through into SHL's break and returned -1. */
		return(i);

	case SHL: /* 64ss */
		/* done in the obvious way */
		break;

	case SHR: /* 64ss */
		/* MMX does not directly do 64-bit arithmetic shifts...
		   use unsigned shift, but paste-in sign extension
		*/
		arg1 = shiftconst(arg1, typ8u);
		if (tup[arg1].op == NUM) {
			/* Done if shift count is 0 */
			if (tup[arg1].immed.q[0] == 0x0ULL) return(arg0);

			/* Shift by a constant is easy */
			/* Isolate the sign bit and compare it against the
			   sign-bit constant */
			i = binop(AND,
				  arg0,
				  immedu(cvt1x64uto2x64u(
					 (p64_t)0x8000000000000000ULL)),
				  typnull);
			i = binop(EQ,
				  i,
				  immedu(cvt1x64uto2x64u(
					 (p64_t)0x8000000000000000ULL)),
				  typ64u);

			/* Combine with the constant (all ones >> count) via
			   ANDN, then OR in the unsigned shift result */
			i = binop(ANDN,
				  immedu(cvt1x64uto2x64u(
						(p64_t)
						(0xffffffffffffffffULL >>
						tup[arg1].immed.uq[0]    ) )),
				  i,
				  typnull);
			i = binop(OR,
				  i,
				  binop(SHR, arg0, arg1, typ64u),
				  typnull);
			return(i);
		}
		error("shift right of signed 64-bit field values only "
		      "implemented for a constant shift");
		return(immed64u((p64_t) 0ULL));

	case INTRLVLOW: /* 64ss */
	case INTRLVHIGH: /* 64ss */
		/* 32-bit to 64-bit interleave of 32-bit fields */
		/* These are the same as unsigned */
		return(binop(op, arg0, arg1, typ64u));

	case PACK: /* 64ss */
		/* 128ss -> 64ss */
		/* PACK (with saturation) each 128-bit field value to
		   64 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

		/* Saturate arg0 */
		i = binop(MIN,
			  arg0,
			  immed64u((p64_t)0x7fffffffffffffffULL),
			  typ128);
		i = binop(MAX,
			  i,
			  immed64s((p64_t)0x8000000000000000LL),
			  typ128);

		/* Saturate arg1 */
		j = binop(MIN,
			  arg1,
			  immed64u((p64_t)0x7fffffffffffffffULL),
			  typ128);
		j = binop(MAX,
			  j,
			  immed64s((p64_t)0x8000000000000000LL),
			  typ128);

		/* Pack as signed */
		return(binop(PACK, i, j, typ64));

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop64ss op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}


int
binop128u(int op,
int arg0,
int arg1)
{
	/* 128-bit unsigned field binary ops */
	/* Convert the op into a tree of subpos if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.
	*/

	register int i, j, k;

	if (bitsperfrag() < 128) {
		bug("Field sizes greater than a fragment are not supported");
	}

	switch (op) {
	case ADD: /* 128u */
		/* Use 64-bit ADDs */
		i = binop(ADD, arg0, arg1, typ64u);
		j = binop(LT, i, arg0, typ64u);
		k = binop(LT, i, arg1, typ64u);
		j = binop(OR, j, k, typnull);
		j = binop(AND,
			  j,
			  immed64u((p64_t) 0x8000000000000000ULL),
			  typnull);
		j = binop(SHL, j, immed64u((p64_t)1ULL), typnull);
		return (binop(ADD, i, j, typ64u));

	case SUB: /* 128u */
	case MUL: /* 128u */
	case DIV: /* 128u */
	case MOD: /* 128u */
	case MIN: /* 128u */
	case MAX: /* 128u */
	case AVG: /* 128u */

	case GT: /* 128u */
	case GE: /* 128u */
	case LT: /* 128u */
	case LE: /* 128u */

	case LAND: /* 128u */
	case LOR: /* 128u */
		/* None of these exist */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop128u op=%s not supported.",
				opname(op));
			bug(buf);
		}

	case AND: /* 128u */
	case ANDN: /* 128u */
	case OR: /* 128u */
	case NOR: /* 128u */
	case XOR: /* 128u */
		break;

	case EQ: /* 128u */
		if (optcpu & CPU_AltiVec) {
			i = binop(EQ, arg0, arg1, typ32u);
			j = binop(SHR, i, immed64u((p64_t)64ULL), typnull);
			i = binop(AND, i, j, typnull);
			j = binop(SHR, i, immed64u((p64_t)32ULL), typnull);
			i = binop(AND, i, j, typnull);
			return(binop(REPL, immed64u((p64_t)3ULL), i, typ32u));
		} else {
			char buf[64];
			snprintf(buf,
				64,
				"binop128u op=%s not supported.",
				opname(op));
			bug(buf);
			return 0;
		}

	case NE: /* 128u */
		i = binop(EQ, arg0, arg1, typ128u);
		return(unop(NOT, i, typnull));

	case SHL: /* 128u */
		if (optcpu & CPU_AltiVec) {
			/* This must be split into separate byte and bit
			   shifts.
			*/
			if (tup[arg1].op == NUM) {
				/* Fix index for a byte shift left (vslo)
				   instruction which uses bits 121-124(AltiVec)
				   as the shift count */
				i = immedu(cvt1x8uto16x8u(
						tup[arg1].immed.b[7]&0x78));
				i = binop(SHLBYTE, arg0, i, typ128u);

				/* vsl expects the shift count to be replicated
				   on a bytewise basis (i.e. be a 16x8).
				*/
				j = immedu(cvt1x8uto16x8u(
						tup[arg1].immed.b[7]&0x7));
				return(binop(SHLBIT, i, j, typnull));

			} else {
				/* Fix index for a byte shift left (vslo)
				   instruction which uses bits 121-124(AltiVec)
				   as the shift count */
				i = binop(SHLBYTE, arg0, arg1, typ128u);

				/* vsl expects the shift count to be replicated
				   on a bytewise basis (i.e. be a 16x8).
				*/
				j = binop(REPL,
					  immed64u((p64_t)15ULL),
					  arg1,
					  typ8u);
				return(binop(SHLBIT, i, j, typnull));
			}
		} else {
			bug("SHL128u not implemented for this target");
		}
		break;

	case SHLBIT:	/* Bit Shift Left */ /* 128u */
		break;

	case SHLBYTE:	/* Byte Shift Left */ /* 128u */
		break;

	case SHR: /* 128u */
		if (optcpu & CPU_AltiVec) {
			/* This must be split into separate byte and bit
			   shifts.
			*/
			if (tup[arg1].op == NUM) {
				/* Fix index for a byte shift right (vsro)
				   instruction which uses bits 121-124(AltiVec)
				   as the shift count */
				i = immedu(cvt1x8uto16x8u(
						tup[arg1].immed.b[7]&0x78));
				i = binop(SHRBYTE, arg0, i, typ128u);

				/* vsr expects the shift count to be replicated
				   on a bytewise basis (i.e. be a 16x8).
				*/
				j = immedu(cvt1x8uto16x8u(
						tup[arg1].immed.b[7]&0x7));
				return(binop(SHRBIT, i, j, typnull));
			} else {
				/* Fix index for a byte shift left (vslo)
				   instruction which uses bits 121-124(AltiVec)
				   as the shift count */
				i = binop(SHRBYTE, arg0, arg1, typ128u);
				/* vsl expects the shift count to be replicated
				   on a bytewise basis (i.e. be a 16x8).
				*/
				j = binop(REPL,
					  immed64u((p64_t)15ULL),
					  arg1,
					  typ8u);
				return(binop(SHRBIT, i, j, typnull));
			}
		} else {
			bug("SHR128u not implemented for this target");
		}
		break;

	case SHRBIT:	/* Bit Shift Right */ /* 128u */
		break;

	case SHRBYTE:	/* Byte Shift Right */ /* 128u */
		break;

	case PACK: /* 128u */
		/* 256u -> 128u */
		/* PACK (without saturation) each 256-bit field value to
		   128 bits, then copy the lower halves of each field into the
		   result, with arg0 in one half and arg1 in the other half
		   of the result register. */

	case INTRLVLOW: /* 128u */
	    /* 64-bit to 128-bit interleave of 64-bit fields */
	    if (optcpu & CPU_MMX) break;

	case INTRLVHIGH: /* 128u */
	    /* 64-bit to 128-bit interleave of 64-bit fields */
	    if (optcpu & CPU_MMX) break;

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop128u op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

int
binop128us(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 128-bit unsigned field binary ops with saturation */
	/* Convert the op into a tree of subops if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.

	   op         operator token being lowered
	   arg0,arg1  tuple indices of the left and right operands
	   decl_bits  not referenced anywhere in this function

	   Returns the result of re-dispatching the op as typ128u for the
	   ops listed first, and -1 (after reporting a bug) for everything
	   else.
	*/

	switch (op) {
	case EQ: /* 128us */
	case NE: /* 128us */

	case GT: /* 128us */
	case GE: /* 128us */
	case LT: /* 128us */
	case LE: /* 128us */

	case MIN: /* 128us */
	case MAX: /* 128us */
	case AVG: /* 128us */

	case AND: /* 128us */
	case ANDN: /* 128us */
	case OR: /* 128us */
	case XOR: /* 128us */

	case LAND: /* 128us */
	case LOR: /* 128us */
		/* These are the same as unsigned unsaturated */
		return (binop(op, arg0, arg1, typ128u));

	case ADD: /* 128us */
	case SUB: /* 128us */
	case MUL: /* 128us */
	case DIV: /* 128us */
	case MOD: /* 128us */

	case SHL: /* 128us */
	case SHR: /* 128us */

	case PACK: /* 128us */
	case INTRLVLOW: /* 128us */
	case INTRLVHIGH: /* 128us */
		/* None of these exist */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop128us op=%s not supported.",
				opname(op));
			bug(buf);
		}
		/* Stop here so the default case does not report a second,
		   contradictory "cannot happen" bug for the same op */
		break;

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop128us op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}

	return(-1);
}

int
binop128(int op,
int arg0,
int arg1)
{
	/* Binary ops on 128-bit signed fields.

	   Ops listed in the first group are re-dispatched unchanged as
	   typ128u and that result is returned.  Every other op is reported
	   through bug() and the function returns -1.
	*/

	char msg[64];
	const char *fmt;

	switch (op) {
	case ADD: /* 128s */
	case SUB: /* 128s */
	case MUL: /* 128s */
	case DIV: /* 128s */
	case MOD: /* 128s */
	case AND: /* 128s */
	case ANDN: /* 128s */
	case OR: /* 128s */
	case XOR: /* 128s */
	case EQ: /* 128s */
	case NE: /* 128s */
	case LAND: /* 128s */
	case LOR: /* 128s */
	case SHL: /* 128s */
	case PACK: /* 128s */
	case INTRLVLOW: /* 128s */
	case INTRLVHIGH: /* 64-bit to 128-bit interleave of 64-bit fields */
		/* Signedness makes no difference for these */
		return(binop(op, arg0, arg1, typ128u));

	case MIN: /* 128s */
	case MAX: /* 128s */
	case AVG: /* 128s */
	case LT: /* 128s */
	case LE: /* 128s */
	case GT: /* 128s */
	case GE: /* 128s */
	case SHR: /* 128s */
		/* No signed 128-bit form of these is generated */
		fmt = "binop128 op=%s not supported.";
		break;

	default:
		fmt = "binop128 op=%s (this cannot happen?)";
		break;
	}

	/* Only the two failure paths reach this point */
	snprintf(msg, sizeof(msg), fmt, opname(op));
	bug(msg);

	return(-1);
}

int
binop128ss(int op,
int arg0,
int arg1,
int decl_bits)
{
	/* 128-bit signed field binary ops with saturation */
	/* Convert the op into a tree of subops if necessary for the target,
	   then strip the type information from the op to indicate that it has
	   been handled.

	   op         operator token being lowered
	   arg0,arg1  tuple indices of the left and right operands
	   decl_bits  not referenced anywhere in this function

	   Returns the result of re-dispatching the op as typ128u or typ128
	   for the first two groups, and -1 (after reporting a bug) for
	   everything else.
	*/

	switch (op) {
	case AND: /* 128ss */
	case ANDN: /* 128ss */
	case OR: /* 128ss */
	case XOR: /* 128ss */

	case EQ: /* 128ss */
	case NE: /* 128ss */

	case LAND: /* 128ss */
	case LOR: /* 128ss */
		/* These are the same as unsigned unsaturated */
		return (binop(op, arg0, arg1, typ128u));

	case MIN: /* 128ss */
	case MAX: /* 128ss */
	case AVG: /* 128ss */

	case GT: /* 128ss */
	case LT: /* 128ss */
	case LE: /* 128ss */
	case GE: /* 128ss */
		/* These are the same as signed unsaturated */
		return (binop(op, arg0, arg1, typ128));

	case ADD: /* 128ss */
	case SUB: /* 128ss */
	case MUL: /* 128ss */
	case DIV: /* 128ss */
	case MOD: /* 128ss */

	case SHL: /* 128ss */
	case SHR: /* 128ss */

	case INTRLVLOW: /* 128ss */
	case INTRLVHIGH: /* 128ss */
	case PACK: /* 128ss */
		/* None of these exist */
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop128ss op=%s not supported.",
				opname(op));
			bug(buf);
		}
		/* Stop here so the default case does not report a second,
		   contradictory "cannot happen" bug for the same op */
		break;

	default:
		{
			char buf[64];
			snprintf(buf,
				64,
				"binop128ss op=%s (this cannot happen?)",
				opname(op));
			bug(buf);
		}
	}
	return(-1);
}

