/*	fragmenter.c

	The fragmenter.

	This code is responsible for fragmenting arbitrary vectors
	into word-sized chunks.
*/

#include "stdpccts.h"
#include "swartypes.h"
#include "tuple.h"
#include "tuplegen.h"
#include "tuple_immed.h"
#include "tuple_binop.h"
#include "oputils.h"
#include "sym.h"
#include "output.h"
#include "scheduler.h"
#include "cpool.h"
#include "spool.h"
#include "pseudoregs.h"
#include "Libstdswar/stdswar.h"
#include "showir.h"
#include "messages.h"


/* Compile-time debug switches.  All of them are switched off here;
   replace an #undef by a #define to enable the matching trace code.
*/

/* DEBUG_FUNCS guards the info(0, "Start ...") trace calls at the top of
   the fragmenting functions in this file. */
#undef DEBUG_FUNCS
/* DEBUG_BLOCKS is not tested in the part of the file reviewed here --
   NOTE(review): confirm whether it is still used further down. */
#undef DEBUG_BLOCKS
/* DEBUG_VNUM guards the fprintf(Cout, ...) traces inside vfrag128(). */
#undef DEBUG_VNUM
/* DEBUG_VNUM_DUMP guards the dump of numbuf[] at the start of vfrag128(). */
#undef DEBUG_VNUM_DUMP

/* File-local flag.  NOTE(review): it is not referenced in the part of the
   file reviewed here; presumably it records that a return statement was
   seen -- confirm against its users. */
static int returncalled;


static void
regvec(register tree *t)
{
	/* Allocate the array of tuple indices (ints) for the fragments of
	   "t" and mark every slot as unused by writing -1 into it.  The
	   array has enh_size(t->type) entries and is stored in t->regvec;
	   the entries will later hold indices into "tup[]" for the tuples
	   of the translation of the tree "t".

	   If the allocation fails, the failure is reported through bug()
	   and t->regvec is left NULL rather than being written through.
	*/

	register int s = enh_size(t->type);
	register int *p = ((int *) malloc(s * sizeof(int)));
	register int *e;

	t->regvec = p;
	if (p == NULL) {
		/* malloc(0) may legitimately return NULL, so only a
		   failed non-empty request is reported as an error. */
		if (s > 0) {
			bug("regvec() out of memory");
		}
		return;
	}

	for (e = (p + s); p < e; ++p) {
		*p = -1;
	}
}

static void
binfrag(register tree *t)
{
	/* Translate a binary operation fragment by fragment.  For every
	   fragment position one binop() tuple is built from the matching
	   fragments of the first operand (t->down) and the second operand
	   (t->down->right); the resulting tuple index is recorded in
	   t->regvec, which is freshly allocated here.
	*/

	register int nfrags;
	register int f;

	#ifdef DEBUG_FUNCS
		info(0, "Start binfrag()...");
	#endif

	nfrags = enh_size(t->type);

	regvec(t);
	for (f = 0; f < nfrags; f++) {
		int first = t->down->regvec[f];
		int second = t->down->right->regvec[f];

		t->regvec[f] = binop(t->op, first, second, t->type);
	}
}

static void
unfrag(register tree *t)
{
	/* Translate a unary operation fragment by fragment.  For every
	   fragment position one unop() tuple is built from the matching
	   fragment of the operand (t->down); the resulting tuple index is
	   recorded in t->regvec, which is freshly allocated here.
	*/

	register int nfrags;
	register int f;

	#ifdef DEBUG_FUNCS
		info(0, "Start unfrag()...");
	#endif

	nfrags = enh_size(t->type);

	regvec(t);
	for (f = 0; f < nfrags; f++) {
		int operand = t->down->regvec[f];

		t->regvec[f] = unop(t->op, operand, t->type);
	}
}

static void
numfrag(register tree *t)
{
	/* Emit the immediate tuple for a NUM constant.  The value t->num is
	   turned into an immediate of the target's fragment width and the
	   tuple index is stored in the first slot of the freshly allocated
	   t->regvec.  Fragment widths other than 32, 64 and 128 bits are
	   reported through bug().
	*/

	#ifdef DEBUG_FUNCS
		info(0, "Start numfrag()...");
	#endif

	regvec(t);
	switch (bitsperfrag()) {
	case 32:
		t->regvec[0] = immed32((p32_t) ((int) t->num));
		break;
	case 64:
		t->regvec[0] = immed64((p64_t) ((long long) t->num));
		break;
	case 128:
		t->regvec[0] = immed128((p128_t){{(long long) t->num, 0LL}});
		break;
	default:
		{
			char msg[50];

			snprintf(msg,
				 sizeof(msg),
				 "numfrag() doesn't handle %d-bit frags",
				 bitsperfrag());
			bug(msg);
		}
	}
}

static int
vfrag32(register unsigned int decl_bits, register tree *t)
{
	/* Build a tuple tree to generate the fragments of a VNUM constant
	   for 32-bit targets.

	   decl_bits is the declared field precision; bitsperfield() maps it
	   to the field width used for packing.  The element values are read
	   from numbuf[] starting at index t->num and packed 32/bits fields
	   per fragment, field j at bit offset j*bits.  One immed32() tuple
	   is created per fragment and its index is stored in t->regvec,
	   which the caller must already have allocated.  The last fragment
	   is ANDed with target_mask(t->type) before it is emitted.

	   Returns 1 if the value is truncated to fit the required precision,
	   and 0 otherwise.  Only 1-, 2- and 4-bit fields are checked for
	   truncation; wider fields are stored without a check.
	*/

	register int i, j;
	register int s;
	register int trunc = 0;
	p32_t m;
	register unsigned int bits = bitsperfield(decl_bits);

	#ifdef DEBUG_FUNCS
		info(0, "Start vfrag32()...");
	#endif

	s = enh_size(t->type);

	for (i=0; i<s; ++i) {
		m.d = 0;
		for (j=0; j<(32/bits); ++j) {
			int c, k;
			switch (bits) {
			case 1:
			case 2:
			case 4:
				c = numbuf[t->num+(i*(32/bits))+j];
				k = (c & ((0x1<<bits)-1));
				trunc |= (c != k);
				/* Shift as unsigned: the topmost fields reach
				   the sign bit, and left-shifting a signed
				   int that far is undefined behaviour. */
				m.ud |= ((unsigned int) k << (j*bits));
				break;
			case 8:
				m.b[j] = numbuf[t->num + (i * (32/bits)) + j];
				break;
			case 16:
				m.w[j] = numbuf[t->num + (i * (32/bits)) + j];
				break;
			case 32:
				m.d = numbuf[t->num + (i * (32/bits)) + j];
				break;
			default:
				/* Other field widths leave the fragment 0 */
				break;
			}
		}
		if (i == (s - 1)) {
			/* Expecting IA32 */
			/* Clear the padding of a partly filled last fragment */
			p32_t x = (p32_t) target_mask(t->type).ud[0];
			m.ud &= x.ud;
		}
		*(t->regvec + i) = immed32(m);
	}
	return trunc;
}

static int
vfrag64(register unsigned int decl_bits, register tree *t)
{
	/* Build a tuple tree to generate the fragments of a VNUM constant
	   for 64-bit targets.

	   decl_bits is the declared field precision; bitsperfield() maps it
	   to the field width used for packing.  The element values are read
	   from numbuf[] starting at index t->num and packed 64/bits fields
	   per fragment, field j at bit offset j*bits.  One immed64() tuple
	   is created per fragment and its index is stored in t->regvec,
	   which the caller must already have allocated.  The last fragment
	   is ANDed with target_mask(t->type) before it is emitted.

	   Returns 1 if the value is truncated to fit the required precision,
	   and 0 otherwise.  Only 1-, 2- and 4-bit fields are checked for
	   truncation; wider fields are stored without a check.
	*/

	register int i, j;
	register int s;
	register int trunc;
	p64_t m;
	register unsigned int bits = bitsperfield(decl_bits);

	#ifdef DEBUG_FUNCS
		info(0, "Start vfrag64()...");
	#endif

	s = enh_size(t->type);
	trunc = 0;

	for (i=0; i<s; ++i) {
		m.q = 0LL;
		for (j=0; j<(64/bits); ++j) {
			long long c, k;
			switch (bits) {
			case 1:
			case 2:
			case 4:
				c = numbuf[t->num+(i*(64/bits))+j];
				k = (c & ((0x1LL<<bits)-1));
				trunc |= (c != k);
				/* Shift as unsigned: the topmost fields reach
				   the sign bit, and left-shifting a signed
				   long long that far is undefined behaviour.
				   The bit pattern is then stored unchanged. */
				m.q |= (long long)
				       ((unsigned long long) k << (j*bits));
				break;
			case 8:
				m.b[j] = numbuf[t->num + (i * (64/bits)) + j];
				break;
			case 16:
				m.w[j] = numbuf[t->num + (i * (64/bits)) + j];
				break;
			case 32:
				m.d[j] = numbuf[t->num + (i * (64/bits)) + j];
				break;
			default:
				/* Other field widths leave the fragment 0 */
				break;
			}
		}
		if (i == (s - 1)) {
			/* Expecting MMX */
			/* Clear the padding of a partly filled last fragment */
			p64_t x = (p64_t) target_mask(t->type).uq[0];
			m.q &= x.q;
		}
		*(t->regvec + i) = immed64(m);
	}
	return trunc;
}

/* This function needs to be checked over with a fine-toothed comb */
static int
vfrag128(register unsigned int decl_bits, register tree *t)
{
	/* Generate the fragments of a VNUM constant for 128-bit targets.

	   The element values are read from numbuf[], starting at index
	   t->num, and packed 128/bits fields per fragment.  One immed128()
	   tuple is created per fragment and its index is stored in
	   t->regvec, which the caller must already have allocated.  The
	   last fragment is ANDed with target_mask(t->type) first.

	   The result is 1 if a value had to be truncated to fit the
	   required precision and 0 otherwise; only 1-, 2- and 4-bit fields
	   are checked.
	*/

	register unsigned int f, j;
	register unsigned int nfrags;
	register int lossy;
	p128_t acc;
	register unsigned int bits = bitsperfield(decl_bits);

	#ifdef DEBUG_FUNCS
		info(0, "Start vfrag128()...");
	#endif
	#ifdef DEBUG_VNUM_DUMP
	{
		int x;
		for (x=0; x<701; ++x)
			fprintf(Cout, "numbuf[%d]=%d\n", x, numbuf[x]);
	}
	#endif

	nfrags = (unsigned) enh_size(t->type);
	lossy = 0;

	for (f = 0; f < nfrags; f++) {
		acc.uq[0] = 0ULL;
		acc.uq[1] = acc.uq[0];

		for (j = 0; j < (128/bits); j++) {
			p128_t raw, kept;

			/* t->num is the index in numbuf[] of the vector's
			   first element; element (f*(128/bits))+j is the
			   one handled in this iteration. */
			switch (bits) {
			case 1:
			case 2:
			case 4:
				raw = (p128_t)
				   {{numbuf[t->num+(f*(128/bits))+j],
				     0LL}};
				kept = (p128_t)
				   {{(raw.uq[0] &
				     ((0x1ULL<<(unsigned long long)bits)-1ULL)),
				    0LL}};
				#ifdef DEBUG_VNUM
					fprintf(Cout,
						"c={0x%016llx,0x%016llx} <- "
						"element %d "
						"(fragment=%d,j=%d)\n",
						raw.uq[0], raw.uq[1],
						(f*(128/bits))+j,
						f, j);
					fprintf(Cout,
						"k={0x%016llx,0x%016llx}\n",
						kept.uq[0], kept.uq[1]);
				#endif

				/* Only the low halves are compared.  That is
				   enough while the upper halves of both
				   values are always zero, as they are when
				   built above; revisit if that changes. */
				lossy |= (raw.q[0] != kept.q[0]);

				#ifdef DEBUG_VNUM
					fprintf(Cout,
						"m={0x%016llx,0x%016llx}\n",
						acc.uq[0], acc.uq[1]);
				#endif
				{
					/* Bit offset of field j; offsets of
					   64 and above land in the upper
					   half of the fragment. */
					unsigned int off = (j * bits);

					if (off < 64U) {
						acc.uq[0] |=
						    (kept.uq[0] << off);
					} else {
						acc.uq[1] |=
						    (kept.uq[0] << (off-64U));
					}
				}
				#ifdef DEBUG_VNUM
					fprintf(Cout,
						"m={0x%016llx,0x%016llx}\n",
						acc.uq[0], acc.uq[1]);
				#endif
				break;
			case 8:
				acc.b[j] = numbuf[t->num+(f*(128/bits))+j];
				break;
			case 16:
				acc.w[j] = numbuf[t->num+(f*(128/bits))+j];
				break;
			case 32:
				acc.d[j] = numbuf[t->num+(f*(128/bits))+j];
				break;
			case 64:
				acc.q[j] = numbuf[t->num+(f*(128/bits))+j];
				break;
			}
		}
		#ifdef DEBUG_VNUM
			fprintf(Cout, "m={0x%016llx,0x%016llx}\n",
				acc.uq[0], acc.uq[1]);
		#endif

		/* The last fragment may be partly filled: mask it */
		if ((f + 1) == nfrags) {
			p128_t x = target_mask(t->type);

			#ifdef DEBUG_VNUM
				fprintf(Cout,
					"target_mask={0x%016llx,0x%016llx}\n",
					x.uq[0], x.uq[1]);
			#endif
			acc.uq[0] &= x.uq[0];
			acc.uq[1] &= x.uq[1];
		}

		t->regvec[f] = immed128(acc);
		#ifdef DEBUG_VNUM
			fprintf(Cout,
				"t->regvec[%d] is {0x%016llx,0x%016llx}\n",
				f,
				tup[t->regvec[f]].immed.uq[0],
				tup[t->regvec[f]].immed.uq[1]);
		#endif
	}
	return lossy;
}

static void
vnumfrag(register tree *t)
{
	/* Generate the fragments of a VNUM constant.  The work is handed to
	   the vfrag routine that matches the target's fragment width; other
	   widths are reported through bug().  A warning is printed when the
	   selected routine reports that the constant was truncated to fit
	   the required precision.
	*/

	int lossy = 0;

	#ifdef DEBUG_FUNCS
		info(0, "Start vnumfrag()...");
	#endif

	/* The vfrag routines fill t->regvec, so allocate it first */
	regvec(t);

	switch (bitsperfrag()) {
	case 32:
		lossy = vfrag32(t->type.bits, t);
		break;
	case 64:
		lossy = vfrag64(t->type.bits, t);
		break;
	case 128:
		lossy = vfrag128(t->type.bits, t);
		break;
	default:
		{
			char msg[50];

			snprintf(msg,
				 sizeof(msg),
				 "vnumfrag() doesn't handle %d-bit frags",
				 bitsperfrag());
			bug(msg);
		}
	}

	if (lossy != 0) {
		warn("constant value(s) truncated to fit precision");
	}
}

static void
redfrag(register tree *t)
{
	/* Build a tuple tree to generate the fragments of a reduction
	   operation (t->op) over the vector operand t->down.

	   The work is done in these steps:
	   1. If the last fragment of the operand is only partly filled,
	      its padding is rewritten so that it should not disturb the
	      result (all ones for ALL/REDUCEAND, zero otherwise, plus an
	      operator-specific constant for REDUCEMIN/MAX/MUL).  This
	      replaces the last entry of (t->down)->regvec.
	   2. The reduction operator is mapped to the binary operator used
	      to combine values.
	   3. In the current code path (OLD not defined) every fragment is
	      first reduced within itself by repeatedly combining its two
	      halves at doubling field widths, and the per-fragment results
	      are then combined linearly.  The index of the final tuple is
	      stored in the first slot of t->regvec.
	      The OLD code path instead combines the fragments first and
	      then reduces within the single remaining fragment.

	   NOTE(review): when bitsperfield() of the operand equals the
	   fragment width for 32- or 64-bit fragments, the width ladder is
	   entered at a step wider than the fragment and ends in
	   bug("redfrag() bpf not supported") -- confirm whether that
	   combination can occur.
	*/

	register int s;
	p128_t lastmask;
	register int op;
	register int bpf;

	#ifdef DEBUG_FUNCS
		info(0, "Start redfrag()...");
	#endif

	s = enh_size((t->down)->type);
	bpf = bitsperfrag();

	/* Allocate array of tuple indices for the fragments of "t" */
	regvec(t);

	/* Get a mask for the fields in the last fragment of the vector */
	lastmask = target_mask((t->down)->type);

	/* If last frag isn't full, we need to mask it to
	   the appropriate fields; other values are op-dependent
	*/
	if ( ((bpf == 128) && (
		 (lastmask.q[1] != 0xffffffffffffffffLL) ||
		 (lastmask.q[0] != 0xffffffffffffffffLL)))
	     || ((bpf == 64) && (lastmask.q[0] != 0xffffffffffffffffLL))
	     || ((bpf == 32) && (lastmask.d[0] != 0xffffffff))) {

		/* Set padding in last fragment to a value which won't ruin the
		   result */
		switch (t->op) {
		case ALL:
		case REDUCEAND:
		{
			register int i;

			/* pad with all ones */
			if (bpf == 128) {
				i = immed128((p128_t)
					    {{~lastmask.q[0], ~lastmask.q[1]}});
			} else if (bpf == 64) {
				i = immed64((p64_t) ~lastmask.q[0]);
			} else if (bpf == 32) {
				i = immed32((p32_t) ~lastmask.d[0]);
			} else {
				char buf[50];
				snprintf(buf,
					 50,
					 "redfrag() unknown bpf=%d",
					 bpf);
				bug(buf);
				/* Get rid of warning... */
				i = 0;
			}
			i = binop(OR,
				  i,
				  *((t->down)->regvec + s - 1),
				  typnull);
			*((t->down)->regvec + s - 1) = i;
			break;
		}
		default:
		{
			register int i;

			/* zero out padding (may insert a different value) */
			if (bpf == 128) {
				i = immed128u(lastmask);
			} else if (bpf == 64) {
				i = immed64u((p64_t)lastmask.q[0]);
			} else if (bpf == 32) {
				i = immed32u((p32_t)lastmask.d[0]);
			} else {
				char buf[50];
				snprintf(buf,
					 50,
					 "redfrag() unknown bpf=%d",
					 bpf);
				bug(buf);
				/* Get rid of warning... */
				i = 0;
			}
			i = binop(AND,
				  i,
				  *((t->down)->regvec + s - 1),
				  typnull);
			*((t->down)->regvec + s - 1) = i;
		}
		}

		/* Install no-effect constant values */
		/* NOTE(review): the padding for REDUCEMIN is taken from
		   target_minval() and the one for REDUCEMAX from
		   target_maxval(); confirm that these helpers return values
		   that cannot win the MIN and MAX respectively. */
		switch (t->op) {
		case REDUCEMIN:
		{
			p128_t m;
			register int i;

			m = target_minval((t->down)->type);

			if (bpf == 128) {
				i = immed128((p128_t)
					     {{ m.q[0] & ~(lastmask.q[0]),
					        m.q[1] & ~(lastmask.q[1]) }});
			} else if (bpf == 64) {
				i = immed64((p64_t)
					    (m.q[0] & ~(lastmask.q[0])));
			} else if (bpf == 32) {
				i = immed32((p32_t)
					    (m.d[0] & ~(lastmask.d[0])));
			} else {
				char buf[50];
				snprintf(buf,
					 50,
					 "redfrag() unknown bpf=%d",
					 bpf);
				bug(buf);
				/* Get rid of warning... */
				i = 0;
			}

			i = binop(OR,
				  i,
				  *((t->down)->regvec + s - 1),
				  typnull);
			*((t->down)->regvec + s - 1) = i;
			break;
		}
		case REDUCEMAX:
		{
			p128_t m;
			register int i;

			m = target_maxval((t->down)->type);

			if (bpf == 128) {
				i = immed128((p128_t)
					     {{ m.q[0] & ~(lastmask.q[0]),
					        m.q[1] & ~(lastmask.q[1]) }});
			} else if (bpf == 64) {
				i = immed64((p64_t)
					    (m.q[0] & ~(lastmask.q[0])));
			} else if (bpf == 32) {
				i = immed32((p32_t)
					    (m.d[0] & ~(lastmask.d[0])));
			} else {
				char buf[50];
				snprintf(buf,
					 50,
					 "redfrag() unknown bpf=%d",
					 bpf);
				bug(buf);
				/* Get rid of warning... */
				i = 0;
			}

			i = binop(OR,
				  i,
				  *((t->down)->regvec + s - 1),
				  typnull);
			*((t->down)->regvec + s - 1) = i;
			break;
		}
		case REDUCEMUL:
		{
			p128_t m;
			register int i;

			m = target_ONES((t->down)->type);

			if (bpf == 128) {
				i = immed128((p128_t)
					     {{ m.q[0] & ~(lastmask.q[0]),
					        m.q[1] & ~(lastmask.q[1]) }});
			} else if (bpf == 64) {
				i = immed64((p64_t)
					    (m.q[0] & ~(lastmask.q[0])));
			} else if (bpf == 32) {
				i = immed32((p32_t)
					    (m.d[0] & ~(lastmask.d[0])));
			} else {
				char buf[50];
				snprintf(buf,
					 50,
					 "redfrag() unknown bpf=%d",
					 bpf);
				bug(buf);
				/* Get rid of warning... */
				i = 0;
			}

			i = binop(OR,
				  i,
				  *((t->down)->regvec + s - 1),
				  typnull);
			*((t->down)->regvec + s - 1) = i;
			break;
		}
		}
	}

	/* Which binary op is this? */
	switch (t->op) {
	case ALL:	op = LAND; break;
	case ANY:	op = LOR; break;
	case REDUCEADD:	op = ADD; break;
	case REDUCEAND:	op = AND; break;
	case REDUCEMAX:	op = MAX; break;
	case REDUCEMIN:	op = MIN; break;
	case REDUCEMUL:	op = MUL; break;
	case REDUCEOR:	op = OR; break;
	case REDUCEXOR:	op = XOR; break;
	default:
		{
			char buf[50];
			snprintf(buf,
				 50,
				 "redfrag() unknown op=%s",
				 opname(t->op));
			bug(buf);
			/* If we are here, there is an internal bug, but this
			   next lines gets rid of a warning, and doesn't cost
			   anything in normal execution time.
			*/
			op=ADD;
		}
	}

#ifdef	OLD
#ifdef	NOTNOW
	/* Use binary ops to reduce to a single p64_t...
	   this uses a binary tree rather than linear summation
	*/
	if (s > 1) {
		register int i, j, k;

		for (i=1; i<s; i+=i) {
			for (j=0; (j+i)<s; j+=(i + i)) {
				k = binop(op,
					  *((t->down)->regvec + j),
					  *((t->down)->regvec + j + i),
					  (t->down)->type);
				*((t->down)->regvec + j) = k;
			}
		}
	}
#endif

	/* Use binary ops to reduce to a single p64_t...
	   a binary tree would maximize parallelism, but that also
	   maximizes register pressure, and we only have 8 registers;
	   conversely, a linear reduction minimizes register use, but
	   provides no parallelism, and we may have two MMX pipelines.
	   For now, use linear summation assuming that there are
	   probably other instructions to insert between for pipes.
	*/
	if (s > 1) {
		register int i, j;

		j = *((t->down)->regvec);
		for (i=1; i<s; ++i) {
			j = binop(op,
				  j,
				  *((t->down)->regvec + i),
				  (t->down)->type);
		}
		*((t->down)->regvec) = j;
	}

	/* Now reduce within the single remaining fragment */
	{
		register int i = *((t->down)->regvec);

		switch (t->op) {
		case ALL:
		case ANY:
			i = unop(t->op, i, typnull);
			break;
		default:
			switch (bitsperfield((t->down)->type.bits)) {
			case 1:
				switch (t->op) {
				case REDUCEAND:
				case REDUCEMIN:
				case REDUCEMUL:
					i = unop(ALL, i, typnull);
					break;
				case REDUCEMAX:
				case REDUCEOR:
					i = unop(ANY, i, typnull);
					break;
				case REDUCEADD:
				case REDUCEXOR:
					/* really parity... */
					i = unop(REDUCEXOR, i, typnull);
				}
				break;
			case 2:
			{
				int j, k;

				if (t->down->type.attr & TYP_UNSIGN) {
					j = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x33)),
						  typnull);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 2ULL),
						  typnull);
					k = binop(AND,
						  k,
						  immedu(cvt1x8uto16x8u(0x33)),
						  typnull);
					i = binop(op, j, k, typ4u);
				} else {
					j = binop(SHL,
						  i,
						  immed64u((p64_t) 2ULL),
						  typnull);
					j = binop(SHR,
						  j,
						  immed64u((p64_t) 2ULL),
						  typ4);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 2ULL),
						  typ4);
					i = binop(op, j, k, typ4);
				}

				/* Fall through... */
			}
			case 4:
			{
				int j, k;

				if (t->down->type.attr & TYP_UNSIGN) {
					j = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x0f)),
						  typnull);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 4ULL),
						  typnull);
					k = binop(AND,
						  k,
						  immedu(cvt1x8uto16x8u(0x0f)),
						  typnull);
					i = binop(op, j, k, typ8u);
				} else {
					j = binop(SHL,
						  i,
						  immed64u((p64_t) 4ULL),
						  typnull);
					j = binop(SHR,
						  j,
						  immed64u((p64_t) 4ULL),
						  typ8);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 4ULL),
						  typ8);
					i = binop(op, j, k, typ8);
				}

				/* Fall through... */
			}
			case 8:
			{
				int j, k;

				if (t->down->type.attr & TYP_UNSIGN) {
					j = binop(AND,
						  i,
						  immedu(
						    cvt1x16uto8x16u(0x00ff)),
						  typnull);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 8ULL),
						  typnull);
					k = binop(AND,
						  k,
						  immedu(
						    cvt1x16uto8x16u(0x00ff)),
						  typnull);
					i = binop(op, j, k, typ16u);
				} else {
					j = binop(SHL,
						  i,
						  immed64u((p64_t) 8ULL),
						  typnull);
					j = binop(SHR,
						  j,
						  immed64u((p64_t) 8ULL),
						  typ16);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 8ULL),
						  typ16);
					i = binop(op, j, k, typ16);
				}

				/* Fall through... */
			}
			case 16:
			{
				int j, k;

				if (t->down->type.attr & TYP_UNSIGN) {
					j = binop(AND,
						  i,
						  immedu(cvt1x32uto4x32u(
							 (p32_t)0x0000ffff)),
						  typnull);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 16ULL),
						  typnull);
					k = binop(AND,
						  k,
						  immedu(cvt1x32uto4x32u(
							 (p32_t)0x0000ffff)),
						  typnull);
					i = binop(op, j, k, typ32u);
				} else {
					j = binop(SHL,
						  i,
						  immed64u((p64_t) 16ULL),
						  typnull);
					j = binop(SHR,
						  j,
						  immed64u((p64_t) 16ULL),
						  typ32);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 16ULL),
						  typ32);
					i = binop(op, j, k, typ32);
				}

				/* Top off if 32-bit register */
				if (bpf == 32) break;

				/* Otherwise, fall through... */
			}
			case 32:
			{
				int j, k;

				if (t->down->type.attr & TYP_UNSIGN) {
					j = binop(AND,
						  i,
						  immedu(cvt1x64uto2x64u((p64_t)
							0x00000000ffffffffULL)),
						  typnull);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 32ULL),
						  typnull);
					k = binop(AND,
						  k,
						  immedu(cvt1x64uto2x64u((p64_t)
							0x00000000ffffffffULL)),
						  typnull);
					i = binop(op, j, k, typ64u);
				} else {
					j = binop(SHL,
						  i,
						  immed64u((p64_t) 32ULL),
						  typnull);
					j = binop(SHR,
						  j,
						  immed64u((p64_t) 32ULL),
						  typ64);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 32ULL),
						  typ64);
					i = binop(op, j, k, typ64);
				}

				/* Top off if 64-bit register */
				if (bpf == 64) break;

				/* Fall through... */
			}
			case 64:
			{
				int j, k;

				/* Top off if 128-bit register */
				if (bpf == 128) {
					if (t->down->type.attr & TYP_UNSIGN) {
						j = binop(SHR,
						  i,
						  immed64u((p64_t) 64ULL),
						  typnull);
						i = binop(AND,
						  i,
						  immedu((p128_t)
						       {0x0000000000000000ULL,
							0xffffffffffffffffULL}),
						  typnull);
						i = binop(op, i, j, typ128u);
					} else {
						j = binop(SHL,
						  i,
						  immed64u((p64_t) 64ULL),
						  typ128);
						j = binop(SHR,
						  j,
						  immed64u((p64_t) 64ULL),
						  typ128);
						k = binop(SHR,
						  i,
						  immed64u((p64_t) 64ULL),
						  typ128);
						/* Combine the two halves (j and
						   k), as every narrower step
						   does; the unreduced "i" must
						   not take part. */
						i = binop(op, j, k, typ128);
					}
					break;
				} else {
					bug ("redfrag() bpf not supported");
				}
			}
			}
		}

		*(t->regvec) = i;
	}

#else
	/* Reduce within each fragment */
	{
	    int frag;

	    for (frag=0; frag<s; ++frag) {
		register int i = *((t->down)->regvec + frag);

		switch (t->op) {
		case ALL:
		case ANY:
			i = unop(t->op, i, typnull);
			break;
		default:
			/* Enter the ladder at the operand's field width and
			   fall through the wider steps; each step combines
			   pairs of neighbouring fields into one field of
			   twice the width. */
			switch (bitsperfield((t->down)->type.bits)) {
			case 1:
				switch (t->op) {
				case REDUCEAND:
				case REDUCEMIN:
				case REDUCEMUL:
					i = unop(ALL, i, typnull);
					break;
				case REDUCEMAX:
				case REDUCEOR:
					i = unop(ANY, i, typnull);
					break;
				case REDUCEADD:
				case REDUCEXOR:
					/* really parity... */
					i = unop(REDUCEXOR, i, typnull);
				}
				break;
			case 2:
			{
				int j, k;

				if (t->down->type.attr & TYP_UNSIGN) {
					j = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x33)),
						  typnull);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 2ULL),
						  typnull);
					k = binop(AND,
						  k,
						  immedu(cvt1x8uto16x8u(0x33)),
						  typnull);
					i = binop(op, j, k, typ4u);
				} else {
					j = binop(SHL,
						  i,
						  immed64u((p64_t) 2ULL),
						  typnull);
					j = binop(SHR,
						  j,
						  immed64u((p64_t) 2ULL),
						  typ4);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 2ULL),
						  typ4);
					i = binop(op, j, k, typ4);
				}

				/* Fall through... */
			}
			case 4:
			{
				int j, k;

				if (t->down->type.attr & TYP_UNSIGN) {
					j = binop(AND,
						  i,
						  immedu(cvt1x8uto16x8u(0x0f)),
						  typnull);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 4ULL),
						  typnull);
					k = binop(AND,
						  k,
						  immedu(cvt1x8uto16x8u(0x0f)),
						  typnull);
					i = binop(op, j, k, typ8u);
				} else {
					j = binop(SHL,
						  i,
						  immed64u((p64_t) 4ULL),
						  typnull);
					j = binop(SHR,
						  j,
						  immed64u((p64_t) 4ULL),
						  typ8);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 4ULL),
						  typ8);
					i = binop(op, j, k, typ8);
				}

				/* Fall through... */
			}
			case 8:
			{
/* HEREHERE - AltiVec should use vsum4[su]bs when possible */
				int j, k;

				if (t->down->type.attr & TYP_UNSIGN) {
					j = binop(AND,
						  i,
						  immedu(
						    cvt1x16uto8x16u(0x00ff)),
						  typnull);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 8ULL),
						  typnull);
					k = binop(AND,
						  k,
						  immedu(
						    cvt1x16uto8x16u(0x00ff)),
						  typnull);
					i = binop(op, j, k, typ16u);
				} else {
					j = binop(SHL,
						  i,
						  immed64u((p64_t) 8ULL),
						  typnull);
					j = binop(SHR,
						  j,
						  immed64u((p64_t) 8ULL),
						  typ16);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 8ULL),
						  typ16);
					i = binop(op, j, k, typ16);
				}

				/* Fall through... */
			}
			case 16:
			{
/* HEREHERE - AltiVec should use vsum4shs when possible */
				int j, k;

				if (t->down->type.attr & TYP_UNSIGN) {
					j = binop(AND,
						  i,
						  immedu(cvt1x32uto4x32u(
							 (p32_t)0x0000ffff)),
						  typnull);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 16ULL),
						  typnull);
					k = binop(AND,
						  k,
						  immedu(cvt1x32uto4x32u(
							 (p32_t)0x0000ffff)),
						  typnull);
					i = binop(op, j, k, typ32u);
				} else {
					j = binop(SHL,
						  i,
						  immed64u((p64_t) 16ULL),
						  typnull);
					j = binop(SHR,
						  j,
						  immed64u((p64_t) 16ULL),
						  typ32);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 16ULL),
						  typ32);
					i = binop(op, j, k, typ32);
				}

				/* Top off if 32-bit register */
				if (bpf == 32) break;

				/* Otherwise, fall through... */
			}
			case 32:
			{
				int j, k;

				if (t->down->type.attr & TYP_UNSIGN) {
					j = binop(AND,
						  i,
						  immedu(cvt1x64uto2x64u((p64_t)
							0x00000000ffffffffULL)),
						  typnull);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 32ULL),
						  typnull);
					k = binop(AND,
						  k,
						  immedu(cvt1x64uto2x64u((p64_t)
							0x00000000ffffffffULL)),
						  typnull);
					i = binop(op, j, k, typ64u);
				} else {
					j = binop(SHL,
						  i,
						  immed64u((p64_t) 32ULL),
						  typnull);
					j = binop(SHR,
						  j,
						  immed64u((p64_t) 32ULL),
						  typ64);
					k = binop(SHR,
						  i,
						  immed64u((p64_t) 32ULL),
						  typ64);
					i = binop(op, j, k, typ64);
				}

				/* Top off if 64-bit register */
				if (bpf == 64) break;

				/* Fall through... */
			}
			case 64:
			{
				int j, k;

				/* Top off if 128-bit register */
				if (bpf == 128) {
				    if (optcpu & CPU_AltiVec) {
					/* Want result in virtual field 0 */
					if (t->down->type.attr & TYP_UNSIGN) {
						j = binop(SHR,
						  i,
						  immed64u((p64_t) 64ULL),
						  typnull);
						i = binop(AND,
						  i,
						  immedu((p128_t)
						     {{0xffffffffffffffffULL,
						       0x0000000000000000ULL}}),
						  typnull);
						i = binop(op, i, j, typ128u);
					} else {
						j = binop(SHL,
						  i,
						  immed64u((p64_t) 64ULL),
						  typ128u);
						j = binop(SHR,
						  j,
						  immed64u((p64_t) 64ULL),
						  typ128);
						i = binop(SHR,
						  i,
						  immed64u((p64_t) 64ULL),
						  typ128);
						i = binop(op, i, j, typ128);
					}
				    } else {
					if (t->down->type.attr & TYP_UNSIGN) {
						/* NOTE(review): the mask
						   halves are swapped relative
						   to the AltiVec variant above;
						   confirm this matches the
						   field layout of this
						   target. */
						j = binop(SHR,
						  i,
						  immed64u((p64_t) 64ULL),
						  typnull);
						i = binop(AND,
						  i,
						  immedu((p128_t)
						     {{0x0000000000000000ULL,
						       0xffffffffffffffffULL}}),
						  typnull);
						i = binop(op, i, j, typ128u);
					} else {
						j = binop(SHL,
						  i,
						  immed64u((p64_t) 64ULL),
						  typ128);
						j = binop(SHR,
						  j,
						  immed64u((p64_t) 64ULL),
						  typ128);
						k = binop(SHR,
						  i,
						  immed64u((p64_t) 64ULL),
						  typ128);
						/* Combine the two halves (j and
						   k), as every narrower step
						   and the AltiVec variant do;
						   the unreduced "i" must not
						   take part. */
						i = binop(op, j, k, typ128);
					}
				    }
				    break;
				} else {
					bug ("redfrag() bpf not supported");
				}
			}
			}
		}

		*((t->down)->regvec + frag) = i;
	    }
	}

	/* Use binary ops to reduce to a single fragment.
	   A binary tree would maximize parallelism, but that also
	   maximizes register pressure, and we only have 8 registers;
	   conversely, a linear reduction minimizes register use, but
	   provides no parallelism, and we may have two MMX pipelines.
	   For now, use linear summation assuming that there are
	   probably other instructions to insert between for pipes.
	*/
	{
		register int i, j;

		j = *((t->down)->regvec);
		for (i=1; i<s; ++i) {
			j = binop(op,
				  j,
				  *((t->down)->regvec + i),
				  t->type);
		}
		*(t->regvec) = j;
	}
#endif
}

static void
castfrag(register tree *t)
{
	/* Build a tuple tree to handle casting operations on
	   the fragments of a vector.

	   "t" is the cast node and t->down the value being cast.  The
	   cases are tried in this order:
	   - same dimension and same field width: the operand's tuple
	     indices are copied;
	   - single value to vector: the value is replicated into every
	     field and the same tuple is used for every fragment;
	   - narrower fields: pack() is used (an int/float conversion,
	     if any, is applied to the operand first);
	   - wider fields: interleave() is used.
	   Except for the narrowing case, an int/float conversion is
	   applied to the result afterwards when the TYP_FLOAT attribute
	   of "t" and of its operand differ.

	   A missing operand is reported through bug() before anything
	   is read from it; t->regvec is then left filled with -1.
	*/

	register int i;
	register int s;
	register int b;
	register int bfrom;

	#ifdef DEBUG_FUNCS
	{
		char buf[50];

		snprintf(buf,
			 50,
			 "Start castfrag()...t=%p",
			 t);
		info(0, buf);
	}
	#endif

	s = enh_size(t->type);	/* # frags to store t */
	b = bitsperfield(t->type.bits);

	/* Allocate array of tuple indices for the fragments of "t" */
	regvec(t);

	/* Everything below reads from the operand, so check for it here
	   instead of after the first dereference */
	if ( !t->down ) {
		bug("castfrag() was about to dereference null node");
		return;
	}

	bfrom = bitsperfield((t->down)->type.bits);

	/* First, the easy case of a cast that really does nothing */
	if ((t->type.dim == (t->down)->type.dim) &&
	    (bfrom == b)) {
		/* Copy vector from the child */
		for (i=0; i<s; ++i) {
			*(t->regvec + i) = *((t->down)->regvec + i);
		}
		goto intfloat;
	}

	/* Ok, now the case of a cast from one value to a vector */
	if ((t->type.dim > 1) &&
	    ((t->down)->type.dim == 1)) {
		register int rep;
		register int j;

		/* Create a fragment with the value replicated in all fields */
		if ( !t->down->regvec ) {
			bug("castfrag() was about to dereference null regvec");
			rep = -1;
		} else {
			rep = replicate(*((t->down)->regvec), b);
		}

		/* Clone it for the entire array...
		   wrong if the array length is not a multiple
		   of the fields per p64_t, but that should make no
		   difference because the reduce/store will fix it
		*/
		for (j=0; j<s; ++j) {
			*(t->regvec + j) = rep;
		}

		goto intfloat;
	}

	/* Making elements smaller? */
	if (bfrom > b) {
		/* Kludge to make 32-bit float-to-int conversion
		   happen before size conversion.
		*/
		if ((t->type.attr & TYP_FLOAT) !=
		    ((t->down)->type.attr & TYP_FLOAT)) {
			for (i=0; i<enh_size((t->down)->type); ++i) {
				*((t->down)->regvec + i) =
				  unop(((t->type.attr & TYP_FLOAT) ?
					      I2F : F2I),
					     *((t->down)->regvec + i),
					     typ32);
			}
		}

		/* Handle everything else */
		pack(bfrom,
		     (t->down)->regvec,
		     b,
		     t->regvec,
		     enh_size((t->down)->type));
		/* The int/float part was already done above, so the
		   conversion at "intfloat" is skipped */
		return;
	}

	/* Making elements bigger is harder... */
	if (b > bfrom) {
		interleave ( bfrom,
			(t->down)->regvec,
			b,
			t->regvec,
			enh_size(t->type),
			t->type.attr);
		goto intfloat;
	}

	bug("unimplemented type cast");
	return;

intfloat:
	/* Int/float conversion?
	   First do other parts of the conversion,
	   then do the int/float part
	*/

	if ((t->type.attr & TYP_FLOAT) !=
	    ((t->down)->type.attr & TYP_FLOAT)) {
		for (i=0; i<s; ++i) {
			*(t->regvec + i) =
				unop(((t->type.attr & TYP_FLOAT) ?
				      I2F : F2I),
				     *(t->regvec + i),
				     t->type);
		}
	}

	#ifdef DEBUG_FUNCS
		info(0, "End castfrag().");
	#endif
}

static void
shiftfrag(register tree *t)
{
	/* Build a tuple tree to shift vector left by t->num fields.
	   If t->num is negative, shift right by -(t->num) fields.

	   The work is done in three steps:
	     1. The last fragment of the operand (t->down) is ANDed with
		target_mask() when that mask is not all ones, so padding
		bits are not shifted into the result.  Note that this
		rewrites the operand's own regvec entry in place.
	     2. The whole-fragment part of the shift is done by copying
		tuple indices from the operand into t->regvec at an
		offset; positions with no source get a zero immediate.
	     3. Any remaining shift of less than one fragment is done by
		combining each fragment with its neighbour using
		SHR/SHL/AND/OR tuples.

	   On return t->regvec holds one tuple index per fragment of "t".
	*/

	register int i;
	register int s;		/* frags in vector */
	register int b;		/* actual bits per field */
	register int e;		/* fields per frag */
	register int d;		/* shift count in fields (>0 means rt.) */
	register int wordoff;	/* shift count in full frags (>0 means rt.) */
	register long long dbits; /* remaining shift count within a frag (after
				     full frags shift) in bits (lt.) */
	p128_t lastword;

	#ifdef DEBUG_FUNCS
		info(0, "Start shiftfrag()...");
	#endif

	s = enh_size(t->type);
	b = bitsperfield(t->type.bits);
	e = (bitsperfrag() / b);
	d = -(t->num);
	/* Whole fragments to move; written so the quotient is rounded
	   toward zero for either sign of d */
	wordoff = ((d < 0) ? -(-d / e) : (d / e));

	/* Allocate array of tuple indices for the fragments of "t" */
	regvec(t);


	/* if the last word has padding, sanitize the padding
	   before shifting:
	*/

	/* First, get a mask for the field bits in the last frag */
	lastword = target_mask((t->down)->type);

	/* Now apply it... */
	switch (bitsperfrag()) {
	case 128:
		if ( (lastword.q[1] != 0xffffffffffffffffLL) ||
		     (lastword.q[0] != 0xffffffffffffffffLL) ) {
			*((t->down)->regvec + s - 1) =
				binop(AND,
				      *((t->down)->regvec + s - 1),
				      immed128(lastword),
				      typnull);
		}
		break;
	case 64:
		if (lastword.q[0] != 0xffffffffffffffffLL) {
			*((t->down)->regvec + s - 1) =
				binop(AND,
				      *((t->down)->regvec + s - 1),
				      immed64((p64_t)lastword.q[0]),
				      typnull);
		}
		break;
	case 32:
		if (lastword.d[0] != 0xffffffffL) {
			*((t->down)->regvec + s - 1) =
				binop(AND,
				      *((t->down)->regvec + s - 1),
				      immed32((p32_t)lastword.d[0]),
				      typnull);
		}
		break;
	default:
		bug("shiftfrag A: unknown register size");
	}


	/* shift elements to within a word of right spots,
	   because that costs nothing -- we are just renaming the
	   elements within the vector
	*/
	for (i=0; i<s; ++i) {
		register int j = i + wordoff;

		if ((j < 0) || (j >= s)) {
			/* No source fragment: fill with zero */
			switch (bitsperfrag()) {
			case 128:
				*(t->regvec+i) = immed128((p128_t){{0LL,0LL}});
				break;
			case 64:
				*(t->regvec+i) = immed64((p64_t) 0LL);
				break;
			case 32:
				*(t->regvec+i) = immed32((p32_t) 0);
				break;
			default:
				bug("shiftfrag B: unknown register size");
			}
		} else {
			*(t->regvec + i) = *((t->down)->regvec + j);
		}
	}
	d -= (wordoff * e);

	/* Here, d is the number of fields to shift within each single fragment
	   after the above shift by full fragments.
	   We are done if d is now 0.
	*/
	if (d == 0) return;

	/* now we have just a little shifting to do */
	dbits = (d * b);	/* Number of bits to shift frag right by */
	if (d > 0) {
		/* Shift right by d fields */
		p128_t low;	/* ones in the bits kept from the frag itself */

		/* make shift word masks */
		switch (bitsperfrag()) {
		case 128:
			if ( dbits>63 ) {
				low.uq[1] = 0ULL;
				if ( dbits == 64 ) {
					/* Exactly the low half survives.
					   (1ULL << 64) would be an undefined
					   shift, so spell the mask out. */
					low.uq[0] = 0xffffffffffffffffULL;
				} else {
					low.uq[0] =
					  ((1ULL << (128ULL-(unsigned)dbits))
					   -1ULL);
				}
			} else {
				low.uq[1] = ((1ULL << (64ULL-(unsigned)dbits))
					     -1ULL);
				low.uq[0] = 0xffffffffffffffffULL;
			}
			break;
		case 64:
			low.uq[1] = 0ULL;
			low.uq[0] = ((1ULL << (64ULL-(unsigned)dbits)) - 1ULL);
			break;
		case 32:
			low.uq[1] = 0ULL;
			low.ud[1] = 0UL;
			low.uq[0] = ((1UL << (32ULL-(unsigned)dbits)) - 1UL);
			break;
		default:
			bug("shiftfrag C: unknown register size");
		}

		/* shift elements to lower positions */
		/* Walk upward so that entry i+1 is still unshifted when it
		   is read as the neighbour of entry i.  Entries at or above
		   s-wordoff were zero-filled above and are left alone. */
		for (i=0; i<s-wordoff; ++i) {
			register int j = *(t->regvec + i);
			register int k;

			if ((i + 1) < s) {
				k = *(t->regvec + i + 1);
			} else {
				/* NOTE(review): a 128-bit zero is used here
				   whatever bitsperfrag() is, unlike the
				   zero fill above -- confirm this is
				   intended for 64- and 32-bit targets. */
				k = immed128((p128_t) {{0LL,0LL}});
			}

			/* shift k into j d positions */
			switch (bitsperfrag()) {
			case 128:
				j = binop(SHR, j,
					  immed128((p128_t) {{dbits,0LL}}),
					  typnull);
				j = binop(AND, j, immed128(low), typnull);
				k = binop(SHL,
					  k,
					  immed128((p128_t){{128LL-dbits,0LL}}),
					  typnull);
				k = binop(AND,
					  k,
					  immed128((p128_t)
						   {{~low.q[0],~low.q[1]}}),
					  typnull);
				break;
			case 64:
				j = binop(SHR, j, immed64((p64_t) dbits),
					  typnull);
				j = binop(AND, j, immed64((p64_t) low.q[0]),
					  typnull);
				k = binop(SHL, k,
					  immed64((p64_t) (64LL - dbits)),
					  typnull);
				k = binop(AND, k, immed64((p64_t) ~low.q[0]),
					  typnull);
				break;
			case 32:
				j = binop(SHR, j,
					  immed32((p32_t)((int)dbits)),
					  typnull);
				j = binop(AND, j, immed32((p32_t) low.d[0]),
					  typnull);
				k = binop(SHL, k,
					  immed32((p32_t)(32-((int)dbits))),
					  typnull);
				k = binop(AND, k, immed32((p32_t) ~low.d[0]),
					  typnull);
				break;
			default:
				bug("shiftfrag E: unknown register size");
			}
			j = binop(OR, j, k, typnull);
			*(t->regvec + i) = j;
		}
	} else {
		/* Shift left by d fields */
		p128_t low;	/* ones in the bits taken from the neighbour */

		/* Make dbits positive */
		dbits = -dbits;

		/* make shift word masks */
		switch (bitsperfrag()) {
		case 128:
			if ( dbits > 63 ) {
				low.uq[1] = ((1ULL<<(unsigned)(dbits-64))-1ULL);
				low.uq[0] = 0xffffffffffffffffULL;
			} else {
				low.uq[1] = 0ULL;
				low.uq[0] = ((1ULL << dbits) - 1ULL);
			}
			break;
		case 64:
			low.uq[1] = 0ULL;
			low.uq[0] = ((1ULL << (unsigned)dbits) - 1ULL);
			break;
		case 32:
			low.q[1] = 0ULL;
			low.d[1] = 0;
			low.d[0] = ((1U << (unsigned)dbits) - 1U);
			break;
		default:
			bug("shiftfrag F: unknown register size");
		}

		/* shift elements to higher positions */
		/* Walk downward so that entry i-1 is still unshifted when it
		   is read as the neighbour of entry i.  Entries below
		   -wordoff were zero-filled above and are left alone. */
		for (i=s-1; i>=-wordoff; --i) {
			register int j = *(t->regvec + i);
			register int k;

			if ((i - 1) >= -wordoff) {
				k = *(t->regvec + i - 1);
			} else {
				/* NOTE(review): 128-bit zero regardless of
				   bitsperfrag(), as in the right-shift
				   case -- confirm. */
				k = immed128((p128_t){{0LL,0LL}});
			}

			/* shift k into j d positions */
			switch (bitsperfrag()) {
			case 128:
				j = binop(SHL,
					  j,
					  immed128((p128_t){{dbits,0LL}}),
					  typnull);
				/* I think this may be redundant depending on
				   how low is derived from dbits. */
				j = binop(AND,
					  j,
					  immed128((p128_t)
						   {{~low.q[0], ~low.q[1]}}),
					  typnull);
				k = binop(SHR,
					  k,
					  immed128((p128_t){{128LL-dbits,0LL}}),
					  typnull);
				/* I think this may be redundant depending on
				   how low is derived from dbits. */
				k = binop(AND,
					k,
					immed128((p128_t){{low.q[0],low.q[1]}}),
					typnull);
				break;
			case 64:
				j = binop(SHL, j, immed64((p64_t) dbits),
					  typnull);
				j = binop(AND, j, immed64((p64_t) ~low.q[0]),
					  typnull);
				k = binop(SHR, k,
					  immed64((p64_t) (64LL - dbits)),
					  typnull);
				k = binop(AND, k, immed64((p64_t) low.q[0]),
					  typnull);
				break;
			case 32:
				j = binop(SHL, j,
					  immed32((p32_t)((int)dbits)),
					  typnull);
				j = binop(AND, j, immed32((p32_t) ~low.d[0]),
					  typnull);
				k = binop(SHR, k,
					  immed32((p32_t)(32-((int)dbits))),
					  typnull);
				k = binop(AND, k, immed32((p32_t) low.d[0]),
					  typnull);
				break;
			default:
				bug("shiftfrag H: unknown register size");
			}
			j = binop(OR, j, k, typnull);
			*(t->regvec + i) = j;
		}
	}
}

static void
rotatefrag(register tree *t)
{
	/* Build a tuple tree to rotate a vector by "t"->num fields.

	   The rotate is assembled from two ordinary shifts: a shallow
	   copy of the node is shifted by the original count, the node
	   itself is then shifted by -(dim - num), and the two fragment
	   lists are ORed together pairwise.  This may create tuples
	   nobody references; per the original note those get ignored.
	*/
	tree shifted = *t;		/* copy that takes the first shift */
	register int frag;
	register int nfrags;		/* frags in vector */

	#ifdef DEBUG_FUNCS
		info(0, "Start rotatefrag()...");
	#endif

	nfrags = enh_size(t->type);

	/* Disabled mask-building code, kept as found */
#ifdef NOTDEFD
	register int b = bitsperfield(t->type.bits);
	register int e = (bitsperfrag() / b);		/* fields per frag */
	register int d = t->num;		/* field rotate count */
	register int wordoff = (d / e);		/* full frags rotate count */
	register int m;				/* 1s mask of leftover bits */
	register int nm;			/* 0x mask of leftover bits */
	int bitsleftover = b * (d - (wordoff * e));

	/* Make both a 1s and 0s mask of the bits that would be leftover after
	   all the full frags are removed */
	switch (bitsperfrag()) {
	case 128:
		p128_t tmp;
		if (bitsleftover > 63) {
			tmp.q[1] = (1LL << (bitsleftover-64)) - 1LL;
			tmp.q[0] = 0xffffffffffffffffLL;
		} else {
			tmp.q[1] = 0LL;
			tmp.q[0] = (1LL << bitsleftover) - 1LL;
		}
		m = immed128( tmp );
		nm = immed128((p128_t) {~tmp.q[0], ~tmp.q[1]});
		break;
	case 64:
		m = immed64((p64_t) ((1LL << bitsleftover) - 1LL));
		nm = immed64((p64_t) ~((1LL << bitsleftover) - 1LL));
		break;
	case 32:
		m = immed32((p32_t) ((1LL << bitsleftover) - 1L));
		nm = immed32((p32_t) ~((1LL << bitsleftover) - 1L));
		break;
	default:
		bug("rotatefrag: unknown register size");
	}
#endif

	/* First half: shift the copy by the count as given */
	shiftfrag(&shifted);

	/* Second half: shift the node itself by the complementary count */
	t->num = -(t->type.dim - t->num);
	shiftfrag(t);

	/* Merge the halves fragment by fragment;
	   the optimizer will remove ORs with 0s.
	*/
	for (frag = 0; frag < nfrags; ++frag) {
		t->regvec[frag] = binop(OR,
					t->regvec[frag],
					shifted.regvec[frag],
					typnull);
	}
}

static void
sizeoffrag(register tree *t)
{
	/* Build the tuple for a SIZEOF node: an immediate, as wide as the
	   target fragment, holding the size reported for the type of
	   t->symbol by the size routine matching that fragment width.
	   Only the first entry of t->regvec is filled in.
	*/
	register int *slot;

	#ifdef DEBUG_FUNCS
		info(0, "Start sizeoffrag()...");
	#endif

	regvec(t);
	slot = t->regvec;

	switch (bitsperfrag()) {
	case 128:
	{
		long long bytes = (long long)sse_size((t->symbol)->type);

		slot[0] = immed128((p128_t){{bytes,0LL}});
		break;
	}
	case 64:
	{
		long long bytes = (long long)mmx_size((t->symbol)->type);

		slot[0] = immed64((p64_t)(bytes));
		break;
	}
	case 32:
	{
		int bytes = (int)ia32_size((t->symbol)->type);

		slot[0] = immed32((p32_t)(bytes));
		break;
	}
	default:
		bug("sizeoffrag: unknown register size");
	}
}

static void
questfrag(register tree *t)
{
	/* Build a tuple tree for the trinary operation */
	/* The trinary is all done with masking only... */
	register int i, j, k, l;
	register int fragments;

	#ifdef DEBUG_FUNCS
		info(0, "Start questfrag()...");
	#endif

	fragments = enh_size(t->type);

	/* Allocate array of tuple indices for the fragments of "t" */
	regvec(t);

	/* For each fragment, build a tuple tree for the trinary operation. */
	for (i=0; i<fragments; ++i) {
		j = *((t->down)->regvec + i);			// test
		k = *(((t->down)->right)->regvec + i);		// true
		l = *((((t->down)->right)->right)->regvec + i);	// false

		/* Optimization for (a cond b)? */
		switch ( t->down->op ) {
		case EQ:
		case NE:
		case LT:
		case LE:
		case GT:
		case GE:
			/* Already in field mask form */
			break;

		default:
			switch (bitsperfrag()) {
			case 128:
				j=binop(NE,
					j,
					immed128((p128_t){{0LL,0LL}}),
					t->type);
				break;
			case 64:
				j=binop(NE, j, immed64((p64_t) 0LL), t->type);
				break;
			case 32:
				j=binop(NE, j, immed32((p32_t) 0), t->type);
				break;
			default:
				bug("questfrag: unknown register size");
			}
		}

		k = binop(AND, j, k, typnull);
		l = binop(ANDN, j, l, typnull);

		*(t->regvec + i) = binop(OR, k, l, typnull);
	}
}

static void
loadxfrag(register tree *t)
{
	/* Build a tuple tree for an indexed load: one load tuple made
	   from t->symbol and the first fragment tuple of the index
	   expression (t->down).  t->regvec gets a single entry. */
	register int *slot;

	#ifdef DEBUG_FUNCS
		info(0, "Start loadxfrag()...");
	#endif

	/* One-entry array of tuple indices for the fragment */
	slot = ((int *) malloc(sizeof(int)));
	t->regvec = slot;

	/* Generate the load tuple and record it */
	slot[0] = loadxop(t->symbol, *(t->down->regvec));
}

static void
loadfrag(register tree *t)
{
	/* Build a tuple tree to load the fragments of an array or vector */
	/* The good news is that even if the array is not
	   aligned, the MMX instructions work, so we do
	   not have any nasty special-case stuff!  Even
	   better, we don't care about reading past the
	   end, so we just blast the 64-bit words into
	   registers.
	   Of course, this isn't true for AltiVec...
	*/
	register unsigned long long i;
	register unsigned long long fragments;

	#ifdef DEBUG_FUNCS
		info(0, "Start loadfrag()...");
	#endif

	fragments = (unsigned long long)enh_size(t->type);

	/* Allocate array of tuple indices for the fragments of "t" */
	regvec(t);

	/* Generate a load tuple tree for each fragment */
	if (optcpu & CPU_AltiVec) {
		/* AltiVec loads cannot be unaligned */
		for (i=0ULL; i<fragments; ++i) {
			*(t->regvec+i) =
				uloadop(t->symbol, immed64u((p64_t)i));
		}
	} else {
		for (i=0ULL; i<fragments; ++i) {
			*(t->regvec+i) =
				loadop(t->symbol, immed64u((p64_t)i), 0);
		}
	}
}

static void
storexfrag(register tree *t)
{
	/* Build a tuple tree to store the fragments of an array or vector
	   containing a particular element. */

	register int j;
	p128_t lastmask;

	#ifdef DEBUG_FUNCS
		info(0, "Start storexfrag()...");
	#endif

	/* Allocate array of tuple indices for the fragments of "t" */
	t->regvec = ((int *) malloc(sizeof(int)));
	*(t->regvec) = -1;


	/* Build a tuple tree to store the RHS fragment
	   into the corresponding LHS fragment.
	*/

	/* Make j the tuple for the LHS fragment */
	j = *(((t->down)->right)->regvec);

	/* The last fragment requires special handling... */
	/* Get a mask for the field bits in the last frag */
	lastmask = target_mask((t->symbol)->type);

	/* If "t" is a Non-SWAR,
	   is at the top-level or is a function argument,
	   and the last fragment is only partially used,
	   then mask off the unused portions.
	*/
	if ((((t->symbol)->type.attr & TYP_SWAR) == 0) &&
	    ((t->symbol)->scope < 2)) {
		int bpf = bitsperfrag();

		if ((bpf == 32) &&
		    (lastmask.d[0]!=0xffffffffL)) {
			*(((t->down)->right)->regvec) =
				binop(AND,
				      *(((t->down)->right)->regvec),
				      immed32((p32_t)
					lastmask.d[0]),
				      typnull);
			j = loadpadop(t->symbol, 0, 0);
			j = binop(AND,
				  j,
				  immed32((p32_t)
					(~lastmask.d[0])),
				  typnull);
			j = binop(OR,
				  j,
				  *(((t->down)->right)->regvec),
				  typnull);
		} else if ((bpf == 64) &&
		  (lastmask.q[0] != 0xffffffffffffffffLL)) {
			*(((t->down)->right)->regvec) =
				binop(AND,
				      *(((t->down)->right)->regvec),
				      immed64((p64_t)
					lastmask.q[0]),
				      typnull);
			j = loadpadop(t->symbol, 0, 0);
			j = binop(AND,
				  j,
				  immed64((p64_t)
					(~lastmask.q[0])),
				  typnull);
			j = binop(OR,
				  j,
				  *(((t->down)->right)->regvec),
				  typnull);
		} else if ((bpf == 128) &&
		  ((lastmask.q[1] != 0xffffffffffffffffLL) ||
		  (lastmask.q[0] != 0xffffffffffffffffLL))) {
			p128_t maskcomp;

			maskcomp.q[1] = ~lastmask.q[1];
			maskcomp.q[0] = ~lastmask.q[0];
			*(((t->down)->right)->regvec) =
				binop(AND,
				      *(((t->down)->right)->regvec),
				      immed128(lastmask),
				      typnull);
			if (optcpu & CPU_AltiVec) {
				j = uloadpadop(t->symbol, 0);
			} else {
				j = loadpadop(t->symbol, 0, 0);
			}
			j = binop(AND,
				  j,
				  immed128(maskcomp),
				  typnull);
			j = binop(OR,
				  j,
				  *(((t->down)->right)->regvec),
				  typnull);
		}
	}

	/* Use the same tuple tree for both the LHS and STORE frags */
	*(((t->down)->right)->regvec) = j;
	*(t->regvec) = j;

	/* Generate a store tuple tree for the fragment */
	storexop(j, t->symbol, t->down->regvec[0]);
}

static void
storefrag(register tree *t)
{
	/* Build a tuple tree to store the fragments of an array or vector.

	   Every fragment tuple of t->down is stored to the matching
	   fragment of t->symbol.  The last fragment of a non-SWAR symbol
	   with scope < 2 is treated specially when its mask is not all
	   ones: the value is masked, the current contents are loaded with
	   the complementary mask applied, and the two are ORed, so the
	   bits outside the mask are written back unchanged.
	   The tuple finally stored is recorded both in t->down and in "t".
	*/
	register unsigned long long frag, val;
	register unsigned long long nfrags;

	#ifdef DEBUG_FUNCS
		info(0, "Start storefrag()...");
	#endif

	nfrags = enh_size(t->type);

	/* Allocate array of tuple indices for the fragments of "t" */
	regvec(t);

	for (frag = 0; frag < nfrags; ++frag) {
		register int *src = (t->down)->regvec;

		/* Start from the value fragment as is */
		val = src[frag];

		#ifdef DEBUG_VNUM
		if (tup[val].op == NUM)
			fprintf(Cout, "NUM was {0x%016llx,0x%016llx}\n",
				tup[val].immed.uq[0], tup[val].immed.uq[1]);
		#endif

		/* The last fragment requires special handling... */
		if (frag == (nfrags - 1)) {
			/* Mask for the field bits in the last frag */
			p128_t lastmask = target_mask((t->symbol)->type);

			/* Only non-SWAR objects at the top level or passed
			   as function arguments need their unused portion
			   preserved. */
			if (!((t->symbol)->type.attr & TYP_SWAR) &&
			    ((t->symbol)->scope < 2)) {
				switch (bitsperfrag()) {
				case 32:
					if (lastmask.d[0]!=0xffffffffL) {
						src[frag] =
						  binop(AND,
							src[frag],
							immed32((p32_t)
							  lastmask.d[0]),
							typnull);
						val = loadpadop(t->symbol,
								frag, 0);
						val = binop(AND,
							val,
							immed32((p32_t)
							  (~lastmask.d[0])),
							typnull);
						val = binop(OR,
							val,
							src[frag],
							typnull);
					}
					break;
				case 64:
					if (lastmask.q[0] !=
					    0xffffffffffffffffLL) {
						src[frag] =
						  binop(AND,
							src[frag],
							immed64((p64_t)
							  lastmask.q[0]),
							typnull);
						val = loadpadop(t->symbol,
								frag, 0);
						val = binop(AND,
							val,
							immed64((p64_t)
							  (~lastmask.q[0])),
							typnull);
						val = binop(OR,
							val,
							src[frag],
							typnull);
					}
					break;
				case 128:
					if ((lastmask.q[1] !=
					     0xffffffffffffffffLL) ||
					    (lastmask.q[0] !=
					     0xffffffffffffffffLL)) {
						p128_t maskcomp;

						maskcomp.q[1] = ~lastmask.q[1];
						maskcomp.q[0] = ~lastmask.q[0];
						src[frag] =
						  binop(AND,
							src[frag],
							immed128(lastmask),
							typnull);
						if (optcpu & CPU_AltiVec) {
							val = uloadpadop(
								t->symbol,
								frag);
						} else {
							val = loadpadop(
								t->symbol,
								frag, 0);
						}
						val = binop(AND,
							val,
							immed128(maskcomp),
							typnull);
						val = binop(OR,
							val,
							src[frag],
							typnull);
					}
					break;
				default:
					/* other widths: store untouched */
					break;
				}
			}
		}

		/* Same tuple for the value node and for the STORE node */
		src[frag] = val;
		t->regvec[frag] = val;

		#ifdef DEBUG_VNUM
			if (tup[val].op == NUM)
				fprintf(Cout, "NUM is {0x%016llx,0x%016llx}\n",
					tup[val].immed.uq[0],
					tup[val].immed.uq[1]);
		#endif

		/* Emit the store itself; AltiVec targets go through
		   ustoreop(), everything else through storeop() */
		if (optcpu & CPU_AltiVec) {
			ustoreop(val, t->symbol, immed64u((p64_t)frag),
				 typnull);
		} else {
			storeop(val, t->symbol, immed64u((p64_t)frag), 0,
				typnull);
		}
	}

	#ifdef DEBUG_FUNCS
		info(0, "End storefrag().");
	#endif
}

static char *
argtype(typ *type)
{
	/* Returns a string containing the C type to use for "type".
	   Assumes that you should use the largest enhanced type
	     for the given architecture.
	   Overwrites the static buffer with each call, so the result
	     must be used or copied before argtype() is called again.

	   The buffer is sized for the worst case in which every piece
	   below is appended:
	     "unsigned " + "short " + "long " + "long long " +
	     "void" + "char" + "int" + "p128_t"  = 47 characters + NUL.
	   (The previous 22-byte buffer was already overrun by
	   "unsigned long long int", which needs 23 bytes.) */
	static char buffer[64];

	buffer[0] = '\0';

	/* I'll treat the attributes TYP_UNSIGN and TYP_SHORT as modifiers to
	   the base type, which should also be part of the type name.
	   Although, this isn't actually what Scc does.  ;-) */
	if (type->attr & TYP_UNSIGN) strcat(buffer, "unsigned ");
	if (type->attr & TYP_SHORT) strcat(buffer, "short ");
	if (type->attr & TYP_LONG) strcat(buffer, "long ");
	if (type->attr & TYP_LLONG) strcat(buffer, "long long ");

	/* The base types should not be mixed. */
	if (type->attr & TYP_VOID) strcat(buffer, "void");
	if (type->attr & TYP_CHAR) strcat(buffer, "char");
	if (type->attr & TYP_INT) strcat(buffer, "int");

	/* A SWAR type is named after the fragment type of the target,
	   whether or not it is also a float; a non-SWAR float is a
	   plain "float". */
	if (type->attr & TYP_SWAR) {
		switch (bitsperfrag()) {
		case 128:
			strcat(buffer, "p128_t");
			break;
		case 64:
			strcat(buffer, "p64_t");
			break;
		case 32:
			strcat(buffer, "p32_t");
			break;
		default:
			bug("argtype: unknown register size");
		}
	} else if (type->attr & TYP_FLOAT) {
		strcat(buffer, "float");
	}
	return buffer;
}

static char *
ctype(typ *type)
{
	/* Returns the C declaration type to use for "type": an optional
	   storage class followed by "void" and/or the fragment type of
	   the target architecture (the largest enhanced type).
	   The result lives in a static buffer that is overwritten by
	   every call. */
	static char buffer[25];
	const char *fragname = "";

	buffer[0] = '\0';

	/* Storage class.  extern and static together make no sense; both
	   are emitted on purpose so the mistake shows up loudly in the
	   output rather than being checked for here. */
	if (type->attr & TYP_EXTERN) strcat(buffer, "extern ");
	if (type->attr & TYP_STATIC) strcat(buffer, "static ");

	/* void and a value type should be exclusive as well; again both
	   are emitted if both are set. */
	if (type->attr & TYP_VOID) strcat(buffer, "void");

	/* Every value type is held in fragments of the target's width */
	if (type->attr &
	    (TYP_CHAR | TYP_SHORT | TYP_INT |
	     TYP_LONG | TYP_LLONG | TYP_FLOAT | TYP_SWAR)) {
		switch (bitsperfrag()) {
		case 128:
			fragname = "p128_t";
			break;
		case 64:
			fragname = "p64_t";
			break;
		case 32:
			fragname = "p32_t";
			break;
		default:
			bug("ctype: unknown register size");
		}
		strcat(buffer, fragname);
	}
	return buffer;
}

static int
cdim(typ *type)
{
	/* Number of fragments needed to hold "*type" when it is a SWAR
	   type or has more than one element; 0 for anything else. */

	if (!(type->attr & TYP_SWAR) && !(type->dim > 1)) {
		return 0;
	}
	return ( enh_size(*type) );
}

static void
flush_enh_regs (void)
{
	/* Output code to flush state of enhanced registers:
	   "femms();" for 3DNow!/XMMX targets, "emms();" for plain MMX,
	   nothing for other targets.  A newline is written and the
	   output flushed in every case. */
	const char *call = NULL;

	if ( optcpu & (CPU_3DNow | CPU_XMMX) ) {
		call = "femms();";
	} else if ( optcpu & CPU_MMX ) {
		call = "emms();";
	}

	if ( call ) {
		Ctab();
		fprintf(Cout, "%s", call);
	}
	fprintf(Cout, "\n");
	fflush(Cout);
}

static void
fragment(register tree *t, register tree *root)
{
	/* This function handles output for control constructs, and drives
	   the *frag() functions to generate tuple lists to operate on the
	   fragments of the vectors operated on in the IR tree "t".

	   "t" is the node to process (a null "t" is a no-op) and "root"
	   is the root of the tree being generated, used to recognise the
	   outermost block and the first statement of the function body.

	   Processing is in three phases:
	     1. Control constructs (first switch).  Most of them emit
		their C text, recurse where needed and return directly;
		BLOCK only emits its prologue and falls through.
	     2. All children of "t" are fragmented left to right.
	     3. Expressions and the BLOCK epilogue (second switch), which
		also adds each operation's estimated serial cost to
		fieldcount.
	*/

	#ifdef DEBUG_FUNCS
		info(0, "Start fragment()...");
	#endif

	if (t) {
		register tree *n;

		/* Generate the control structures and tuple lists for the
		   operation represented by the current IR tree node.
		*/
		switch (t->op) {
		case BLOCK:
		{
			register int serial = (t->symbol)->serial;
			register sym *p = &(symtab[0]);
			register int dim;

			/* If this is the root node, then there is no previous
			   block to end.  Otherwise, this is a new block within
			   a block, so we need to end the outer one. */
			if (t != root)
				end_bb();

			if (optlines) {
				fprintf(Cout, "\n/* # %d \"%s\" */\n",
					t->line, sourcename);
			}
			Ctab(); ++Ctabpos;
			fprintf(Cout, "{\n");
			fflush(Cout);

			/* Declare every named, non-label symbol that
			   carries this block's serial number */
			while (p <= (symptr - 1)) {
				/* This serial number and non-empty text */
				if ((p->serial == serial) &&
				    (p->text != 0) &&
				    (*(p->text)) &&
				    (!(p->type.attr & TYP_LAB))) {
					Ctab();
					fprintf(Cout, "volatile ");
					fprintf(Cout, "%s ", ctype(&(p->type)));
					fprintf(Cout, "%s", p->text);
					if ( (dim=cdim(&(p->type))) )
						fprintf(Cout, "[%d]", dim);
					fprintf(Cout, ";\n");
					fflush(Cout);
				}
				++p;
			}

			begin_bb();
			break;
		}
		case IF:
		{
			/* Condition first, then the guarded statement(s) */
			fragment(t->down,root);
			end_bb();
			Ctab(); ++Ctabpos;
			fprintf(Cout, "if (_if[0].d[0]) {\n");
			begin_bb();
			fragment((t->down)->right,root);
			end_bb();
			if (((t->down)->right)->right) {
				/* has an else clause */
				--Ctabpos; Ctab(); ++Ctabpos;
				fprintf(Cout, "} else {\n");
				begin_bb();
				fragment(((t->down)->right)->right,root);
				end_bb();
			}
			--Ctabpos; Ctab();
			fprintf(Cout, "} /* end if */\n");
			fflush(Cout);
			begin_bb();
			return;
		}
		case WHILE:
		{
			end_bb();
			Ctab();
			/* The test code emitted here works as is, because
			   _while is a SWAR type (see while, for, do sections
			   in swarc.g.  However, considering the way _while is
			   actually used, I think it should be a C type
			   instead.  Masking out the TYP_SWAR in the above
			   cases causes a test in those cases to fail, which
			   I don't know how to correct off-hand, so for now
			   I'll just change this to work with those cases as
			   they are currently written (7-20-2000).
			*/
			switch(bitsperfrag()) {
			case 128:
				fprintf(Cout,
					"if (!_while[0].d[%d]) goto %s;\n",
					1,
					/* was: target_field(0,32), */
					(t->symbol)->text);
				break;
			case 64:
				fprintf(Cout,
					"if (!_while[0].d[%d]) goto %s;\n",
					target_field(0,32),
					(t->symbol)->text);
				break;
			case 32:
				fprintf(Cout,
					"if (!_while[0].d) goto %s;\n",
					(t->symbol)->text);
				break;
			default:
				/* Without this the loop-exit test would be
				   dropped silently; report it like every
				   other register-size switch in this file */
				bug("fragment: unknown register size");
			}
			fflush(Cout);
			begin_bb();
			return;
		}
		case DO:
		{
			end_bb();
			Ctab();
			fprintf(Cout,
				"if (_do[0].d[0]) goto %s;\n",
				(t->symbol)->text);
			fflush(Cout);
			begin_bb();
			return;
		}
		case GOTO:
		{
			end_bb();
			Ctab();
			fprintf(Cout, "goto %s;\n", (t->symbol)->text);
			fflush(Cout);
			begin_bb();
			return;
		}
		case LABEL:
		{
			end_bb();
			fprintf(Cout, "%s:\n", (t->symbol)->text);
			fflush(Cout);
			begin_bb();
			return;
		}
		case CCODE:
		{
			end_bb();

			/* Flush enhanced state just in case */
			/* We don't need to do this if this is the first
			   statement in the function body */
			if (root->down != t) flush_enh_regs();

			/* Now output the C code in its own scope */
			Ctab();
			fprintf(Cout, "{\n");
			fprintf(Cout, "\n%s\n", (t->symbol)->text);
			Ctab();
			fprintf(Cout, "}\n");
			fflush(Cout);
			begin_bb();
			return;
		}
		case RETURN:
		{
			/* Remember that the _return label is needed */
			returncalled = 1;
			end_bb();
			Ctab();
			fprintf(Cout, "goto _return;\n");
			fflush(Cout);
			begin_bb();
			return;
		}
		case CALL:
		{
			register int i = 0;
			register tree *p = t->down;
			register sym *argsym;

			/* first do the stores to dummy arguments */
			while (p) {
				if (p->op == EXPR) {
					fragment(p,root);
				}
				p = p->right;
			}
			end_bb();

			/* Flush enhanced state just in case */
			flush_enh_regs();

			/* Now output the function name */
			Ctab();
			fprintf(Cout, "%s(", (t->symbol)->text);

			/* and the arguments */
			/* Formal parameters are taken from the symbols
			   that follow the function's own symbol, as long
			   as their scope is one more than the function's */
			argsym = (t->symbol + 1);
			p = t->down;
			while (p) {
				/* Take care of argument type cast */
				if ((t->symbol)->scope+1 != argsym->scope) {
					/* too many args */
					error(  "function call has too many "
						"arguments");
					fprintf(Cout, "(");
				} else {
					fprintf(Cout, "((");
					fprintf(Cout, "%s ",
						argtype(&(argsym->type)) );
					fprintf(Cout, "*) ");
					++argsym;
				}

				/* Output the argument itself */
				if (p->op == EXPR) {
					fprintf(Cout,
						"&%s",
						argname(i));
				} else {
					if ((p->symbol)->scope != 1) {
						fprintf(Cout, "&");
					}
					fprintf(Cout,
						"%s",
						(p->symbol)->text);
				}
				++i;

				/* end the type cast */
				fprintf(Cout, ")");

				/* is there another arg? */
				if (p->right) fprintf(Cout, ", ");

				p = p->right;
			}

			/* end of function call args */
			fprintf(Cout, ");\n");
			fflush(Cout);
			begin_bb();
			return;
		}
		}

		/* Fragment children in order */
		if ((n = t->down)) {
			do {
				fragment(n,root);
			} while ((n = n->right));
		}

		/* Handle expressions and blocks */
		switch (t->op) {
		case BLOCK:
			#ifdef DEBUG_BLOCKS
				info(0, "fragment():A");
				fflush(stderr);
			#endif
			deadscope((t->symbol)->serial);
			#ifdef DEBUG_BLOCKS
				info(0, "fragment():B");
				fflush(stderr);
			#endif
			end_bb();
			#ifdef DEBUG_BLOCKS
				info(0, "fragment():C");
				fflush(stderr);
			#endif
			--Ctabpos; Ctab();
			fprintf(Cout, "}\n");
			fflush(Cout);
			begin_bb();
			break;
		case QUEST:
			fieldcount += (2 * t->type.dim);
			questfrag(t);
			break;
		case LOADX:
			fieldcount += (2 * t->type.dim);
			loadxfrag(t);
			break;
		case LOAD:
			fieldcount += (2 * t->type.dim);
			loadfrag(t);
			break;
		case STOREX:
			fieldcount += t->type.dim;
			storexfrag(t);
			break;
		case STORE:
			fieldcount += t->type.dim;
			storefrag(t);
			break;
		case NUM:
			++fieldcount;
			numfrag(t);
			break;
		case VNUM:
			fieldcount += t->type.dim;
			vnumfrag(t);
			break;

		case ADD:
		case AND:
		case EQ:
		case GE:
		case GT:
		case LAND:
		case LE:
		case LOR:
		case LT:
		case NE:
		case OR:
		case SHL:	/* Intra-element shift */
		case SHR:	/* Intra-element shift */
		case INTRLVEVEN:
		case INTRLVODD:
		case PERM:
		case SUB:
		case XOR:
			fieldcount += t->type.dim;
			binfrag(t);
			break;
		case AVG:
		case DIV:
		case MAX:
		case MIN:
		case MOD:
		case MUL:
			/* Counted at three times the cost of the simple
			   binary operations above */
			fieldcount += (3 * t->type.dim);
			binfrag(t);
			break;
		case LNOT:
		case NEG:
		case NOT:
			fieldcount += t->type.dim;
			unfrag(t);
			break;
		case ALL:
		case ANY:
		case REDUCEADD:
		case REDUCEAND:
		case REDUCEOR:
		case REDUCEXOR:
			fieldcount += (t->type.dim - 1);
			redfrag(t);
			break;
		case REDUCEAVG:
		case REDUCEMAX:
		case REDUCEMIN:
		case REDUCEMUL:
			fieldcount += (3 * (t->type.dim - 1));
			redfrag(t);
			break;
		case CAST:
			/* Assume casts have no serial cost */
			castfrag(t);
			break;
		case SHIFT:	/* Inter-element shift */
			fieldcount += t->type.dim;
			shiftfrag(t);
			break;
		case ROTATE:	/* Inter-element rotate */
			fieldcount += t->type.dim;
			rotatefrag(t);
			break;
		case SIZEOF:
			++fieldcount;
			sizeoffrag(t);
			break;
		}
	}
}


void
cg_tree(tree *t)
{
	/* Driver for the backend of the SWARC compiler.

	   Resets the per-function totals, initializes the pseudo-register,
	   constant and spill pools, opens the first basic block and then
	   generates code for the function body "t".  Afterwards it emits
	   the _return label (only if a RETURN was seen) and the code that
	   flushes the enhanced registers.  When optperf is set and some
	   clocks were counted, a speedup estimate is reported, with a
	   warning if the estimate is below 1.0.
	*/
	char perf[256];
	double speedup;

	clocktotal = 0;
	fieldtotal = 0;
	returncalled = 0;

	/* Pseudo-regs for non-scheduled targets (GenericIA32) */
	pseudoregsinit();

	/* Constant pool, then spill pool */
	cpoolinit();
	spoolinit();

	/* Initialize scheduler master tuple list pointer and fieldcount */
	begin_bb();

	/* Generate code for the tree representing the function body */
	fragment(t,t);

	/* The return label is only output if something jumps to it.
	   Cleanup code for the function body may follow it. */
	if (returncalled) {
		fprintf(Cout, "\n_return:\n");
	}

	/* Output code to flush the enhanced registers */
	flush_enh_regs();

	/* Nothing more to do unless a performance estimate was requested
	   and there is something to base it on */
	if (!optperf || !(clocktotal > 0)) {
		return;
	}

	/* Calculate and announce speedup estimate */
	speedup = (fieldtotal / (0.5 * clocktotal));

	snprintf(perf,
		 sizeof(perf),
		 "Scc SWAR code for %s() should yield about %1.1fx speedup",
		 funcname,
		 speedup);
	info(1, perf);

	if (speedup < 1.0) {
		snprintf(perf,
			 sizeof(perf),
			 "Scc SWAR code for %s() might be less efficient"
			 " than serial C code",
			 funcname);
		warn(perf);
	}
}

