/*	Scc_altivec.h

	GCC interface library to the PowerPC AltiVec extensions using two
	registers (hack for reusing MMX-base scheduler for AltiVec).

	To use this library, include this header file and compile with GCC.
	You MUST have inlining enabled in order for altivec_ok() to work;
	this can be done by using -O on the GCC command line.

	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
	AND FITNESS FOR ANY PARTICULAR PURPOSE.

	2000 by R. Fisher
*/

#ifndef _ALTIVEC_H
#define _ALTIVEC_H

/*	Helper functions for the instruction macros that follow...
	(note that memory-to-register, m2r, instructions are nearly
	 as efficient as register-to-register, r2r, instructions;
	 however, memory-to-memory instructions are really simulated
	 as a convenience, and are only 1/3 as efficient)
*/
/*	alt_i2r: immediate-to-register helper.
	Emits "<op> <regd>, <regs>, %0".  op, regs and regd are pasted into
	the instruction text by stringification; imm is handed to GCC as asm
	operand %0 with the "X" constraint.  No outputs or clobbers are
	declared, so GCC is not told that regd is written.
*/
#define alt_i2r(op, imm, regs, regd) \
	__asm__ __volatile__ (#op " " #regd ", " #regs ", %0" \
			      : /* nothing */ \
			      : "X" (imm) )

/*	alt_y2r: memory-operand helper.
	Emits "<op> <regd>, %0, <regd>" where %0 is mem passed with the "X"
	constraint, so regd is both a source and the destination.
	NOTE(review): the regs parameter is accepted but never used in the
	expansion.
*/
#define alt_y2r(op, mem, regs, regd) \
	__asm__ __volatile__ (#op " " #regd ", %0, " #regd \
			      : /* nothing */ \
			      : "X" (mem) )

/*	alt_m2r: memory-to-register helper.
	With OLDALT defined it emits "<op> <reg>, %0" with mem as operand %0.
	Otherwise it is a braced block that declares a scratch register
	variable, emits "la treg, mem" to form the address of mem, and then
	emits "<op> <reg>, 0, treg".
	reg is pasted as text, so it must be a register name, not a C
	expression.
*/
#ifdef OLDALT
#define alt_m2r(op, mem, reg) \
	__asm__ __volatile__ (#op " " #reg ", %0" \
			      : /* nothing */ \
			      : "X" (mem) )
#else
#define alt_m2r(op, mem, reg) \
{ \
	register long treg; \
	__asm__ __volatile__ ("la %0, %1\n\t" \
			      #op " " #reg ", 0, %0" \
			      : "=r" (treg) \
			      : "X" (mem) ); \
}
#endif

/*	alt_r2m: register-to-memory helper.
	With OLDALT defined it emits "<op> <reg>, %0" with mem as operand %0.
	Otherwise it is a braced block that declares a scratch register
	variable, emits "la treg, mem" to form the address of mem, and then
	emits "<op> <reg>, 0, treg".
	reg is pasted as text, so it must be a register name.

	The instruction writes to mem, but mem can only be described to GCC
	as an input operand here.  The "memory" clobber tells the compiler
	that memory changed, so values of mem cached before the asm are not
	reused afterwards.
*/
#ifdef OLDALT
#define alt_r2m(op, reg, mem) \
	__asm__ __volatile__ (#op " " #reg ", %0" \
			      : /* nothing */ \
			      : "X" (mem) \
			      : "memory" )
#else
#define alt_r2m(op, reg, mem) \
{ \
	register long treg; \
	__asm__ __volatile__ ("la %0, %1\n\t" \
			      #op " " #reg ", 0, %0" \
			      : "=r" (treg) \
			      : "X" (mem) \
			      : "memory" ); \
}
#endif



/*	alt_r2x: emits "<op> <reg>, 0, <ea>".
	Both reg and ea are pasted as text, so ea must be the name of a
	register that already holds the address; no asm operands are used.
*/
#define alt_r2x(op, reg, ea) \
	__asm__ __volatile__ (#op " " #reg ", 0, " #ea)

/*	alt_rr2r: three-register helper.
	Emits "<op> <vregd>, <vrega>, <vregb>".  Note the destination is the
	LAST macro argument but the FIRST instruction operand.  All register
	names are pasted as text; no operands or clobbers are declared.
*/
#define alt_rr2r(op, vrega, vregb, vregd) \
	__asm__ __volatile__ (#op  " " #vregd ", " #vrega ", " #vregb)

/*	alt_r2r: two-register helper.
	Emits "<op> <vregd>, <vregs>" (destination first in the instruction,
	last in the macro argument list).  Register names are pasted as text.
*/
#define alt_r2r(op, vregs, vregd) \
	__asm__ __volatile__ (#op  " " #vregd ", " #vregs)




/*	1x128 Vector Move
	Copies regs into regd by OR-ing regs with itself
	(emits "vor regd, regs, regs").
*/
#define	vmr_r2r(regs, regd)	alt_rr2r(vor, regs, regs, regd)



/*	4x32, 8x16, and 16x8 Vector ADDs (word, half-word, byte)
	Each macro emits "<op> vregB, vregB, vregA", so vregB is both an
	input and the destination.
*/
#define	vaddw_r2r(vregA, vregB)		alt_rr2r(vadduwm, vregB, vregA, vregB)
#define	vaddh_r2r(vregA, vregB)		alt_rr2r(vadduhm, vregB, vregA, vregB)
#define	vaddb_r2r(vregA, vregB)		alt_rr2r(vaddubm, vregB, vregA, vregB)

/*	Unsigned-named aliases of the ADDs above
	Every macro in this group expands to the same vadduwm / vadduhm /
	vaddubm mnemonics.  vadduwm_m2r is the exception in form only: it
	goes through alt_y2r and pastes a memory operand into the
	instruction text.
*/
#define	vadduw_r2r(vregA, vregB)	alt_rr2r(vadduwm, vregB, vregA, vregB)
#define	vadduh_r2r(vregA, vregB)	alt_rr2r(vadduhm, vregB, vregA, vregB)
#define	vaddub_r2r(vregA, vregB)	alt_rr2r(vaddubm, vregB, vregA, vregB)
#define	vadduwm_r2r(vregA, vregB)	alt_rr2r(vadduwm, vregB, vregA, vregB)
#define	vadduwm_m2r(mem, vregD)		alt_y2r(vadduwm, mem, vregD, vregD)
#define	vadduhm_r2r(vregA, vregB)	alt_rr2r(vadduhm, vregB, vregA, vregB)
#define	vaddubm_r2r(vregA, vregB)	alt_rr2r(vaddubm, vregB, vregA, vregB)

/*	Signed-named aliases of the ADDs above
	These also expand to vadduwm / vadduhm / vaddubm; no separate signed
	mnemonics are used for the non-saturating adds.
*/
#define	vaddsw_r2r(vregA, vregB)	alt_rr2r(vadduwm, vregB, vregA, vregB)
#define	vaddsh_r2r(vregA, vregB)	alt_rr2r(vadduhm, vregB, vregA, vregB)
#define	vaddsb_r2r(vregA, vregB)	alt_rr2r(vaddubm, vregB, vregA, vregB)




/*	4x32 Floating-Point Vector ADDs
	(emits "vaddfp vregB, vregB, vregA"; vregB is updated in place)
*/
#define	vaddfp_r2r(vregA, vregB)	alt_rr2r(vaddfp, vregB, vregA, vregB)


/*	4x32, 8x16 and 16x8 Vector ADDs using Signed Saturation arithmetic
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vaddsws_r2r(vregA, vregB)	alt_rr2r(vaddsws, vregB, vregA, vregB)
#define	vaddshs_r2r(vregA, vregB)	alt_rr2r(vaddshs, vregB, vregA, vregB)
#define	vaddsbs_r2r(vregA, vregB)	alt_rr2r(vaddsbs, vregB, vregA, vregB)


/*	4x32, 8x16 and 16x8 Vector ADDs using Unsigned Saturation arithmetic
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vadduws_r2r(vregA, vregB)	alt_rr2r(vadduws, vregB, vregA, vregB)
#define	vadduhs_r2r(vregA, vregB)	alt_rr2r(vadduhs, vregB, vregA, vregB)
#define	vaddubs_r2r(vregA, vregB)	alt_rr2r(vaddubs, vregB, vregA, vregB)


/*	4x32 Vector Unsigned ADD Carryout
	(emits "vaddcuw vregB, vregB, vregA")
*/
#define	vaddcuw_r2r(vregA, vregB)	alt_rr2r(vaddcuw, vregB, vregA, vregB)


/*	4x32, 8x16, and 16x8 Vector SUBs (word, half-word, byte)
	Each macro emits "<op> vregB, vregB, vregA": vregB is the first
	source operand and the destination, vregA the second source.
*/
#define	vsubw_r2r(vregA, vregB)		alt_rr2r(vsubuwm, vregB, vregA, vregB)
#define	vsubh_r2r(vregA, vregB)		alt_rr2r(vsubuhm, vregB, vregA, vregB)
#define	vsubb_r2r(vregA, vregB)		alt_rr2r(vsububm, vregB, vregA, vregB)

/*	Unsigned-named aliases of the SUBs above
	(same vsubuwm / vsubuhm / vsububm mnemonics)
*/
#define	vsubuw_r2r(vregA, vregB)	alt_rr2r(vsubuwm, vregB, vregA, vregB)
#define	vsubuh_r2r(vregA, vregB)	alt_rr2r(vsubuhm, vregB, vregA, vregB)
#define	vsubub_r2r(vregA, vregB)	alt_rr2r(vsububm, vregB, vregA, vregB)

/*	Signed-named aliases of the SUBs above
	(these also expand to vsubuwm / vsubuhm / vsububm)
*/
#define	vsubsw_r2r(vregA, vregB)	alt_rr2r(vsubuwm, vregB, vregA, vregB)
#define	vsubsh_r2r(vregA, vregB)	alt_rr2r(vsubuhm, vregB, vregA, vregB)
#define	vsubsb_r2r(vregA, vregB)	alt_rr2r(vsububm, vregB, vregA, vregB)


/*	4x32 Floating-Point Vector SUBs
	(emits "vsubfp vregB, vregB, vregA"; vregB is updated in place)
*/
#define	vsubfp_r2r(vregA, vregB)	alt_rr2r(vsubfp, vregB, vregA, vregB)


/*	4x32, 8x16 and 16x8 Vector SUBs using Signed Saturation arithmetic
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vsubsws_r2r(vregA, vregB)	alt_rr2r(vsubsws, vregB, vregA, vregB)
#define	vsubshs_r2r(vregA, vregB)	alt_rr2r(vsubshs, vregB, vregA, vregB)
#define	vsubsbs_r2r(vregA, vregB)	alt_rr2r(vsubsbs, vregB, vregA, vregB)


/*	4x32, 8x16 and 16x8 Vector SUBs using Unsigned Saturation arithmetic
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vsubuws_r2r(vregA, vregB)	alt_rr2r(vsubuws, vregB, vregA, vregB)
#define	vsubuhs_r2r(vregA, vregB)	alt_rr2r(vsubuhs, vregB, vregA, vregB)
#define	vsububs_r2r(vregA, vregB)	alt_rr2r(vsububs, vregB, vregA, vregB)


/*	4x32 Vector Unsigned SUB Carryout
	(emits "vsubcuw vregB, vregB, vregA")
*/
#define	vsubcuw_r2r(vregA, vregB)	alt_rr2r(vsubcuw, vregB, vregA, vregB)


/*	8x16 and 16x8 Vector MUL Odd Unsigned
	(each emits "<op> vregB, vregB, vregA"; vregB is overwritten)
*/
#define	vmulouh_r2r(vregA, vregB)	alt_rr2r(vmulouh, vregB, vregA, vregB)
#define	vmuloub_r2r(vregA, vregB)	alt_rr2r(vmuloub, vregB, vregA, vregB)


/*	8x16 and 16x8 Vector MUL Odd Signed
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vmulosh_r2r(vregA, vregB)	alt_rr2r(vmulosh, vregB, vregA, vregB)
#define	vmulosb_r2r(vregA, vregB)	alt_rr2r(vmulosb, vregB, vregA, vregB)


/*	8x16 and 16x8 Vector MUL Even Unsigned
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vmuleuh_r2r(vregA, vregB)	alt_rr2r(vmuleuh, vregB, vregA, vregB)
#define	vmuleub_r2r(vregA, vregB)	alt_rr2r(vmuleub, vregB, vregA, vregB)


/*	8x16 and 16x8 Vector MUL Even Signed
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vmulesh_r2r(vregA, vregB)	alt_rr2r(vmulesh, vregB, vregA, vregB)
#define	vmulesb_r2r(vregA, vregB)	alt_rr2r(vmulesb, vregB, vregA, vregB)



/*	Vector Mul High and ADD Signed Half-word with Saturate
#define	vmhaddshs_r2r(vregA, vregB, vregC, vregD)	\
	alt_rr2r(vmhaddshs, vregA, vregB, vregC, vregD)
*/


/*	Vector Mul High Round and ADD Signed Half-word with Saturate
#define	vmhraddshs_r2r(vregA, vregB, vregC, vregD)	\
	alt_rr2r(vmhraddshs, vregA, vregB, vregC, vregD)
*/


/*	Vector Mul Low and ADD Half-word Modular
#define	vmladdhm_r2r(vregA, vregB, vregC, vregD)	\
	alt_rr2r(vmladduhm, vregA, vregB, vregC, vregD)
*/


/*	Vector Multiply-ADD Floating-Point
#define	vmaddfp_r2r(vregA, vregB, vregC, vregD)	\
	alt_rr2r(vmaddfp, vregA, vregB, vregC, vregD)
*/


/*	Vector Negative Multiply-SUB Floating-Point
#define	vnmsubfp_r2r(vregA, vregB, vregC, vregD)	\
	alt_rr2r(vnmsubfp, vregA, vregB, vregC, vregD)
*/


/*	Vector Multiply-Sum Unsigned Modular
#define	vmsumuhm_r2r(vregA, vregB, vregC, vregD)	\
	alt_rr2r(vmsumuhm, vregA, vregB, vregC, vregD)
#define	vmsumubm_r2r(vregA, vregB, vregC, vregD)	\
	alt_rr2r(vmsumubm, vregA, vregB, vregC, vregD)
*/


/*	Vector Multiply-Sum Signed Half-word Modular
#define	vmsumshm_r2r(vregA, vregB, vregC, vregD)	\
	alt_rr2r(vmsumshm, vregA, vregB, vregC, vregD)
*/


/*	Vector Multiply-Sum Mixed Byte Modular
#define	vmsummbm_r2r(vregA, vregB, vregC, vregD)	\
	alt_rr2r(vmsummbm, vregA, vregB, vregC, vregD)
*/


/*	Vector Multiply-Sum Signed Half-word Saturate
#define	vmsumshs_r2r(vregA, vregB, vregC, vregD)	\
	alt_rr2r(vmsumshs, vregA, vregB, vregC, vregD)
*/


/*	Vector Multiply-Sum Unsigned Half-word Saturate
#define	vmsumuhs_r2r(vregA, vregB, vregC, vregD)	\
	alt_rr2r(vmsumuhs, vregA, vregB, vregC, vregD)
*/



/*	Vector Sum Across Signed Word Saturate
	(emits "vsumsws vregB, vregB, vregA"; vregB is overwritten)
*/
#define	vsumsws_r2r(vregA, vregB)	alt_rr2r(vsumsws, vregB, vregA, vregB)


/*	Vector Sum Across Partial (1/2) Signed Word Saturate
	(emits "vsum2sws vregB, vregB, vregA")
*/
#define	vsum2sws_r2r(vregA, vregB)	alt_rr2r(vsum2sws, vregB, vregA, vregB)


/*	Vector Sum Across Partial (1/4) Unsigned Word Saturate
	(emits "vsum4ubs vregB, vregB, vregA")
*/
#define	vsum4ubs_r2r(vregA, vregB)	alt_rr2r(vsum4ubs, vregB, vregA, vregB)


/*	Vector Sum Across Partial (1/4) Signed Saturate
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vsum4sbs_r2r(vregA, vregB)	alt_rr2r(vsum4sbs, vregB, vregA, vregB)
#define	vsum4shs_r2r(vregA, vregB)	alt_rr2r(vsum4shs, vregB, vregA, vregB)




/*	4x32, 8x16, and 16x8 Unsigned Vector AVG
	(each emits "<op> vregB, vregB, vregA"; vregB is updated in place)
*/
#define	vavguw_r2r(vregA, vregB)	alt_rr2r(vavguw, vregB, vregA, vregB)
#define	vavguh_r2r(vregA, vregB)	alt_rr2r(vavguh, vregB, vregA, vregB)
#define	vavgub_r2r(vregA, vregB)	alt_rr2r(vavgub, vregB, vregA, vregB)


/*	4x32, 8x16, and 16x8 Signed Vector AVG
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vavgsw_r2r(vregA, vregB)	alt_rr2r(vavgsw, vregB, vregA, vregB)
#define	vavgsh_r2r(vregA, vregB)	alt_rr2r(vavgsh, vregB, vregA, vregB)
#define	vavgsb_r2r(vregA, vregB)	alt_rr2r(vavgsb, vregB, vregA, vregB)



/*	4x32, 8x16, and 16x8 Unsigned Vector MAX
	(each emits "<op> vregB, vregB, vregA"; vregB is updated in place)
*/
#define	vmaxuw_r2r(vregA, vregB)	alt_rr2r(vmaxuw, vregB, vregA, vregB)
#define	vmaxuh_r2r(vregA, vregB)	alt_rr2r(vmaxuh, vregB, vregA, vregB)
#define	vmaxub_r2r(vregA, vregB)	alt_rr2r(vmaxub, vregB, vregA, vregB)


/*	4x32, 8x16, and 16x8 Signed Vector MAX
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vmaxsw_r2r(vregA, vregB)	alt_rr2r(vmaxsw, vregB, vregA, vregB)
#define	vmaxsh_r2r(vregA, vregB)	alt_rr2r(vmaxsh, vregB, vregA, vregB)
#define	vmaxsb_r2r(vregA, vregB)	alt_rr2r(vmaxsb, vregB, vregA, vregB)


/*	4x32 Floating-Point Vector MAXs
	(emits "vmaxfp vregB, vregB, vregA")
*/
#define	vmaxfp_r2r(vregA, vregB)	alt_rr2r(vmaxfp, vregB, vregA, vregB)



/*	4x32, 8x16, and 16x8 Unsigned Vector MIN
	(each emits "<op> vregB, vregB, vregA"; vregB is updated in place)
*/
#define	vminuw_r2r(vregA, vregB)	alt_rr2r(vminuw, vregB, vregA, vregB)
#define	vminuh_r2r(vregA, vregB)	alt_rr2r(vminuh, vregB, vregA, vregB)
#define	vminub_r2r(vregA, vregB)	alt_rr2r(vminub, vregB, vregA, vregB)


/*	4x32, 8x16, and 16x8 Signed Vector MIN
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vminsw_r2r(vregA, vregB)	alt_rr2r(vminsw, vregB, vregA, vregB)
#define	vminsh_r2r(vregA, vregB)	alt_rr2r(vminsh, vregB, vregA, vregB)
#define	vminsb_r2r(vregA, vregB)	alt_rr2r(vminsb, vregB, vregA, vregB)


/*	4x32 Floating-Point Vector MINs
	(emits "vminfp vregB, vregB, vregA")
*/
#define	vminfp_r2r(vregA, vregB)	alt_rr2r(vminfp, vregB, vregA, vregB)



/*	4x32 Floating-Point Vector Reciprocal Estimate
	These estimate macros take one register and update it in place.
	They go through the two-operand helper alt_r2r, which emits
	"<op> vregB, vregB"; the three-source helper alt_rr2r needs four
	macro arguments and cannot be used with only three.
*/
#define	vrefp_r2r(vregB)		alt_r2r(vrefp, vregB, vregB)

/*	4x32 Floating-Point Vector Reciprocal Square Root Estimate
*/
#define	vrsqrtefp_r2r(vregB)		alt_r2r(vrsqrtefp, vregB, vregB)

/*	4x32 Floating-Point Vector Log2 Estimate
*/
#define	vlogefp_r2r(vregB)		alt_r2r(vlogefp, vregB, vregB)

/*	4x32 Floating-Point Vector 2 Raised to the Exponent Estimate
*/
#define	vexptefp_r2r(vregB)		alt_r2r(vexptefp, vregB, vregB)



/*	1x128 bitwise AND
	(emits "vand vregB, vregB, vregA"; vregB is updated in place)
*/
#define	vand_r2r(vregA, vregB)		alt_rr2r(vand, vregB, vregA, vregB)


/*	1x128 bitwise AND with Complement
	Emits "vandc vregB, vregB, vregA".
	NOTE(review): if vandc complements its second source operand (as
	the AltiVec PEM describes), this computes vregB & ~vregA, i.e. the
	complement applies to vregA, not to the destination -- confirm.
*/
#define	vandc_r2r(vregA, vregB)		alt_rr2r(vandc, vregB, vregA, vregB)


/*	1x128 bitwise AND-NOT with the source operands swapped
	Emits "vandc vregB, vregA, vregB".
	NOTE(review): under the same reading of vandc this computes
	vregA & ~vregB, i.e. the old destination value is the one that is
	complemented -- confirm.
*/
#define	vandn_r2r(vregA, vregB)		alt_rr2r(vandc, vregA, vregB, vregB)


/*	1x128 bitwise OR
	(emits "vor vregB, vregB, vregA")
*/
#define	vor_r2r(vregA, vregB)		alt_rr2r(vor, vregB, vregA, vregB)


/*	1x128 bitwise NOR
	(emits "vnor vregB, vregB, vregA")
*/
#define	vnor_r2r(vregA, vregB)		alt_rr2r(vnor, vregB, vregA, vregB)


/*	1x128 bitwise eXclusive OR
	(emits "vxor vregB, vregB, vregA")
*/
#define	vxor_r2r(vregA, vregB)		alt_rr2r(vxor, vregB, vregA, vregB)



/*	4x32, 8x16, and 16x8 Vector CoMPare for EQuality
	(resulting fields are either 0 or -1)
	Each macro emits "<op> vregB, vregB, vregA", so the result replaces
	vregB.  The vcmpeq* and vcmpequ* names are aliases for the same
	vcmpequw / vcmpequh / vcmpequb mnemonics.
*/
#define	vcmpeqw_r2r(vregA, vregB)	alt_rr2r(vcmpequw, vregB, vregA, vregB)
#define	vcmpeqh_r2r(vregA, vregB)	alt_rr2r(vcmpequh, vregB, vregA, vregB)
#define	vcmpeqb_r2r(vregA, vregB)	alt_rr2r(vcmpequb, vregB, vregA, vregB)
#define	vcmpequw_r2r(vregA, vregB)	alt_rr2r(vcmpequw, vregB, vregA, vregB)
#define	vcmpequh_r2r(vregA, vregB)	alt_rr2r(vcmpequh, vregB, vregA, vregB)
#define	vcmpequb_r2r(vregA, vregB)	alt_rr2r(vcmpequb, vregB, vregA, vregB)


/*	4x32 Floating-Point Vector CoMPare for EQuality
	(resulting fields are either 0 or -1)
	(emits "vcmpeqfp vregB, vregB, vregA")
*/
#define	vcmpeqfp_r2r(vregA, vregB)	alt_rr2r(vcmpeqfp, vregB, vregA, vregB)



/*	4x32, 8x16, and 16x8 Vector CoMPare for Greater Than Signed
	(resulting fields are either 0 or -1)
	Each emits "<op> vregB, vregB, vregA": vregB is the first compare
	operand and receives the result.
*/
#define	vcmpgtsw_r2r(vregA, vregB)	alt_rr2r(vcmpgtsw, vregB, vregA, vregB)
#define	vcmpgtsh_r2r(vregA, vregB)	alt_rr2r(vcmpgtsh, vregB, vregA, vregB)
#define	vcmpgtsb_r2r(vregA, vregB)	alt_rr2r(vcmpgtsb, vregB, vregA, vregB)


/*	4x32, 8x16, and 16x8 Vector CoMPare for Greater Than Unsigned
	(resulting fields are either 0 or -1)
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vcmpgtuw_r2r(vregA, vregB)	alt_rr2r(vcmpgtuw, vregB, vregA, vregB)
#define	vcmpgtuh_r2r(vregA, vregB)	alt_rr2r(vcmpgtuh, vregB, vregA, vregB)
#define	vcmpgtub_r2r(vregA, vregB)	alt_rr2r(vcmpgtub, vregB, vregA, vregB)


/*	4x32 Floating-Point Vector CoMPare for Greater Than
	(resulting fields are either 0 or -1)
	(emits "vcmpgtfp vregB, vregB, vregA")
*/
#define	vcmpgtfp_r2r(vregA, vregB)	alt_rr2r(vcmpgtfp, vregB, vregA, vregB)


/*	4x32 Floating-Point Vector CoMPare for Greater Than or Equal
	(resulting fields are either 0 or -1)
	The macro keeps its historical name, but the instruction mnemonic
	is vcmpgefp (there is no "vcmpgeqfp" instruction).
	Emits "vcmpgefp vregB, vregB, vregA".
*/
#define	vcmpgeqfp_r2r(vregA, vregB)	alt_rr2r(vcmpgefp, vregB, vregA, vregB)


/*	4x32 Floating-Point Vector CoMPare Bounds
	(resulting fields are either 0 or -1)
	Emits "vcmpbfp vregB, vregB, vregA".
	NOTE(review): the "0 or -1" remark matches the other compare
	comments in this file; confirm that it also holds for the bounds
	compare.
*/
#define	vcmpbfp_r2r(vregA, vregB)	alt_rr2r(vcmpbfp, vregB, vregA, vregB)



/*	4x32, 8x16, and 16x8 Vector Shift Left Logical
	Each macro emits "<op> vregB, vregB, vregA": vregB holds the data
	and receives the result, vregA is the second source operand.
*/
#define	vslw_r2r(vregA, vregB)		alt_rr2r(vslw, vregB, vregA, vregB)
#define	vslh_r2r(vregA, vregB)		alt_rr2r(vslh, vregB, vregA, vregB)
#define	vslb_r2r(vregA, vregB)		alt_rr2r(vslb, vregB, vregA, vregB)


/*	4x32, 8x16, and 16x8 Vector Shift Right Logical
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vsrw_r2r(vregA, vregB)		alt_rr2r(vsrw, vregB, vregA, vregB)
#define	vsrh_r2r(vregA, vregB)		alt_rr2r(vsrh, vregB, vregA, vregB)
#define	vsrb_r2r(vregA, vregB)		alt_rr2r(vsrb, vregB, vregA, vregB)


/*	4x32, 8x16, and 16x8 Vector Shift Right Arithmetic
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vsraw_r2r(vregA, vregB)		alt_rr2r(vsraw, vregB, vregA, vregB)
#define	vsrah_r2r(vregA, vregB)		alt_rr2r(vsrah, vregB, vregA, vregB)
#define	vsrab_r2r(vregA, vregB)		alt_rr2r(vsrab, vregB, vregA, vregB)



/*	4x32, 8x16, and 16x8 Vector Rotate Left Logical
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vrlw_r2r(vregA, vregB)		alt_rr2r(vrlw, vregB, vregA, vregB)
#define	vrlh_r2r(vregA, vregB)		alt_rr2r(vrlh, vregB, vregA, vregB)
#define	vrlb_r2r(vregA, vregB)		alt_rr2r(vrlb, vregB, vregA, vregB)



/*	Vector Round to Floating-Point Integer:
	Nearest, toward Zero, toward Positive Infinity, toward Minus Infinity
	Each macro rounds vregB in place.  They use the two-operand helper
	alt_r2r, which emits "<op> vregB, vregB"; alt_rr2r requires four
	macro arguments and cannot be called with three.
*/
#define	vrfin_r2r(vregB)		alt_r2r(vrfin, vregB, vregB)
#define	vrfiz_r2r(vregB)		alt_r2r(vrfiz, vregB, vregB)
#define	vrfip_r2r(vregB)		alt_r2r(vrfip, vregB, vregB)
#define	vrfim_r2r(vregB)		alt_r2r(vrfim, vregB, vregB)


/*	Vector Convert between Fixed-Point Integer and Floating-Point
	vcfux / vcfsx: convert from unsigned / signed fixed-point.
	vctuxs / vctsxs: going by the mnemonics these are presumably the
	convert-TO-fixed-point (unsigned / signed, saturating) forms --
	confirm against the AltiVec PEM.
	Each macro emits "<op> vregB, vregB, 0": the conversion is done in
	place and a literal 0 is pasted as the third operand.
*/
#define	vcfux_r2r(vregB)		alt_rr2r(vcfux, vregB, 0, vregB)
#define	vcfsx_r2r(vregB)		alt_rr2r(vcfsx, vregB, 0, vregB)
#define	vctuxs_r2r(vregB)		alt_rr2r(vctuxs, vregB, 0, vregB)
#define	vctsxs_r2r(vregB)		alt_rr2r(vctsxs, vregB, 0, vregB)


/*	Load Vector Element Indexed by address (rA + rB) into vregD
	(see page 4-26 of AltiVec PEM).
	All of these go through alt_m2r, which forms the address of mem in
	a scratch register and then issues the load.
	The quad-word (lveqx) and double-word (lvedx) names have no
	instruction of their own and both map to the full-vector load lvx,
	mirroring stveqx_r2m / stvedx_r2m, which map to stvx.
*/
#define	lvx_m2r(mem, vregD)		alt_m2r(lvx, mem, vregD)
#define	lveqx_m2r(mem, vregD)		alt_m2r(lvx, mem, vregD)
#define	lvedx_m2r(mem, vregD)		alt_m2r(lvx, mem, vregD)
#define	lvewx_m2r(mem, vregD)		alt_m2r(lvewx, mem, vregD)
#define	lvehx_m2r(mem, vregD)		alt_m2r(lvehx, mem, vregD)
#define	lvebx_m2r(mem, vregD)		alt_m2r(lvebx, mem, vregD)

/*	Load Vector for Shift Left / Shift Right, using the address of mem
*/
#define lvsl_m2r(mem, vregD)		alt_m2r(lvsl, mem, vregD)
#define lvsr_m2r(mem, vregD)		alt_m2r(lvsr, mem, vregD)

/*	Load indexed via general registers.
	The fragment below is compiled only if NOTDEFD is defined.  It is
	not a macro definition: it is a bare statement sequence that refers
	to names (storeop, off, knownlocation, rega, regb, s, vreg) that are
	not declared in this header.  It appears to be kept as a sketch of
	the loadrr_m2r sequence defined further down.
*/
#ifdef NOTDEFD
                storeop(off, knownlocation, 0);
                __asm__ __volatile__ ("ld %0, %1" \
                                      : "=r" (regb) \
                                      : "X" (knownlocation) );
                __asm__ __volatile__ ("li %0, %1" \
                                      : "=r" (rega) \
                                      : "X" (s) );
                __asm__ __volatile__ ("lvx " vreg ", %0, %1" \
                                      : /* nothing */ \
                                      : "r" (rega), "r" (regb) );
#endif


#define USETHIS
#ifdef USETHIS
	/* loadrr_m2r(s, frag, off, vreg): indexed vector load.		*/
	/*   s    - C object whose address is the base of the load		*/
	/*   frag - vector register whose first 32-bit word is the index	*/
	/*   off  - constant pasted into an addi, added to that index	*/
	/*   vreg - destination vector register name				*/
	/* frag is spilled to a static p128_t so its first word can be	*/
	/* read back with lwz; the load is "lvx vreg, regb, rega".		*/
	/* regb is written before operand %3 is read and is used as the	*/
	/* rA field of addi and lvx (where r0 would be read as zero), so	*/
	/* it is an early-clobber base register ("=&b").  Both asm		*/
	/* statements touch memory GCC cannot see through the operands,	*/
	/* hence the "memory" clobbers.					*/
	#define loadrr_m2r(s, frag, off, vreg)				\
	{								\
		static p128_t knownlocation;				\
		unsigned long rega;					\
		unsigned long regb;					\
		register long treg;					\
									\
		/* Store frag into knownlocation */			\
		__asm__ __volatile__ (					\
			"la %0, %1\n\t"					\
			"stvx " #frag ", 0, %0"				\
			: "=r" (treg)					\
			: "X" (knownlocation)				\
			: "memory"					\
		);							\
		/* Load frag into regb, add offset,			\
		   and load address of s into rega */			\
		__asm__ __volatile__ (					\
			"lwz %0, %2\n\t"				\
			"addi %0, %0, " #off "\n\t"			\
			"la %1, %3\n\t" 				\
			"lvx " #vreg ", %0, %1"				\
			: "=&b" (regb), "=r" (rega) 			\
			: "X" (knownlocation), "X" (s)			\
			: "memory"					\
		);							\
	}

#else

#undef DEBUG_LOADRR
#ifdef DEBUG_LOADRR
	/* Debug variant of loadrr_m2r: the same store / lwz / addi / la /	*/
	/* lvx sequence, split into separate asm statements with printf	*/
	/* tracing between them, followed by a dump of the loaded vector.	*/
	/* Only reachable when USETHIS is not defined and DEBUG_LOADRR is.	*/
	/* NOTE(review): rega and regb are printed before anything has	*/
	/* assigned them; "%p" is used with an unsigned long argument; and	*/
	/* knownlocation is an unsigned long although stvx is used to	*/
	/* store frag into it -- confirm sizes before re-enabling.		*/
	#define loadrr_m2r(s, frag, off, vreg)				\
	{								\
		unsigned long knownlocation;				\
		unsigned long rega;					\
		unsigned long regb;					\
		register long treg;					\
		static p128_t output;					\
									\
		/* Store frag into knownlocation */			\
		__asm__ __volatile__ (					\
			"la %0, %1\n\t"					\
			"stvx " #frag ", 0, %0"				\
			: "=r" (treg)					\
			: "X" (knownlocation)				\
		);							\
	printf("frag stored in knownlocation = %08lx\n", knownlocation); \
	printf("rega=%08lx, regb=%08lx\n", rega, regb); \
		/* Load frag into regb, add offset,			\
		   and load address of s into rega */			\
		__asm__ __volatile__ (					\
			"lwz %0, %1"					\
			: "=r" (regb)		 			\
			: "X" (knownlocation)				\
		);							\
	printf("regb=%08lx\n", regb); \
	printf("knownlocation=%08lx\n", knownlocation); \
		__asm__ __volatile__ (					\
			"addi %0, %1, " #off				\
			: "=r" (regb)					\
			: "0" (regb)					\
		);							\
	printf("rega=%08lx\n", rega); \
	printf("regb=%08lx\n", regb); \
		__asm__ __volatile__ (					\
			"la %0, %1"	 				\
			: "=r" (rega) 					\
			: "X" (s)					\
		);							\
	printf("rega=%p\n", rega); \
	printf("&s=%p\n", &s); \
		__asm__ __volatile__ (					\
			"lvx " #vreg ", %0, %1"				\
			:						\
			: "r" (regb), "r" (rega) 			\
		);							\
	printf("rega=%08lx, regb=%08lx\n", rega, regb); \
	printf("knownlocation=%08lx, s=%p\n", knownlocation, s); \
	stvx_r2m(vreg, output); \
	printf("output=%016llx\n", output.uq[0]); \
	printf("output=%016llx\n", output.uq[1]); \
	}
#else
#ifdef TESTING
	/* TESTING variant of loadrr_m2r: the indexed load is issued as six	*/
	/* separate asm statements (la, stvx, lwz, addi, la, lvx), with the	*/
	/* intermediate values carried between them in the C variables	*/
	/* treg, regb and rega.  Only reachable when USETHIS is not defined	*/
	/* and TESTING is.							*/
	/* NOTE(review): knownlocation is an unsigned long here although	*/
	/* stvx is used to store frag into it -- confirm sizes before	*/
	/* enabling.								*/
	#define loadrr_m2r(s, frag, off, vreg)				\
	{								\
		unsigned long knownlocation;				\
		unsigned long rega;					\
		unsigned long regb;					\
		register long treg;					\
									\
		/* Store frag into knownlocation */			\
		__asm__ __volatile__ (					\
			"la %0, %1"					\
			: "=r" (treg)					\
			: "X" (knownlocation)				\
		);							\
		__asm__ __volatile__ (					\
			"stvx " #frag ", 0, %0"				\
			:						\
			: "r" (treg)					\
		);							\
		/* Load frag into regb, add offset,			\
		   and load address of s into rega */			\
		__asm__ __volatile__ (					\
			"lwz %0, %1"					\
			: "=r" (regb)		 			\
			: "X" (knownlocation)				\
		);							\
		__asm__ __volatile__ (					\
			"addi %0, %1, " #off				\
			: "=r" (regb)					\
			: "0" (regb)					\
		);							\
		__asm__ __volatile__ (					\
			"la %0, %1"	 				\
			: "=r" (rega) 					\
			: "X" (s)					\
		);							\
		__asm__ __volatile__ (					\
			"lvx " #vreg ", %0, %1"				\
			:						\
			: "r" (regb), "r" (rega) 			\
		);							\
	}

#else
	/* Fallback variant of loadrr_m2r, used only when none of USETHIS,	*/
	/* DEBUG_LOADRR or TESTING is defined.  It fuses la+stvx and	*/
	/* la+lvx into single asm statements; treg and rega are passed	*/
	/* both as outputs and, through the matching constraint "0", as	*/
	/* inputs so that the second instruction can name the same		*/
	/* register as %2 / %3.						*/
	/* NOTE(review): treg and rega are read as inputs before they are	*/
	/* assigned, and knownlocation is an unsigned long although stvx	*/
	/* is used to store frag into it -- confirm before enabling.	*/
	#define loadrr_m2r(s, frag, off, vreg)				\
	{								\
		unsigned long knownlocation;				\
		register unsigned long rega;				\
		register unsigned long regb;				\
		register unsigned long treg;				\
									\
		/* Store frag into knownlocation */			\
		__asm__ __volatile__ (					\
			"la %0, %1\n\t"					\
			"stvx " #frag ", 0, %2"				\
			: "=r" (treg)					\
			: "X" (knownlocation), "0" (treg)		\
			: "memory"					\
		);							\
		__asm__ __volatile__ (					\
			"lwz %0, %1"					\
			: "=r" (regb)		 			\
			: "X" (knownlocation)				\
		);							\
		__asm__ __volatile__ (					\
			"addi %0, %1, " #off				\
			: "=r" (regb)					\
			: "0" (regb)					\
		);							\
		__asm__ __volatile__ (					\
			"la %0, %1\n\t"	 				\
			"lvx " #vreg ", %2, %3"				\
			: "=r" (rega) 					\
			: "X" (s), "r" (regb), "0" (rega)		\
		);							\
	}


#endif
#endif

#endif



/*	Load Vector Element Indexed by address (rA + rB) into vregD with
	LRU hint (see page 4-26 of AltiVec PEM).
#define	lvqxl_m2r(regA, regB, vregD)	alt_m2r(lvxl, regA, regB, vregD)
#define	lvxl_m2r(regA, regB, vregD)	alt_m2r(lvxl, regA, regB, vregD)
*/


/*	Load Vector for Shift Left (see page 4-27 of AltiVec PEM).
#define	lvsl_m2r(regA, regB, vregD)	alt_m2r(lvsl, regA, regB, vregD)
*/


/*	Load Vector for Shift Right (see page 4-27 of AltiVec PEM).
#define	lvsr_m2r(regA, regB, vregD)	alt_m2r(lvsr, regA, regB, vregD)
*/



/*	Store Vector Element to memory Indexed by address (rA + rB)
	(see page 4-28 of AltiVec PEM).
	These go through alt_r2m, which forms the address of mem in a
	scratch register and then issues the store.  The quad-word (stveqx)
	and double-word (stvedx) names both map to the full-vector store
	stvx.
*/
#define	stvx_r2m(vregS, mem)		alt_r2m(stvx, vregS, mem)
#define	stveqx_r2m(vregS, mem)		alt_r2m(stvx, vregS, mem)
#define	stvedx_r2m(vregS, mem)		alt_r2m(stvx, vregS, mem)
#define	stvewx_r2m(vregS, mem)		alt_r2m(stvewx, vregS, mem)
#define	stvehx_r2m(vregS, mem)		alt_r2m(stvehx, vregS, mem)
#define	stvebx_r2m(vregS, mem)		alt_r2m(stvebx, vregS, mem)

/*	Store through an address already held in a register: alt_r2x pastes
	mem into the instruction text ("stvx vregS, 0, mem"), so mem must be
	a register name here, not a C object.
*/
#define	stvx_r2x(vregS, mem)		alt_r2x(stvx, vregS, mem)

/*	Store indexed via general registers.
	storerr_r2m(vreg, s, frag, off):
	  vreg - vector register whose element is stored (stvewx)
	  s    - C object whose address is the base of the store
	  frag - vector register whose first 32-bit word is the index
	  off  - constant pasted into an addi, added to that index
	frag is spilled to a static p128_t so its first word can be read
	back with lwz; the store is "stvewx vreg, regb, rega".
	regb is written before operand %3 is read and is used as the rA
	field of addi and stvewx (where r0 would be read as zero), so it is
	an early-clobber base register ("=&b").  Both asm statements write
	memory that GCC cannot see through the operands, hence the "memory"
	clobbers.
*/
#define storerr_r2m(vreg, s, frag, off)			\
{							\
	static p128_t knownlocation;			\
	unsigned long rega;				\
	unsigned long regb;				\
	register long treg;				\
							\
	/* Store frag into knownlocation */		\
	__asm__ __volatile__ (				\
		"la %0, %1\n\t"				\
		"stvx " #frag ", 0, %0"			\
		: "=r" (treg)				\
		: "X" (knownlocation)			\
		: "memory"				\
	);						\
	/* Load frag into regb, add offset,		\
	   and load address of s into rega */		\
	__asm__ __volatile__ (				\
		"lwz %0, %2\n\t"			\
		"addi %0, %0, " #off "\n\t"		\
		"la %1, %3\n\t" 			\
		"stvewx " #vreg ", %0, %1"		\
		  : "=&b" (regb), "=r" (rega) 		\
		  : "X" (knownlocation), "X" (s)	\
		  : "memory"				\
	);						\
}



/*	Store Vector Element to memory Indexed by address (rA + rB) with
	LRU hint (see page 4-28 of AltiVec PEM).
#define	stvqxl_r2m(vregS, regA, regB)	alt_r2m(stvxl, vregS, regA, regB)
#define	stvxl_r2m(vregS, regA, regB)	alt_r2m(stvxl, vregS, regA, regB)
*/




/*	OLD COMMENT: 2x32->4x16 and 4x16->8x8 PACK Unsigned
	(packs source and dest fields into dest in that order)
	(see page 4-29 of AltiVec PEM).
	Each pack macro emits "<op> vregB, vregA, vregB": unlike the
	arithmetic macros, vregA is the FIRST source operand here and
	vregB is the second source and the destination.
*/
#define	vpkuwum_r2r(vregA, vregB)	alt_rr2r(vpkuwum, vregA, vregB, vregB)
#define	vpkuhum_r2r(vregA, vregB)	alt_rr2r(vpkuhum, vregA, vregB, vregB)


/*	OLD COMMENT: 2x32->4x16 and 4x16->8x8 PACK and Signed Saturate
	(packs source and dest fields into dest in that order)
	(see page 4-30 of AltiVec PEM).
	(each emits "<op> vregB, vregA, vregB")
*/
#define	vpkswss_r2r(vregA, vregB)	alt_rr2r(vpkswss, vregA, vregB, vregB)
#define	vpkshss_r2r(vregA, vregB)	alt_rr2r(vpkshss, vregA, vregB, vregB)


/*	OLD COMMENT: 4x16->8x8 PACK and Unsigned Saturate
	(packs source and dest fields into dest in that order)
	(see page 4-29 of AltiVec PEM).
	(each emits "<op> vregB, vregA, vregB")
*/
#define	vpkuwus_r2r(vregA, vregB)	alt_rr2r(vpkuwus, vregA, vregB, vregB)
#define	vpkuhus_r2r(vregA, vregB)	alt_rr2r(vpkuhus, vregA, vregB, vregB)


/*	OLD COMMENT: 2x32s->4x16 and 4x16s->8x8 PACK and Unsigned Saturate
	(packs source and dest fields into dest in that order)
	(see page 4-29 of AltiVec PEM).
	(each emits "<op> vregB, vregA, vregB")
*/
#define	vpkswus_r2r(vregA, vregB)	alt_rr2r(vpkswus, vregA, vregB, vregB)
#define	vpkshus_r2r(vregA, vregB)	alt_rr2r(vpkshus, vregA, vregB, vregB)


/*	OLD COMMENT: PACK Pixel
	(packs source and dest fields into dest in that order)
	(see page 4-30 of AltiVec PEM).
#define	vpkpx_r2r(vregA, vregB)		alt_rr2r(vpkpx, vregA, vregB, vregB)
*/



/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low
	(interleaves low half of dest with low half of source
	 as padding in each result field)
	Each macro emits the two-operand form "<op> vregD, vregS".
	NOTE(review): the field sizes and the "interleaves" wording above
	look inherited from a 64-bit register description; only signed
	half-word and byte forms are defined here -- confirm against the
	AltiVec PEM.
*/
#define	vupklsh_r2r(vregS, vregD)	alt_r2r(vupklsh, vregS, vregD)
#define	vupklsb_r2r(vregS, vregD)	alt_r2r(vupklsb, vregS, vregD)


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High
	(interleaves high half of dest with high half of source
	 as padding in each result field)
	Each macro emits "<op> vregD, vregS"; the same review note as for
	UNPaCK Low applies.
*/
#define	vupkhsh_r2r(vregS, vregD)	alt_r2r(vupkhsh, vregS, vregD)
#define	vupkhsb_r2r(vregS, vregD)	alt_r2r(vupkhsb, vregS, vregD)


/*	OLD COMMENT: UnPacK High PiXel
	(see page 4-31 of AltiVec PEM).
#define	vupkhpx_r2r(vregB, vregD)	alt_rr2r(vupkhpx, vregB, vregD)
#define	vupkhpx_r2r(vregB, vregD)	alt_rr2r(vupkhpx, vregB, vregD)
*/


/*	OLD COMMENT: UnPacK Low PiXel
	(see page 4-31 of AltiVec PEM).
#define	vupklpx_r2r(vregB, vregD)	alt_rr2r(vupklpx, vregB, vregD)
#define	vupklpx_r2r(vregB, vregD)	alt_rr2r(vupklpx, vregB, vregD)
*/



/*	Vector MeRGe High
	The order of the operands is reversed here to match the INTRLVHIGH
	pseudo-op definition.  It would be better to change that definition,
	but I'm not going to take the time to do that now.
	Concretely, each macro emits "<op> vregB, vregA, vregB": vregA is
	the first source, vregB the second source and the destination.
*/
#define vmrghw_r2r(vregA, vregB)	alt_rr2r(vmrghw, vregA, vregB, vregB)
#define vmrghh_r2r(vregA, vregB)	alt_rr2r(vmrghh, vregA, vregB, vregB)
#define vmrghb_r2r(vregA, vregB)	alt_rr2r(vmrghb, vregA, vregB, vregB)


/*	Vector MeRGe Low
	The order of the operands is reversed here to match the INTRLVLOW
	pseudo-op definition.  It would be better to change that definition,
	but I'm not going to take the time to do that now.
	Concretely, each macro emits "<op> vregB, vregA, vregB".
*/
#define vmrglw_r2r(vregA, vregB)	alt_rr2r(vmrglw, vregA, vregB, vregB)
#define vmrglh_r2r(vregA, vregB)	alt_rr2r(vmrglh, vregA, vregB, vregB)
#define vmrglb_r2r(vregA, vregB)	alt_rr2r(vmrglb, vregA, vregB, vregB)



/*	Vector SPLaT indexed element
	Each macro emits "<op> vregB, vregB, UIMM" through alt_i2r: vregB
	is both the source and the destination, and UIMM is passed as an
	asm operand.
*/
#define vspltw_i2r(UIMM, vregB)		alt_i2r(vspltw, UIMM, vregB, vregB)
#define vsplth_i2r(UIMM, vregB)		alt_i2r(vsplth, UIMM, vregB, vregB)
#define vspltb_i2r(UIMM, vregB)		alt_i2r(vspltb, UIMM, vregB, vregB)


/*	Vector SPLaT Signed Immediate
	Each macro emits "<op> vregD, SIMM".  These instructions take only a
	destination register and an immediate, so they cannot go through
	alt_i2r (which needs a source register as well); the asm is written
	out directly instead.  vregD is pasted as text; SIMM is passed as
	asm operand %0 with the "i" constraint, so it must be a
	compile-time constant.
*/
#define vspltisw_r2r(SIMM, vregD) \
	__asm__ __volatile__ ("vspltisw " #vregD ", %0" \
			      : /* nothing */ \
			      : "i" (SIMM) )
#define vspltish_r2r(SIMM, vregD) \
	__asm__ __volatile__ ("vspltish " #vregD ", %0" \
			      : /* nothing */ \
			      : "i" (SIMM) )
#define vspltisb_r2r(SIMM, vregD) \
	__asm__ __volatile__ ("vspltisb " #vregD ", %0" \
			      : /* nothing */ \
			      : "i" (SIMM) )



/*	Vector PERMute
	Emits "vperm vregD, vregA, vregB, vregC".  All four register names
	are pasted as text; no operands or clobbers are declared.
*/
#define vperm_r2r(vregA, vregB, vregC, vregD)	\
    __asm__ __volatile__ ("vperm " #vregD ", " #vregA ", " #vregB ", " #vregC)



/*	Vector SELect
	Emits "vsel vregD, vregA, vregB, vregC".  All four register names
	are pasted as text; no operands or clobbers are declared.
*/
#define vsel_r2r(vregA, vregB, vregC, vregD)	\
    __asm__ __volatile__ ("vsel " #vregD ", " #vregA ", " #vregB ", " #vregC)


/*	Vector Shift Left/Right (by reg, up to 7 bits)
	(each emits "<op> vregB, vregB, vregA"; vregB is shifted in place)
*/
#define	vsl_r2r(vregA, vregB)		alt_rr2r(vsl, vregB, vregA, vregB)
#define	vsr_r2r(vregA, vregB)		alt_rr2r(vsr, vregB, vregA, vregB)

/*	Vector Shift Left/Right (by reg, up to 15 bytes)
	(each emits "<op> vregB, vregB, vregA")
*/
#define	vslo_r2r(vregA, vregB)		alt_rr2r(vslo, vregB, vregA, vregB)
#define	vsro_r2r(vregA, vregB)		alt_rr2r(vsro, vregB, vregA, vregB)

/* Left off here with Vector Shifts p4-34 */



/*	derived functions...
*/

/*	vdivub_r2r: simulated 16-lane unsigned divide, regd /= regs.
	Both registers are spilled to static p128_t buffers, each of the 16
	ub[] lanes is divided in C, and the result is reloaded into regd.
	A lane whose divisor is zero (tested through the b[] view) is left
	unchanged.
*/
#define	vdivub_r2r(regs, regd) \
	{ \
		static p128_t _dividend; \
		static p128_t _divisor; \
		int _i; \
		stvx_r2m(regd, _dividend); \
		stvx_r2m(regs, _divisor); \
		for (_i = 0; _i < 16; ++_i) { \
			if (_divisor.b[_i]) \
				_dividend.ub[_i] /= _divisor.ub[_i]; \
		} \
		lvx_m2r(_dividend, regd); \
	}

/*	vdivuh_r2r: simulated 8-lane unsigned divide, regd /= regs.
	Both registers are spilled to static p128_t buffers, each of the 8
	uw[] lanes is divided in C, and the result is reloaded into regd.
	A lane whose divisor is zero (tested through the w[] view) is left
	unchanged.
*/
#define	vdivuh_r2r(regs, regd) \
	{ \
		static p128_t _dividend; \
		static p128_t _divisor; \
		int _i; \
		stvx_r2m(regd, _dividend); \
		stvx_r2m(regs, _divisor); \
		for (_i = 0; _i < 8; ++_i) { \
			if (_divisor.w[_i]) \
				_dividend.uw[_i] /= _divisor.uw[_i]; \
		} \
		lvx_m2r(_dividend, regd); \
	}

/*	vdivuw_r2r: simulated 4-lane unsigned divide, regd /= regs.
	Both registers are spilled to static p128_t buffers, each of the 4
	ud[] lanes is divided in C, and the result is reloaded into regd.
	A lane whose divisor is zero (tested through the d[] view) is left
	unchanged.
*/
#define	vdivuw_r2r(regs, regd) \
	{ \
		static p128_t _dividend; \
		static p128_t _divisor; \
		int _i; \
		stvx_r2m(regd, _dividend); \
		stvx_r2m(regs, _divisor); \
		for (_i = 0; _i < 4; ++_i) { \
			if (_divisor.d[_i]) \
				_dividend.ud[_i] /= _divisor.ud[_i]; \
		} \
		lvx_m2r(_dividend, regd); \
	}

/*	vdivsb_r2r: simulated 16-lane signed divide, regd /= regs.
	Both registers are spilled to static p128_t buffers, each of the 16
	b[] lanes is divided in C, and the result is reloaded into regd.
	A lane whose divisor is zero is left unchanged.
*/
#define	vdivsb_r2r(regs, regd) \
	{ \
		static p128_t _dividend; \
		static p128_t _divisor; \
		int _i; \
		stvx_r2m(regd, _dividend); \
		stvx_r2m(regs, _divisor); \
		for (_i = 0; _i < 16; ++_i) { \
			if (_divisor.b[_i]) \
				_dividend.b[_i] /= _divisor.b[_i]; \
		} \
		lvx_m2r(_dividend, regd); \
	}

/*	vdivsh_r2r: simulated 8-lane signed divide, regd /= regs.
	Both registers are spilled to static p128_t buffers, each of the 8
	w[] lanes is divided in C, and the result is reloaded into regd.
	A lane whose divisor is zero is left unchanged.
*/
#define	vdivsh_r2r(regs, regd) \
	{ \
		static p128_t _dividend; \
		static p128_t _divisor; \
		int _i; \
		stvx_r2m(regd, _dividend); \
		stvx_r2m(regs, _divisor); \
		for (_i = 0; _i < 8; ++_i) { \
			if (_divisor.w[_i]) \
				_dividend.w[_i] /= _divisor.w[_i]; \
		} \
		lvx_m2r(_dividend, regd); \
	}

/*	vdivsw_r2r: simulated 4-lane signed divide, regd /= regs.
	Both registers are spilled to static p128_t buffers, each of the 4
	d[] lanes is divided in C, and the result is reloaded into regd.
	A lane whose divisor is zero is left unchanged.
*/
#define	vdivsw_r2r(regs, regd) \
	{ \
		static p128_t _dividend; \
		static p128_t _divisor; \
		int _i; \
		stvx_r2m(regd, _dividend); \
		stvx_r2m(regs, _divisor); \
		for (_i = 0; _i < 4; ++_i) { \
			if (_divisor.d[_i]) \
				_dividend.d[_i] /= _divisor.d[_i]; \
		} \
		lvx_m2r(_dividend, regd); \
	}



/*	vmodub_r2r: simulated 16-lane unsigned remainder, regd %= regs.
	Both registers are spilled to static p128_t buffers, the remainder
	of each of the 16 ub[] lanes is taken in C, and the result is
	reloaded into regd.  A lane whose divisor is zero (tested through
	the b[] view) is left unchanged.
*/
#define	vmodub_r2r(regs, regd) \
	{ \
		static p128_t _dividend; \
		static p128_t _divisor; \
		int _i; \
		stvx_r2m(regd, _dividend); \
		stvx_r2m(regs, _divisor); \
		for (_i = 0; _i < 16; ++_i) { \
			if (_divisor.b[_i]) \
				_dividend.ub[_i] %= _divisor.ub[_i]; \
		} \
		lvx_m2r(_dividend, regd); \
	}

/*	vmoduh_r2r: simulated 8-lane unsigned remainder, regd %= regs.
	Both registers are spilled to static p128_t buffers, the remainder
	of each of the 8 uw[] lanes is taken in C, and the result is
	reloaded into regd.  A lane whose divisor is zero (tested through
	the w[] view) is left unchanged.
*/
#define	vmoduh_r2r(regs, regd) \
	{ \
		static p128_t _dividend; \
		static p128_t _divisor; \
		int _i; \
		stvx_r2m(regd, _dividend); \
		stvx_r2m(regs, _divisor); \
		for (_i = 0; _i < 8; ++_i) { \
			if (_divisor.w[_i]) \
				_dividend.uw[_i] %= _divisor.uw[_i]; \
		} \
		lvx_m2r(_dividend, regd); \
	}

/*	vmoduw_r2r: simulated 4-lane unsigned remainder, regd %= regs.
	Both registers are spilled to static p128_t buffers, the remainder
	of each of the 4 ud[] lanes is taken in C, and the result is
	reloaded into regd.  A lane whose divisor is zero (tested through
	the d[] view) is left unchanged.
*/
#define	vmoduw_r2r(regs, regd) \
	{ \
		static p128_t _dividend; \
		static p128_t _divisor; \
		int _i; \
		stvx_r2m(regd, _dividend); \
		stvx_r2m(regs, _divisor); \
		for (_i = 0; _i < 4; ++_i) { \
			if (_divisor.d[_i]) \
				_dividend.ud[_i] %= _divisor.ud[_i]; \
		} \
		lvx_m2r(_dividend, regd); \
	}

/*	vmodsb_r2r: simulated 16-lane signed remainder, regd %= regs.
	Both registers are spilled to static p128_t buffers, the remainder
	of each of the 16 b[] lanes is taken in C, and the result is
	reloaded into regd.  A lane whose divisor is zero is left unchanged.
*/
#define	vmodsb_r2r(regs, regd) \
	{ \
		static p128_t _dividend; \
		static p128_t _divisor; \
		int _i; \
		stvx_r2m(regd, _dividend); \
		stvx_r2m(regs, _divisor); \
		for (_i = 0; _i < 16; ++_i) { \
			if (_divisor.b[_i]) \
				_dividend.b[_i] %= _divisor.b[_i]; \
		} \
		lvx_m2r(_dividend, regd); \
	}

/*	vmodsh_r2r: simulated 8-lane signed remainder, regd %= regs.
	Both registers are spilled to static p128_t buffers, the remainder
	of each of the 8 w[] lanes is taken in C, and the result is
	reloaded into regd.  A lane whose divisor is zero is left unchanged.
*/
#define	vmodsh_r2r(regs, regd) \
	{ \
		static p128_t _dividend; \
		static p128_t _divisor; \
		int _i; \
		stvx_r2m(regd, _dividend); \
		stvx_r2m(regs, _divisor); \
		for (_i = 0; _i < 8; ++_i) { \
			if (_divisor.w[_i]) \
				_dividend.w[_i] %= _divisor.w[_i]; \
		} \
		lvx_m2r(_dividend, regd); \
	}

/*	vmodsw_r2r: simulated 4-lane signed remainder, regd %= regs.
	Both registers are spilled to static p128_t buffers, the remainder
	of each of the 4 d[] lanes is taken in C, and the result is
	reloaded into regd.  A lane whose divisor is zero is left unchanged.
*/
#define	vmodsw_r2r(regs, regd) \
	{ \
		static p128_t _dividend; \
		static p128_t _divisor; \
		int _i; \
		stvx_r2m(regd, _dividend); \
		stvx_r2m(regs, _divisor); \
		for (_i = 0; _i < 4; ++_i) { \
			if (_divisor.d[_i]) \
				_dividend.d[_i] %= _divisor.d[_i]; \
		} \
		lvx_m2r(_dividend, regd); \
	}
#endif

