/*	ia32.h

	MultiMedia eXtensions GCC interface library for IA32.

	To use this library, simply include this header file
	and compile with GCC.  You MUST have inlining enabled
	in order for mmx_ok() to work; this can be done by
	simply using -O on the GCC command line.

	Compiling with -DMMX_TRACE will cause detailed trace
	output to be sent to stderr for each mmx operation.
	This adds lots of code, and obviously slows execution to
	a crawl, but can be very useful for debugging.

	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
	AND FITNESS FOR ANY PARTICULAR PURPOSE.

	June 11, 1998 by H. Dietz and R. Fisher
*/



/*	Helper macros for the instruction macros that follow.

	Every helper expands to one parenthesized C assignment of the form

		dest.<sign>d = dest.<sign>d op src.<sign>d

	  op	a C binary operator token sequence (+, -, <<, &~, ...)
	  sign	`u' selects the unsigned .ud field; an empty argument
		selects the signed .d field

	The suffix names the operand kinds (m2r memory-to-register,
	r2m register-to-memory, i2r immediate-to-register, r2r
	register-to-register, m2m memory-to-memory).  In this C
	emulation all of them expand the same way; i2r uses the
	immediate expression itself as the right-hand operand.

	Only `sign' and `d' are token-pasted.  Pasting an identifier to
	`.' or `(' does not produce a valid preprocessing token and is
	rejected by current compilers, so member access is written with
	ordinary tokens.  Operands are parenthesized so that expression
	arguments keep their meaning; note that the destination argument
	is evaluated twice.
*/

#define	ia32_m2r(op, sign, mem, reg)	\
	((reg).sign##d = (reg).sign##d op (mem).sign##d)

#define	ia32_r2m(op, sign, reg, mem)	\
	((mem).sign##d = (mem).sign##d op (reg).sign##d)

#define	ia32_i2r(op, sign, immed, regd)	\
	((regd).sign##d = (regd).sign##d op (immed))

#define	ia32_r2r(op, sign, regs, regd) 	\
	((regd).sign##d = (regd).sign##d op (regs).sign##d)

#define	ia32_m2m(op, sign, mems, memd) 	\
	((memd).sign##d = (memd).sign##d op (mems).sign##d)


/*	1x32 MOVe Long (copies the unsigned .ud field)

	  movl_x2r(base, reg)	reg = object addressed by base.ud
	  movl_r2x(reg, base)	object addressed by base.ud = reg
	  movl_m2r(var, reg)	reg = var
	  movl_r2m(reg, var)	var = reg
	  movl_r2r(regs, regd)	regd = regs

	The x2r/r2x forms cast the integer held in base.ud to a
	(p32_t *) and access that object's .ud field.
	NOTE(review): this presumably requires that a pointer fits in
	.ud (i.e. a 32-bit target) -- confirm before using elsewhere.

	Member access is written with ordinary tokens rather than `##':
	pasting an identifier to `.' or `(' does not form a valid
	preprocessing token and is rejected by current compilers.
*/
#define movl_x2r(base, reg)	((reg).ud = (*((p32_t *)((base).ud))).ud)
#define movl_r2x(reg, base)	((*((p32_t *)((base).ud))).ud = (reg).ud)
#define	movl_m2r(var, reg)	((reg).ud = (var).ud)
#define	movl_r2m(reg, var)	((var).ud = (reg).ud)
#define	movl_r2r(regs, regd)	((regd).ud = (regs).ud)


/*	1x32 ADD on the unsigned (.ud) field:  dst = dst + src
	(one whole-field add; no sub-field parallelism is performed)
*/
#define	addl_m2r(src, dst)	ia32_m2r(+, u, src, dst)
#define	addl_r2r(src, dst)	ia32_r2r(+, u, src, dst)
#define	addl(src, dst)		ia32_m2m(+, u, src, dst)


/*	1x32 SUB on the unsigned (.ud) field:  dst = dst - src
	(one whole-field subtract; no sub-field parallelism is performed)
*/
#define	subl_m2r(src, dst)	ia32_m2r(-, u, src, dst)
#define	subl_r2r(src, dst)	ia32_r2r(-, u, src, dst)
#define	subl(src, dst)		ia32_m2m(-, u, src, dst)


/*	1x32 MUL on the unsigned (.ud) field:  dst = dst * src
	The product is stored back into .ud, so only the low portion
	of the full result is kept.
*/
#define	mulll_m2r(src, dst)	ia32_m2r(*, u, src, dst)
#define	mulll_r2r(src, dst)	ia32_r2r(*, u, src, dst)
#define	mulll(src, dst)		ia32_m2m(*, u, src, dst)


/*	1x32 MUL giving the High 32 bits of the 64-bit product:

		dst = (dst * src) >> 32	   (on the unsigned .ud field)

	The previous definition passed the MMX mnemonic `pmulhw' as if it
	were a C binary operator, so every use was a syntax error.  The
	product is now formed in unsigned long long and its upper half
	is stored.
	NOTE(review): the unsigned interpretation follows the `u' field
	selector the old definition passed; confirm that a signed
	high-multiply is not what callers expect.  Assumes .ud is 32
	bits wide.  The destination argument is evaluated twice.
*/
#define	ia32_mulhu(src, dst)	\
	((dst).ud = (unsigned int)(((unsigned long long)(dst).ud * (src).ud) >> 32))

#define	mulh_m2r(var, reg)	ia32_mulhu(var, reg)
#define	mulh_r2r(regs, regd)	ia32_mulhu(regs, regd)
#define	mulh(vars, vard)	ia32_mulhu(vars, vard)


/*	1x32 Unsigned DIV on the .ud field:  dst = dst / src
	(plain C division; a zero divisor is undefined behavior)
*/
#define	divul_m2r(src, dst)	ia32_m2r(/, u, src, dst)
#define	divul_r2r(src, dst)	ia32_r2r(/, u, src, dst)
#define	divul(src, dst)		ia32_m2m(/, u, src, dst)


/*	1x32 Signed DIV on the .d field:  dst = dst / src
	(the empty second argument selects the signed field;
	 plain C division, so a zero divisor is undefined behavior)
*/
#define	divl_m2r(src, dst)	ia32_m2r(/, , src, dst)
#define	divl_r2r(src, dst)	ia32_r2r(/, , src, dst)
#define	divl(src, dst)		ia32_m2m(/, , src, dst)


/*	1x32 Unsigned MOD on the .ud field:  dst = dst % src
	(plain C remainder; a zero divisor is undefined behavior)
*/
#define	modul_m2r(src, dst)	ia32_m2r(%, u, src, dst)
#define	modul_r2r(src, dst)	ia32_r2r(%, u, src, dst)
#define	modul(src, dst)		ia32_m2m(%, u, src, dst)


/*	1x32 Signed MOD on the .d field:  dst = dst % src
	(the empty second argument selects the signed field;
	 plain C remainder, so a zero divisor is undefined behavior)
*/
#define	modl_m2r(src, dst)	ia32_m2r(%, , src, dst)
#define	modl_r2r(src, dst)	ia32_r2r(%, , src, dst)
#define	modl(src, dst)		ia32_m2m(%, , src, dst)



/*	1x32 bitwise AND on the unsigned (.ud) field:  dst = dst & src
*/
#define	andl_m2r(src, dst)	ia32_m2r(&, u, src, dst)
#define	andl_r2r(src, dst)	ia32_r2r(&, u, src, dst)
#define	andl(src, dst)		ia32_m2m(&, u, src, dst)


/*	1x32 bitwise AND with Not the destination:

		dst = ~dst & src	   (on the unsigned .ud field)

	The previous expansion went through the generic helper with the
	operator `&~', which computes dst & ~src -- the complement was
	applied to the source, contradicting the documented operation.
	The complement is now applied to the destination.
	NOTE(review): callers that relied on the old dst & ~src result
	must swap their operands; confirm against the code generator.
	The destination argument is evaluated twice.

	(The original authors noted: "This doesn't exist in IA32?")
*/
#define	andnl_m2r(var, reg)	((reg).ud = ~(reg).ud & (var).ud)
#define	andnl_r2r(regs, regd)	((regd).ud = ~(regd).ud & (regs).ud)
#define	andnl(vars, vard)	((vard).ud = ~(vard).ud & (vars).ud)


/*	1x32 bitwise inclusive OR on the unsigned (.ud) field:
	dst = dst | src
*/
#define	orl_m2r(src, dst)	ia32_m2r(|, u, src, dst)
#define	orl_r2r(src, dst)	ia32_r2r(|, u, src, dst)
#define	orl(src, dst)		ia32_m2m(|, u, src, dst)


/*	1x32 bitwise eXclusive OR on the unsigned (.ud) field:
	dst = dst ^ src
*/
#define	xorl_m2r(src, dst)	ia32_m2r(^, u, src, dst)
#define	xorl_r2r(src, dst)	ia32_r2r(^, u, src, dst)
#define	xorl(src, dst)		ia32_m2m(^, u, src, dst)


/*	1x32 CoMPare for EQuality on the unsigned (.ud) field
	(the resulting field is either 0 or -1, i.e. all bits set):

		dst = (dst == src) ? all-ones : 0

	The previous expansion stored the raw result of the C `=='
	operator, which is 0 or 1 and so did not match the documented
	0 / -1 result.  The comparison result is now negated as an
	unsigned value, giving 0 or all bits set.
	NOTE(review): confirm no caller depends on the old 0 / 1 value.
	The destination argument is evaluated twice.
*/
#define	cmpeql_m2r(var, reg)	\
	((reg).ud = -(unsigned int)((reg).ud == (var).ud))
#define	cmpeql_r2r(regs, regd)	\
	((regd).ud = -(unsigned int)((regd).ud == (regs).ud))
#define	cmpeql(vars, vard)	\
	((vard).ud = -(unsigned int)((vard).ud == (vars).ud))


/*	1x32 Signed CoMPare for Greater Than on the .d field
	(the resulting field is either 0 or -1):

		dst = (dst > src) ? -1 : 0

	The previous expansion stored the raw result of the C `>'
	operator, which is 0 or 1 and so did not match the documented
	0 / -1 result.  The comparison result is now negated.
	NOTE(review): confirm no caller depends on the old 0 / 1 value.
	The destination argument is evaluated twice.
*/
#define	cmpgtl_m2r(var, reg)	((reg).d = -((reg).d > (var).d))
#define	cmpgtl_r2r(regs, regd)	((regd).d = -((regd).d > (regs).d))
#define	cmpgtl(vars, vard)	((vard).d = -((vard).d > (vars).d))


/*	1x32 Unsigned CoMPare for Greater Than on the .ud field
	(the resulting field is either 0 or -1, i.e. all bits set):

		dst = (dst > src) ? all-ones : 0

	The previous expansion stored the raw result of the C `>'
	operator, which is 0 or 1 and so did not match the documented
	0 / -1 result.  The comparison result is now negated as an
	unsigned value, giving 0 or all bits set.
	NOTE(review): confirm no caller depends on the old 0 / 1 value.
	The destination argument is evaluated twice.
*/
#define	cmpgtul_m2r(var, reg)	\
	((reg).ud = -(unsigned int)((reg).ud > (var).ud))
#define	cmpgtul_r2r(regs, regd)	\
	((regd).ud = -(unsigned int)((regd).ud > (regs).ud))
#define	cmpgtul(vars, vard)	\
	((vard).ud = -(unsigned int)((vard).ud > (vars).ud))


/*	1x32 Shift Left Logical on the unsigned (.ud) field:
	dst = dst << count
	The count comes from a memory operand (m2r, and the m2m form
	slll), an immediate (i2r) or a register (r2r).  This is the
	plain C `<<' operator, so the count must be smaller than the
	width of the field.
*/
#define	slll_m2r(count, dst)	ia32_m2r(<<, u, count, dst)
#define	slll_i2r(count, dst)	ia32_i2r(<<, u, count, dst)
#define	slll_r2r(count, dst)	ia32_r2r(<<, u, count, dst)
#define	slll(count, dst)	ia32_m2m(<<, u, count, dst)


/*	1x32 Shift Right Logical on the unsigned (.ud) field:
	dst = dst >> count
	The count comes from a memory operand (m2r, and the m2m form
	srll), an immediate (i2r) or a register (r2r).  This is the
	plain C `>>' operator on an unsigned value, so zeros are
	shifted in; the count must be smaller than the width of the
	field.
*/
#define	srll_m2r(count, dst)	ia32_m2r(>>, u, count, dst)
#define	srll_i2r(count, dst)	ia32_i2r(>>, u, count, dst)
#define	srll_r2r(count, dst)	ia32_r2r(>>, u, count, dst)
#define	srll(count, dst)	ia32_m2m(>>, u, count, dst)


/*	1x32 Shift Right Arithmetic on the signed (.d) field:
	dst = dst >> count
	The empty second argument selects the signed field.  The count
	comes from a memory operand (m2r, and the m2m form sral), an
	immediate (i2r) or a register (r2r).  This is the plain C `>>'
	operator on a signed value: the result for a negative dst is
	implementation-defined in ISO C (GCC sign-extends), and the
	count must be smaller than the width of the field.
*/
#define	sral_m2r(count, dst)	ia32_m2r(>>, , count, dst)
#define	sral_i2r(count, dst)	ia32_i2r(>>, , count, dst)
#define	sral_r2r(count, dst)	ia32_r2r(>>, , count, dst)
#define	sral(count, dst)	ia32_m2m(>>, , count, dst)

