/* lp.Sc */

/* For original commentary, see file "commentary" */


/* Precision selection: SP makes REAL a float, DP makes it a double. */
#define SP
/* #define DP      */
/* Inner-loop form used by daxpy/ddot/dscal: ROLL (plain loop) or UNROLL. */
#define ROLL
/* #define UNROLL  */
/* Define PRECOMP to compile the compiler name and options into the program
   instead of asking for them at run time. */
/* #define PRECOMP */

/***************************************************************/
/* Timer options. You MUST uncomment one of the options below  */
/* or compile, for example, with the '-DUNIX' option.          */
/***************************************************************/
#define UNIX

/* Single precision.  Note that ZERO and ONE are unsuffixed (double)
   constants, so expressions using them are evaluated in double. */
#ifdef SP
#define REAL float
#define ZERO 0.0
#define ONE 1.0
#define PREC "Single "
#endif

/* Double precision. */
#ifdef DP
#define REAL double
#define ZERO 0.0e0
#define ONE 1.0e0
#define PREC "Double "
#endif

/* ROLLING is the label printed in the report for the loop form in use. */
#ifdef ROLL
#define ROLLING "Rolled "
#endif
#ifdef UNROLL
#define ROLLING "Unrolled "
#endif


/*PRECOMPILE  PRECOMPILE  PRECOMPILE  PRECOMPILE  PRECOMPILE  PRECOMPILE*/
 
/* Replace the two placeholder strings before building with PRECOMP. */
#ifdef PRECOMP 
    #define precompiler "INSERT COMPILER NAME HERE"
    #define preoptions  "INSERT OPTIMISATION OPTIONS HERE"
#endif


/* Initial repetition count for the two calibration loops in main(). */
#define NTIMES 10

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>


/*
 * Result table filled in by main() and printed by print_time().
 * First index (quantity):  0 dgefa seconds, 1 dgesl seconds, 2 total seconds,
 *                          3 Mflops, 4 "unit" (2.0/Mflops), 5 "ratio" (total/cray).
 * Second index (run):      0 single pass; 1-5 repeat passes with leading
 *                          dimension lda, 6 their Mflops average; 7-11 repeat
 *                          passes with leading dimension ldaa, 12 their average.
 * The remaining entries are not used by the code in this file.
 */
static REAL atime[9][15];

/* Prints one run (second index of atime) as a line on stderr. */
void print_time (int row);
/* Fills a with pseudo-random values, b with the row sums, *norma with the max entry. */
void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma);
/* LU factorisation with partial pivoting. */
void dgefa (REAL a[], int lda, int n, int ipvt[], int *info);
/* Solves a system using the factors produced by dgefa. */
void dgesl (REAL a[],int lda,int n,int ipvt[],REAL b[],int job);
/* y = y + m*x for a column-major matrix m. */
void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]);
/* dy = dy + da*dx. */
void daxpy (int n, REAL da, REAL dx[], int incx, REAL dy[], int incy);
/* Estimate of the unit roundoff scaled by |x|. */
REAL epslon (REAL x);
/* Index of the element with the largest absolute value. */
int idamax (int n, REAL dx[], int incx);
/* dx = da*dx. */
void dscal (int n, REAL da, REAL dx[], int incx);
/* Dot product of dx and dy. */
REAL ddot (int n, REAL dx[], int incx, REAL dy[], int incy);



/*
 * prompt_line: write LABEL to stdout and read one line of at most SIZE-1
 * characters from stdin into BUF.  As with fgets, a trailing newline is kept,
 * and BUF is left untouched if nothing could be read.
 */
static void prompt_line (const char *label, char *buf, int size)
{
    printf ("%s", label);
    fgets (buf, size, stdin);
}

/*
 * run_timed_passes: run five timed repetitions of "ntimes x (matgen + dgefa)"
 * followed by "ntimes x dgesl" on matrix A with leading dimension LDA.
 *
 * Results for repetition r (0..4) go to atime[..][first_col + r]; the average
 * Mflops of the five repetitions goes to atime[3][first_col + 5].  OVERHEAD is
 * the measured cost of one matgen call and is subtracted from the dgefa time.
 * Each repetition is printed with print_time() and the average is printed on
 * stderr.
 *
 * If a repetition's total time is not positive (timer too coarse, or the
 * overhead estimate exceeds the measurement) its Mflops and unit figures are
 * recorded as 0.0, the same convention main() uses for the single pass.
 */
static void run_timed_passes (REAL a[], int lda, int n, REAL b[], int ipvt[],
                              int ntimes, REAL overhead, REAL ops, REAL cray,
                              int first_col)
{
    REAL dtime();
    REAL norma, t1, tm2, total;
    int  info, i, j;
    int  avg_col = first_col + 5;

    tm2 = ntimes * overhead;
    atime[3][avg_col] = 0;

    for (j = first_col; j < first_col + 5; j++)
    {
        t1 = dtime();
        for (i = 0; i < ntimes; i++)
        {
            matgen(a,lda,n,b,&norma);
            dgefa(a,lda,n,ipvt,&info);
        }
        atime[0][j] = (dtime() - t1 - tm2)/ntimes;

        t1 = dtime();
        for (i = 0; i < ntimes; i++)
        {
            dgesl(a,lda,n,ipvt,b,0);
        }
        atime[1][j] = (dtime() - t1)/ntimes;

        total       = atime[0][j] + atime[1][j];
        atime[2][j] = total;
        if (total > 0.0)
        {
            atime[3][j] = ops/(1.0e6*total);
            atime[4][j] = 2.0/atime[3][j];
        }
        else
        {
            atime[3][j] = 0.0;
            atime[4][j] = 0.0;
        }
        atime[5][j] = total/cray;
        atime[3][avg_col] = atime[3][avg_col] + atime[3][j];

        print_time(j);
    }

    atime[3][avg_col] = atime[3][avg_col] / 5.0;
    fprintf (stderr,"Average                          %11.2f\n",
                                           (double)atime[3][avg_col]);
}

/*
 * Linpack benchmark driver.
 *
 * Solves a 100x100 system once and checks the residual, calibrates a
 * repetition count that takes roughly five seconds, then times five
 * repetitions for each of two leading dimensions (201 and 200).  The smaller
 * of the two Mflops averages is reported as the rating.  Progress goes to
 * stdout/stderr and a report is appended to "Linpackc.res".
 *
 * An argument beginning with 'N' or 'n' suppresses the interactive questions
 * about the machine; the corresponding report fields are then left blank.
 *
 * Returns 0 on success, EXIT_FAILURE if the results file cannot be opened.
 */
int main (int argc, char *argv[])
{
    static REAL aa[200*200],a[200*201],b[200],x[200];
    static int  ipvt[200],n,i,ntimes,info,lda,ldaa;
    REAL cray,ops,total,norma,normx;
    REAL resid,residn,eps,t1,epsn,x1,x2;
    REAL mflops;
    REAL overhead1, overhead2, time1, time2, dtime();
    int  pass, loop, k;
    FILE *outfile;
    char compiler[80] = " ", options[256] = " ", general[10][80] = {" "};
    char Endit[10] = " ";
    const char *getinput = "Yes";
    /* Prompts for general[0..9], in the order the questions are asked. */
    static const char *const labels[10] = {
        "Date:       ", "Computer:   ", "CPU chip:   ", "Clock MHz:  ",
        "Cache size: ", "H/W options:", "OS version: ",
        "Your name:  ", "From:       ", "Email:      "
    };

    if (argc > 1 && (argv[1][0] == 'N' || argv[1][0] == 'n'))
    {
        getinput = "No";
    }

    outfile = fopen("Linpackc.res","a+");
    if (outfile == NULL)
    {
        printf ("Cannot open results file \n\n");
        printf("Press RETURN to exit\n");
        fgets(Endit,10,stdin);
        /* Report the failure to the caller instead of exiting with 0. */
        exit (EXIT_FAILURE);
    }

    lda = 201;
    ldaa = 200;
    cray = .056;
    n = 100;

    fprintf(stdout,ROLLING);fprintf(stdout,PREC);
    fprintf(stdout,"Precision Linpack Benchmark - in 'C/C++'\n\n");

    /* Compare contents: comparing the pointers with == only works when the
       compiler happens to merge identical string literals. */
    if (strcmp(getinput, "No") == 0)
    {
        printf ("No run time input data\n\n");
    }

#ifdef PRECOMP
    fprintf(stdout,"Compiler      %s\n", precompiler);
    fprintf(stdout,"Optimisation  %s\n", preoptions);
#else
    printf("\n");
#endif

    /* Nominal operation count for factor + solve of an n x n system. */
    ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n);

    /* Single timed pass. */
    matgen(a,lda,n,b,&norma);
    t1 = dtime();
    dgefa(a,lda,n,ipvt,&info);
    atime[0][0] = dtime() - t1;
    t1 = dtime();
    dgesl(a,lda,n,ipvt,b,0);
    atime[1][0] = dtime() - t1;
    total = atime[0][0] + atime[1][0];

    /* Compute a residual to verify results: regenerate A and b, then form
       b := A*x - b in place. */
    for (i = 0; i < n; i++) {
        x[i] = b[i];
    }
    matgen(a,lda,n,b,&norma);
    for (i = 0; i < n; i++) {
        b[i] = -b[i];
    }
    dmxpy(n,b,n,lda,x,a);
    resid = 0.0;
    normx = 0.0;
    for (i = 0; i < n; i++) {
        resid = (resid > fabs((double)b[i]))
                ? resid : fabs((double)b[i]);
        normx = (normx > fabs((double)x[i]))
                ? normx : fabs((double)x[i]);
    }
    eps = epslon(ONE);
    residn = resid/( n*norma*normx*eps );
    epsn = eps;
    x1 = x[0] - 1;
    x2 = x[n-1] - 1;

    printf("norm resid      resid           machep");
    printf("         x[0]-1          x[n-1]-1\n");
    printf("%6.1f %17.8e%17.8e%17.8e%17.8e\n\n",
           (double)residn, (double)resid, (double)epsn,
           (double)x1, (double)x2);

    fprintf(stderr,"Times are reported for matrices of order        %5d\n",n);
    fprintf(stderr,"1 pass times for array with leading dimension of%5d\n\n",lda);
    fprintf(stderr,"      dgefa      dgesl      total     Mflops       unit");
    fprintf(stderr,"      ratio\n");

    atime[2][0] = total;
    if (total > 0.0)
    {
        atime[3][0] = ops/(1.0e6*total);
        atime[4][0] = 2.0/atime[3][0];
    }
    else
    {
        atime[3][0] = 0.0;
        atime[4][0] = 0.0;
    }
    atime[5][0] = total/cray;

    print_time(0);

/************************************************************************
 *       Calculate overhead of executing matgen procedure              *
 ************************************************************************/

    /* Grow the repeat count until one batch exceeds 5 seconds, or until 20
       batches have been tried. */
    fprintf (stderr,"\nCalculating matgen overhead\n");
    pass = -20;
    loop = NTIMES;
    do
    {
        time1 = dtime();
        pass = pass + 1;
        for ( i = 0 ; i < loop ; i++)
        {
            matgen(a,lda,n,b,&norma);
        }
        time2 = dtime();
        overhead1 = (time2 - time1);
        fprintf (stderr,"%10d times %6.2f seconds\n", loop, overhead1);
        if (overhead1 > 5.0)
        {
            pass = 0;
        }
        if (pass < 0)
        {
            loop = (overhead1 < 0.1) ? loop * 10 : loop * 2;
        }
    }
    while (pass < 0);

    overhead1 = overhead1 / (double)loop;

    fprintf (stderr,"Overhead for 1 matgen %12.5f seconds\n\n", overhead1);

/************************************************************************
 *           Calculate matgen/dgefa passes for 5 seconds                *
 ************************************************************************/

    fprintf (stderr,"Calculating matgen/dgefa passes for 5 seconds\n");
    pass = -20;
    ntimes = NTIMES;
    do
    {
        time1 = dtime();
        pass = pass + 1;
        for ( i = 0 ; i < ntimes ; i++)
        {
            matgen(a,lda,n,b,&norma);
            dgefa(a,lda,n,ipvt,&info );
        }
        time2 = dtime() - time1;
        fprintf (stderr,"%10d times %6.2f seconds\n", ntimes, time2);
        if (time2 > 5.0)
        {
            pass = 0;
        }
        if (pass < 0)
        {
            ntimes = (time2 < 0.1) ? ntimes * 10 : ntimes * 2;
        }
    }
    while (pass < 0);

    /* Scale the count to about 5 seconds; skip the scaling if the timer
       reported no elapsed time, to avoid dividing by zero. */
    if (time2 > 0.0)
    {
        ntimes = (int)(5.0 * (double)ntimes / time2);
    }
    if (ntimes == 0) ntimes = 1;

    fprintf (stderr,"Passes used %10d \n\n", ntimes);
    fprintf(stderr,"Times for array with leading dimension of%4d\n\n",lda);
    fprintf(stderr,"      dgefa      dgesl      total     Mflops       unit");
    fprintf(stderr,"      ratio\n");

/************************************************************************
 *                Execute 5 passes, leading dimension lda               *
 ************************************************************************/

    run_timed_passes(a, lda, n, b, ipvt, ntimes, overhead1, ops, cray, 1);

    fprintf (stderr,"\nCalculating matgen2 overhead\n");

/************************************************************************
 *   Calculate overhead of executing matgen with leading dimension ldaa *
 ************************************************************************/

    time1 = dtime();
    for ( i = 0 ; i < loop ; i++)
    {
        matgen(aa,ldaa,n,b,&norma);
    }
    time2 = dtime();
    overhead2 = (time2 - time1);
    overhead2 = overhead2 / (double)loop;

    fprintf (stderr,"Overhead for 1 matgen %12.5f seconds\n\n", overhead2);
    fprintf(stderr,"Times for array with leading dimension of%4d\n\n",ldaa);
    fprintf(stderr,"      dgefa      dgesl      total     Mflops       unit");
    fprintf(stderr,"      ratio\n");

/************************************************************************
 *                Execute 5 passes, leading dimension ldaa              *
 ************************************************************************/

    run_timed_passes(aa, ldaa, n, b, ipvt, ntimes, overhead2, ops, cray, 7);

/************************************************************************
 *           Use minimum average as overall Mflops rating               *
 ************************************************************************/

    mflops = atime[3][6];
    if (atime[3][12] < mflops) mflops = atime[3][12];

    fprintf(stderr,"\n");
    fprintf(stderr,ROLLING);fprintf(stderr,PREC);
    fprintf(stderr," Precision %11.2f Mflops \n\n",mflops);

/************************************************************************
 *             Type details of hardware, software etc.                  *
 ************************************************************************/

#ifdef PRECOMP
    /* Compiler details are baked in, whether or not questions are asked. */
    strcpy (compiler, precompiler);
    strcpy (options, preoptions);
#endif

    if (strcmp(getinput, "Yes") == 0)
    {
        printf ("Enter the following which will be added with results to file LINPACKC.RES\n");
        printf ("When submitting a number of results you need only provide details once\n");
        printf ("but a cross reference such as an abbreviated CPU type would be useful.\n");
        printf ("You can kill (exit or close) the program now and no data will be added.\n\n");

        for (k = 0; k < 7; k++)
        {
            prompt_line (labels[k], general[k], 80);
        }

#ifndef PRECOMP
        prompt_line ("Compiler:   ", compiler, 80);
        prompt_line ("Options:    ", options, 256);
#endif

        for (k = 7; k < 10; k++)
        {
            prompt_line (labels[k], general[k], 80);
        }
    }

/************************************************************************
 *              Add results to output file Linpackc.res                 *
 ************************************************************************/
    fprintf (outfile, "\n");
    fprintf (outfile, "##############################################\n");
    fprintf (outfile, "LINPACK BENCHMARK in 'C/C++'    n @ 100\n\n");
    fprintf (outfile, "Date         %s\n", general[0]);
    fprintf (outfile, "Model        %s\n", general[1]);
    fprintf (outfile, "CPU          %s\n", general[2]);
    fprintf (outfile, "Clock MHz    %s\n", general[3]);
    fprintf (outfile, "Cache        %s\n", general[4]);
    fprintf (outfile, "H/W options  %s\n", general[5]);
    fprintf (outfile, "OS           %s\n", general[6]);
    fprintf (outfile, "Compiler     %s\n", compiler);
    fprintf (outfile, "Options      %s\n", options);
    fprintf (outfile, "Run by       %s\n", general[7]);
    fprintf (outfile, "From         %s\n", general[8]);
    fprintf (outfile, "Email        %s\n", general[9]);
    fprintf (outfile, "\n");

    fprintf(outfile, "Rolling            %s\n",ROLLING);
    fprintf(outfile, "Precision          %s\n",PREC);
    fprintf(outfile, "norm. resid        %16.1f\n",(double)residn);
    fprintf(outfile, "resid              %16.8e\n",(double)resid);
    fprintf(outfile, "machep             %16.8e\n",(double)epsn);
    fprintf(outfile, "x[0]-1             %16.8e\n",(double)x1);
    fprintf(outfile, "x[n-1]-1           %16.8e\n",(double)x2);
    fprintf(outfile, "matgen 1 seconds   %16.5f\n",overhead1);
    fprintf(outfile, "matgen 2 seconds   %16.5f\n",overhead2);
    fprintf(outfile, "Repetitions        %16d\n",ntimes);
    fprintf(outfile, "Leading dimension  %16d\n",lda);
    fprintf(outfile, "                              dgefa     dgesl "
                     "    total    Mflops\n");
    fprintf(outfile, "1 pass seconds     %16.5f %9.5f %9.5f\n",
                      atime[0][0], atime[1][0], atime[2][0]);

    for (i=1 ; i<6 ; i++)
    {
        fprintf(outfile, "Repeat seconds     %16.5f %9.5f %9.5f %9.2f\n",
                       atime[0][i], atime[1][i], atime[2][i], atime[3][i]);
    }
    fprintf(outfile, "Average            %46.2f\n",atime[3][6]);

    fprintf(outfile, "Leading dimension  %16d\n",ldaa);

    for (i=7 ; i<12 ; i++)
    {
        fprintf(outfile, "Repeat seconds     %16.5f %9.5f %9.5f %9.2f\n",
                       atime[0][i], atime[1][i], atime[2][i], atime[3][i]);
    }
    fprintf(outfile, "Average            %46.2f\n\n",atime[3][12]);

    fclose (outfile);

    printf ("\n");
    printf ("A new results file will have been created in the same directory as the\n");
    printf (".EXE files if one did not already exist. If you made a mistake on input, \n");
    printf ("you can use a text editor to correct it, delete the results or copy \n");
    printf ("them to a different file name. If you intend to run multiple tests you\n");
    printf ("you may wish to rename LINPACKC.RES with a more informative title.\n\n");
    printf ("Please submit feedback and results files to aburto@nosc.mil or to\n");
    printf ("Roy_Longbottom@compuserve.com\n\n");

    return 0;
}
     
/*----------------------*/ 
/*
 * print_time: write one run of the result table to stderr.
 *
 * "row" selects the second index of atime (the run); the six quantities
 * atime[0..5][row] are printed in order on a single line.
 */
void print_time (int row)
{
    double field[6];
    int    q;

    /* Widen to double up front so the varargs call sees plain doubles. */
    for (q = 0; q < 6; q++)
    {
        field[q] = (double)atime[q][row];
    }

    fprintf(stderr,"%11.5f%11.5f%11.5f%11.2f%11.4f%11.4f\n",
            field[0], field[1], field[2], field[3], field[4], field[5]);
}
      
/*----------------------*/ 

/*
 * matgen: generate the benchmark's test system.
 *
 * Fills the leading n x n part of a (stored with stride lda, element (i,j)
 * at a[lda*j+i]) with values from a fixed integer recurrence, so every call
 * produces the same matrix.  On return b[i] holds the sum over j of
 * a[lda*j+i], and *norma holds the largest generated value (signed
 * comparison, never below 0.0).
 */
void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma)
{
    int seed = 1325;
    int col, row;

    *norma = 0.0;

    /* Generate the entries, tracking the largest one as we go. */
    for (col = 0; col < n; col++) {
        REAL *column = &a[lda*col];

        for (row = 0; row < n; row++) {
            seed = 3125*seed % 65536;
            column[row] = (seed - 32768.0)/16384.0;
            if (column[row] > *norma) {
                *norma = column[row];
            }
        }
    }

    /* Right-hand side: accumulate each stored column into b. */
    for (row = 0; row < n; row++) {
        b[row] = 0.0;
    }
    for (col = 0; col < n; col++) {
        const REAL *column = &a[lda*col];

        for (row = 0; row < n; row++) {
            b[row] = b[row] + column[row];
        }
    }
}

/*----------------------*/ 
/*
 * dgefa: factor a REAL matrix by gaussian elimination with partial pivoting.
 *
 * C does not let us declare a[][lda] here, so the element the LINPACK
 * documentation calls a(i,j) is written a[lda*j+i].
 *
 * on entry
 *    a     REAL [n][lda]   the matrix to be factored.
 *    lda   leading dimension (stride between consecutive columns) of a.
 *    n     order of the matrix.
 *
 * on return
 *    a     an upper triangular matrix and the multipliers used to obtain it.
 *          The factorization can be written a = l*u, where l is a product of
 *          permutation and unit lower triangular matrices and u is upper
 *          triangular.
 *    ipvt  int [n]   pivot indices.
 *    info  0 in the normal case, otherwise the (0-based) index k of the last
 *          zero pivot found.  Because indices are 0-based, a zero pivot at
 *          k == 0 cannot be told apart from success; this quirk of the C
 *          translation is kept for compatibility with existing callers.
 *          A zero pivot is not an error here, but dgesl will divide by zero
 *          if called with such factors.
 *
 * For n < 1 the routine sets *info = 0 and returns without touching a or
 * ipvt (the original indexed ipvt[n-1] and a[lda*(n-1)+(n-1)] in that case).
 *
 * linpack. this version dated 08/14/78 .
 * cleve moler, university of new mexico, argonne national lab.
 *
 * uses blas daxpy, dscal, idamax
 */
void dgefa(REAL a[], int lda, int n, int ipvt[], int *info)
{
    REAL t;
    int j, k, l;

    *info = 0;
    if (n < 1) {
        return;
    }

    for (k = 0; k < n - 1; k++) {

        /* find l = pivot index */
        l = idamax(n-k,&a[lda*k+k],1) + k;
        ipvt[k] = l;

        /* zero pivot implies this column already triangularized */
        if (a[lda*k+l] == ZERO) {
            *info = k;
            continue;
        }

        /* interchange if necessary */
        if (l != k) {
            t = a[lda*k+l];
            a[lda*k+l] = a[lda*k+k];
            a[lda*k+k] = t;
        }

        /* compute multipliers */
        t = -ONE/a[lda*k+k];
        dscal(n-(k+1),t,&a[lda*k+k+1],1);

        /* row elimination with column indexing */
        for (j = k + 1; j < n; j++) {
            t = a[lda*j+l];
            if (l != k) {
                a[lda*j+l] = a[lda*j+k];
                a[lda*j+k] = t;
            }
            daxpy(n-(k+1),t,&a[lda*k+k+1],1,
                  &a[lda*j+k+1],1);
        }
    }

    ipvt[n-1] = n-1;
    if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1;
}

/*----------------------*/ 

/*
 * dgesl: solve  a * x = b  or  trans(a) * x = b  using the factors computed
 * by dgefa.
 *
 * C does not let us declare a[][lda] here, so the element the LINPACK
 * documentation calls a(i,j) is written a[lda*j+i].
 *
 * on entry
 *    a     REAL [n][lda]   the output from dgefa.
 *    lda   leading dimension of a.
 *    n     order of the matrix.
 *    ipvt  int [n]         the pivot vector from dgefa.
 *    b     REAL [n]        the right hand side vector.
 *    job   0        to solve  a*x = b,
 *          nonzero  to solve  trans(a)*x = b.
 *
 * on return
 *    b     the solution vector x.
 *
 * error condition
 *    a division by zero occurs if the input factor contains a zero on the
 *    diagonal.
 *
 * linpack. this version dated 08/14/78 .
 * cleve moler, university of new mexico, argonne national lab.
 *
 * uses blas daxpy, ddot
 */
void dgesl(REAL a[],int lda,int n,int ipvt[],REAL b[],int job )
{
    REAL t;
    int k, kb, l;
    int nm1 = n - 1;

    if (job == 0) {

        /* job = 0 , solve  a * x = b
           first solve  l*y = b         */

        for (k = 0; k < nm1; k++) {
            l = ipvt[k];
            t = b[l];
            if (l != k) {
                b[l] = b[k];
                b[k] = t;
            }
            daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 );
        }

        /* now solve  u*x = y */

        for (kb = 0; kb < n; kb++) {
            k = n - (kb + 1);
            b[k] = b[k]/a[lda*k+k];
            t = -b[k];
            daxpy(k,t,&a[lda*k+0],1,&b[0],1 );
        }
    }
    else {

        /* job = nonzero, solve  trans(a) * x = b
           first solve  trans(u)*y = b                  */

        for (k = 0; k < n; k++) {
            t = ddot(k,&a[lda*k+0],1,&b[0],1);
            b[k] = (b[k] - t)/a[lda*k+k];
        }

        /* now solve trans(l)*x = y
           k must run from n-2 down to 0 inclusive, i.e. kb = 1 .. n-1
           (the Fortran original loops kb = 1, nm1 with k = n - kb).  The
           previous bound "kb < nm1" stopped at k = 1 and never applied the
           first column's multipliers or pivot interchange. */

        for (kb = 1; kb <= nm1; kb++) {
            k = n - (kb+1);
            b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1);
            l = ipvt[k];
            if (l != k) {
                t = b[l];
                b[l] = b[k];
                b[k] = t;
            }
        }
    }
}

/*----------------------*/ 

/*
 * daxpy: constant times a vector plus a vector,  dy := dy + da*dx.
 * jack dongarra, linpack, 3/11/78.
 *
 *    n      number of elements to process; nothing is done for n <= 0.
 *    da     the scalar; nothing is done when it equals ZERO.
 *    dx     source vector, stepped by incx.
 *    dy     destination vector, stepped by incy.
 *    incx, incy   element strides.  A negative stride walks its vector
 *           backwards starting from element (n-1)*|inc|.
 *
 * For unit strides exactly one loop form is compiled in: the unrolled loop
 * if UNROLL is defined, otherwise the SWAR loop if SWAR is defined,
 * otherwise the plain rolled loop.  (Previously the unit-stride case did
 * nothing when neither ROLL nor UNROLL was defined, and applied the update
 * twice when both were.)
 */
void daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy)
{
    int i;

    if (n <= 0) return;
    if (da == ZERO) return;

    if (incx != 1 || incy != 1) {

        /* code for unequal increments or equal increments
           not equal to 1                                       */

        int ix = (incx < 0) ? (-n+1)*incx : 0;
        int iy = (incy < 0) ? (-n+1)*incy : 0;

        for (i = 0; i < n; i++) {
            dy[iy] = dy[iy] + da*dx[ix];
            ix = ix + incx;
            iy = iy + incy;
        }
        return;
    }

    /* code for both increments equal to 1 */

#if defined(UNROLL)

    {
        /* Peel off n % 4 leading elements, then go four at a time. */
        int m = n % 4;

        for (i = 0; i < m; i++)
            dy[i] = dy[i] + da*dx[i];

        for (i = m; i < n; i = i + 4) {
            dy[i] = dy[i] + da*dx[i];
            dy[i+1] = dy[i+1] + da*dx[i+1];
            dy[i+2] = dy[i+2] + da*dx[i+2];
            dy[i+3] = dy[i+3] + da*dx[i+3];
        }
    }

#elif defined(SWAR)

    {
        /* NOTE(review): VECTSIZE, p64_t and swar_saxpy are not defined in
           this file; presumably they come from an external SWAR header. */
        register float *p = &(dx[0]);
        register float *q = &(dy[0]);
        register float *e = &(dx[n]);

        /* Handle odd-sized vectors */
        for (i=(n%VECTSIZE); i>0; --i) {
            *(q++) += (da * *(p++));
        }
        while (p < e) {
            swar_saxpy((p64_t *)p, (p64_t *)q, &da);
            p += VECTSIZE;
            q += VECTSIZE;
        }
    }

#else

    for (i = 0; i < n; i++) {
        dy[i] = dy[i] + da*dx[i];
    }

#endif
}
   
/*----------------------*/ 

/*
 * ddot: forms the dot product of two vectors.
 * jack dongarra, linpack, 3/11/78.
 *
 *    n      number of elements; the result is ZERO for n <= 0.
 *    dx,dy  the two vectors, stepped by incx and incy respectively.
 *    incx, incy   element strides.  A negative stride walks its vector
 *           backwards starting from element (n-1)*|inc|.
 *
 * Returns the sum of dx[i]*dy[i] accumulated in REAL, in index order.
 *
 * For unit strides exactly one loop form is compiled in: the unrolled loop
 * if UNROLL is defined, otherwise the SWAR loop if SWAR is defined,
 * otherwise the plain rolled loop.  (Previously the function fell off its
 * end without returning a value when neither ROLL nor UNROLL was defined.)
 */
REAL ddot(int n, REAL dx[], int incx, REAL dy[], int incy)
{
    REAL dtemp = ZERO;
    int i;

    if (n <= 0) return(ZERO);

    if (incx != 1 || incy != 1) {

        /* code for unequal increments or equal increments
           not equal to 1                                       */

        int ix = (incx < 0) ? (-n+1)*incx : 0;
        int iy = (incy < 0) ? (-n+1)*incy : 0;

        for (i = 0; i < n; i++) {
            dtemp = dtemp + dx[ix]*dy[iy];
            ix = ix + incx;
            iy = iy + incy;
        }
        return(dtemp);
    }

    /* code for both increments equal to 1 */

#if defined(UNROLL)

    {
        /* Peel off n % 5 leading elements, then go five at a time. */
        int m = n % 5;

        for (i = 0; i < m; i++)
            dtemp = dtemp + dx[i]*dy[i];

        for (i = m; i < n; i = i + 5) {
            dtemp = dtemp + dx[i]*dy[i] +
            dx[i+1]*dy[i+1] + dx[i+2]*dy[i+2] +
            dx[i+3]*dy[i+3] + dx[i+4]*dy[i+4];
        }
    }

#elif defined(SWAR)

    {
        /* NOTE(review): VECTSIZE, p64_t and swar_sdot are not defined in
           this file; presumably they come from an external SWAR header. */
        register float *p = &(dx[0]);
        register float *q = &(dy[0]);
        register float *e = &(dx[n]);

        for (i=(n%VECTSIZE); i>0; --i) {
            dtemp += (*(p++) * *(q++));
        }
        while (p < e) {
            swar_sdot((p64_t *)p, (p64_t *)q, &dtemp);
            p += VECTSIZE;
            q += VECTSIZE;
        }
    }

#else

    for (i = 0; i < n; i++)
        dtemp = dtemp + dx[i]*dy[i];

#endif

    return(dtemp);
}

/*----------------------*/ 
/*
 * dscal: scales a vector by a constant,  dx := da*dx.
 * jack dongarra, linpack, 3/11/78.
 *
 *    n     number of elements to scale; nothing is done for n <= 0.
 *    da    the scale factor.
 *    dx    the vector, stepped by incx.
 *    incx  element stride.  With a stride other than 1 the elements
 *          dx[0], dx[incx], ... below index n*incx are scaled; a zero or
 *          negative stride therefore scales nothing.
 *
 * For unit stride exactly one loop form is compiled in: the unrolled loop
 * if UNROLL is defined, otherwise the SWAR loop if SWAR is defined,
 * otherwise the plain rolled loop.  (Previously the unit-stride case did
 * nothing when neither ROLL nor UNROLL was defined, and scaled twice when
 * both were.)
 */
void dscal(int n, REAL da, REAL dx[], int incx)
{
    int i;

    if (n <= 0) return;

    if (incx != 1) {

        /* code for increment not equal to 1 */

        int nincx = n*incx;

        for (i = 0; i < nincx; i = i + incx)
            dx[i] = da*dx[i];

        return;
    }

    /* code for increment equal to 1 */

#if defined(UNROLL)

    {
        /* Peel off n % 5 leading elements, then go five at a time. */
        int m = n % 5;

        for (i = 0; i < m; i++)
            dx[i] = da*dx[i];

        for (i = m; i < n; i = i + 5) {
            dx[i] = da*dx[i];
            dx[i+1] = da*dx[i+1];
            dx[i+2] = da*dx[i+2];
            dx[i+3] = da*dx[i+3];
            dx[i+4] = da*dx[i+4];
        }
    }

#elif defined(SWAR)

    {
        /* NOTE(review): VECTSIZE, p64_t and swar_sscal are not defined in
           this file; presumably they come from an external SWAR header. */
        register float *p = &(dx[0]);
        register float *e = &(dx[n]);

        for (i=(n%VECTSIZE); i>0; --i) {
            *(p++) *= da;
        }
        while (p < e) {
            swar_sscal((p64_t *)p, &da);
            p += VECTSIZE;
        }
    }

#else

    for (i = 0; i < n; i++)
        dx[i] = da*dx[i];

#endif
}

/*----------------------*/ 
/*
 * idamax: finds the index of the element having max. absolute value.
 * jack dongarra, linpack, 3/11/78.
 *
 *    n     number of elements to examine.
 *    dx    the vector; the elements examined are dx[0], dx[incx], ...
 *    incx  element stride.
 *
 * Returns the 0-based position i (counting examined elements, not array
 * slots) of the first element with the largest absolute value.  Returns -1
 * when n < 1, 0 when n == 1, and -1 when n > 1 but incx is not positive
 * (the reference BLAS likewise reports "no index" for a non-positive
 * increment).
 *
 * Fixes relative to the previous translation: the strided scan started at
 * dx[1+incx] instead of dx[incx], and returned an uninitialised index when
 * the first element was the largest.
 */
int idamax(int n, REAL dx[], int incx)
{
    REAL dmax;
    int i, ix, itemp;

    if (n < 1) return(-1);
    if (n == 1) return(0);
    if (incx < 1) return(-1);

    /* One scan serves every positive stride, including 1. */
    itemp = 0;
    dmax = fabs((double)dx[0]);
    ix = incx;
    for (i = 1; i < n; i++) {
        if (fabs((double)dx[ix]) > dmax) {
            itemp = i;
            dmax = fabs((double)dx[ix]);
        }
        ix = ix + incx;
    }
    return (itemp);
}

/*----------------------*/ 
/*
 * epslon: estimate unit roundoff in quantities of size x.
 *
 * The estimate relies on 4/3 not being exactly representable: the stored
 * value of 4/3, minus one, tripled, differs from one by the spacing of
 * floating point numbers near 1.0.  This works provided that
 *    1. the floating point base is not a power of three, and
 *    2. the intermediate values are rounded to the precision of REAL
 *       variables stored in memory.
 * The loop retries while the estimate comes out as zero.
 *
 * Returns that spacing multiplied by |x|.
 *
 * This routine is one of the auxiliary routines used by eispack iii to
 * avoid machine dependencies.  this version dated 4/6/83.
 */
REAL epslon (REAL x)
{
    REAL four_thirds, one_third, near_one, spacing;

    four_thirds = 4.0e0/3.0e0;
    do {
        one_third = four_thirds - ONE;
        near_one  = one_third + one_third + one_third;
        spacing   = fabs((double)(near_one-ONE));
    } while (spacing == ZERO);

    return(spacing*fabs((double)x));
}
 
/*----------------------*/ 
/*
 * dmxpy_add_columns: for every row i < n1, add to y[i] the contributions
 * x[k]*m[ldm*k+i] of the "count" consecutive columns starting at "first",
 * in ascending column order.
 */
static void dmxpy_add_columns (int n1, REAL y[], int ldm, REAL x[], REAL m[],
                               int first, int count)
{
    int i, k;

    for (i = 0; i < n1; i++) {
        REAL sum = y[i];

        for (k = first; k < first + count; k++)
            sum = sum + x[k]*m[ldm*k+i];

        y[i] = sum;
    }
}

/*
 * dmxpy: multiply matrix m times vector x and add the result to vector y.
 *
 * C does not let us declare m[][ldm] here; the entry in row i of column j
 * is written m[ldm*j+i].
 *
 *    n1   number of elements in y, and number of rows in m
 *    y    REAL [n1], receives y + m*x
 *    n2   number of elements in x, and number of columns in m
 *    ldm  leading dimension of m (stride between consecutive columns)
 *    x    REAL [n2]
 *    m    REAL matrix of n1 rows and n2 columns
 *
 * Columns are consumed in ascending order in groups: first the leftover
 * groups of 1, 2, 4 and 8 columns implied by n2 modulo 16, then the rest
 * sixteen at a time.
 */
void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[])
{
    int width, rem, last;

    /* cleanup groups of one, two, four and eight columns */
    for (width = 1; width < 16; width = width * 2) {
        rem = n2 % (2*width);
        if (rem >= width)
            dmxpy_add_columns(n1, y, ldm, x, m, rem - width, width);
    }

    /* main loop - groups of sixteen columns, "last" being the final one */
    for (last = (n2%16) + 15; last < n2; last = last + 16)
        dmxpy_add_columns(n1, y, ldm, x, m, last - 15, 16);
}

/*****************************************************/
/* Various timer routines.                           */
/* Al Aburto, aburto@nosc.mil, 18 Feb 1997           */
/*                                                   */
/* t = dtime() outputs the current time in seconds.  */
/* Use CAUTION as some of these routines will mess   */
/* up when timing across the hour mark!!!            */
/*                                                   */
/* For timing I use the 'user' time whenever         */
/* possible. Using 'user+sys' time is a separate     */
/* issue.                                            */
/*                                                   */
/* Example Usage:                                    */
/* [timer options added here]                        */
/* main()                                            */
/* {                                                 */
/*  double starttime,benchtime,dtime();              */
/*                                                   */
/*  starttime = dtime();                             */ 
/*  [routine to time]                                */
/*  benchtime = dtime() - starttime;                 */
/* }                                                 */
/*                                                   */
/* [timer code below added here]                     */
/*****************************************************/

/*********************************/
/* Timer code.                   */
/*********************************/
/*******************/
/*  Amiga dtime()  */
/*******************/
#ifdef Amiga
#include <ctype.h>
#define HZ 50

REAL dtime()
{
 REAL q;

 struct tt
       {
	long  days;
	long  minutes;
	long  ticks;
       } tt;

 DateStamp(&tt);

 q = ((REAL)(tt.ticks + (tt.minutes * 60L * 50L))) / (REAL)HZ;

 return q;
}
#endif

/*****************************************************/
/*  UNIX dtime(). This is the preferred UNIX timer.  */
/*  Provided by: Markku Kolkka, mk59200@cc.tut.fi    */
/*  HP-UX Addition by: Bo Thide', bt@irfu.se         */
/*****************************************************/
#ifdef UNIX
#include <sys/time.h>
#include <sys/resource.h>

#ifdef hpux
#include <sys/syscall.h>
#define getrusage(a,b) syscall(SYS_getrusage,a,b)
#endif

/* Filled in by every dtime() call. */
struct rusage rusage;

/*
 * UNIX timer: returns the user CPU time consumed so far by this process,
 * in seconds, as reported by getrusage(RUSAGE_SELF).
 */
REAL dtime()
{
 REAL whole, micros;

 getrusage(RUSAGE_SELF,&rusage);

 whole  = (REAL)(rusage.ru_utime.tv_sec);
 micros = (REAL)(rusage.ru_utime.tv_usec);

 /* the sum is formed in double (1.0e-06 is a double) and narrowed on return */
 return whole + micros * 1.0e-06;
}
#endif

/***************************************************/
/*  UNIX_Old dtime(). This is the old UNIX timer.  */
/*  Use only if absolutely necessary as HZ may be  */
/*  ill defined on your system.                    */
/***************************************************/
#ifdef UNIX_Old
#include <sys/types.h>
#include <sys/times.h>
#include <sys/param.h>

#ifndef HZ
#define HZ 60
#endif

/* Filled in by every dtime() call. */
struct tms tms;

/*
 * Old-style UNIX timer: returns the tms_utime field reported by times(),
 * divided by HZ.
 */
REAL dtime()
{
 times(&tms);

 return (REAL)(tms.tms_utime) / (REAL)HZ;
}
#endif

/*********************************************************/
/*  VMS dtime() for VMS systems.                         */
/*  Provided by: RAMO@uvphys.phys.UVic.CA                */
/*  Some people have run into problems with this timer.  */
/*********************************************************/
#ifdef VMS
#include time

#ifndef HZ
#define HZ 100
#endif

struct tbuffer_t
       {
	int proc_user_time;
	int proc_system_time;
	int child_user_time;
	int child_system_time;
       };
struct tbuffer_t tms;

/*
 * VMS timer: returns the proc_user_time field filled in by times(),
 * divided by HZ to give seconds.  The file-scope 'tms' buffer is
 * refreshed on every call.
 */
REAL dtime()
{
 times(&tms);

 return (REAL)(tms.proc_user_time) / (REAL)HZ;
}
#endif

/******************************/
/*  BORLAND C dtime() for DOS */
/******************************/
#ifdef BORLAND_C
#include <ctype.h>
#include <dos.h>
#include <time.h>

#define HZ 100
struct time tnow;

/*
 * Borland C (DOS) timer: returns the time of day in seconds, taken
 * from gettime().  The file-scope 'tnow' record is refreshed on every
 * call.
 *
 * The hour field is included so that the value only restarts at
 * midnight; without it the value restarted every hour and an interval
 * that spanned an hour boundary came out negative.  Callers should
 * only rely on differences between two calls.
 */
REAL dtime()
{
 REAL q;

 gettime(&tnow);

 q = 3600.0 * (REAL)(tnow.ti_hour);
 q = q + 60.0 * (REAL)(tnow.ti_min);
 q = q + (REAL)(tnow.ti_sec);
 q = q + (REAL)(tnow.ti_hund)/(REAL)HZ;   /* hundredths of a second */
	
 return q;
}
#endif

/***************************************/
/*  Microsoft C (MSC) dtime() for DOS  */
/*  Also suitable for Watcom C/C++ and */
/*  some other PC compilers            */
/***************************************/
#ifdef MSC
#include <time.h>
#include <ctype.h>

#define HZ CLOCKS_PER_SEC
clock_t tnow;

/*
 * Microsoft C timer: stores clock() in the file-scope 'tnow' and
 * returns it divided by HZ (CLOCKS_PER_SEC), i.e. in seconds.
 */
REAL dtime()
{
 tnow = clock();

 return (REAL)tnow / (REAL)HZ;
}
#endif

/*************************************/
/*  Macintosh (MAC) Think C dtime()  */
/*************************************/
#ifdef MAC
#include <time.h>

#define HZ 60

REAL dtime()
{
 REAL q;

 q = (REAL)clock() / (REAL)HZ;
	
 return q;
}
#endif

/************************************************************/
/*  iPSC/860 (IPSC) dtime() for i860.                       */
/*  Provided by: Dan Yergeau, yergeau@gloworm.Stanford.EDU  */
/************************************************************/
#ifdef IPSC
extern double dclock();

REAL dtime()
{
 REAL q;

 q = dclock();
	
 return q;
}
#endif

/**************************************************/
/*  FORTRAN dtime() for Cray type systems.        */
/*  This is the preferred timer for Cray systems. */
/**************************************************/
#ifdef FORTRAN_SEC

fortran double second();

/*
 * Cray timer: calls the FORTRAN routine second(), passing the address
 * of a REAL for it to fill in, and returns that value.
 *
 * NOTE(review): second() is declared above as returning double, but
 * its return value is not used here; only the by-reference argument is
 * read back - confirm this matches the routine on the target system.
 */
REAL dtime()
{
 REAL seconds_used;

 second(&seconds_used);

 return seconds_used;
}
#endif

/***********************************************************/
/*  UNICOS C dtime() for Cray UNICOS systems.  Don't use   */
/*  unless absolutely necessary as returned time includes  */
/*  'user+system' time.  Provided by: R. Mike Dority,      */
/*  dority@craysea.cray.com                                */
/***********************************************************/
#ifdef CTimer
#include <time.h>

REAL dtime()
{
 REAL q;
 clock_t   clock(void);

 q = (REAL)clock() / (REAL)CLOCKS_PER_SEC;

 return q;
}
#endif

/********************************************/
/* Another UNIX timer using gettimeofday(). */
/* However, getrusage() is preferred.       */
/********************************************/
#ifdef GTODay
#include <sys/time.h>

struct timeval tnow;

REAL dtime()
{
 REAL q;

 gettimeofday(&tnow,NULL);
 q = (REAL)tnow.tv_sec + (REAL)tnow.tv_usec * 1.0e-6;

 return q;
}
#endif

/*****************************************************/
/*  Fujitsu UXP/M timer.                             */
/*  Provided by: Mathew Lim, ANUSF, M.Lim@anu.edu.au */
/*****************************************************/
#ifdef UXPM
#include <sys/types.h>
#include <sys/timesu.h>
struct tmsu rusage;

/*
 * Fujitsu UXP/M timer: returns tms_utime from timesu(), scaled by
 * 1.0e-06.  The file-scope 'rusage' record is refreshed on every call.
 */
REAL dtime()
{
 timesu(&rusage);

 return (REAL)((REAL)(rusage.tms_utime) * 1.0e-06);
}
#endif

/**********************************************/
/*    Macintosh (MAC_TMgr) Think C dtime()    */
/*   requires Think C Language Extensions or  */
/*    #include <MacHeaders> in the prefix     */
/*  provided by Francis H Schiffer 3rd (fhs)  */
/*         skipschiffer@genie.geis.com        */
/**********************************************/
#ifdef MAC_TMgr
#include <Timer.h>
#include <stdlib.h>

static TMTask   mgrTimer;
static Boolean  mgrInited = false;
static REAL     mgrClock;

#define RMV_TIMER RmvTime( (QElemPtr)&mgrTimer )
#define MAX_TIME  1800000000L
/* MAX_TIME limits time between calls to */
/* dtime( ) to no more than 30 minutes   */
/* this limitation could be removed by   */
/* creating a completion routine to sum  */
/* 30 minute segments (fhs 1994 feb 9)   */

static void Remove_timer( )
{
 RMV_TIMER;
 mgrInited = false;
}

/*
 * Macintosh Time Manager timer.  Returns mgrClock, a running total in
 * seconds that is advanced on every call after the first.
 *
 * First call: registers Remove_timer with _atexit(), resets mgrClock
 * to 0.0, and (only if the registration returned 0) starts the timer.
 * Later calls: remove the running timer, add the interval derived from
 * its tmCount to mgrClock, then start a fresh timer.
 *
 * If the _atexit() registration fails, mgrInited stays false, so every
 * call takes the first-call path again and returns 0.0.
 */
REAL dtime( )
{
 if( mgrInited ) {
	/* Stop the timer so that tmCount can be read back.              */
	RMV_TIMER;
	/* NOTE(review): presumably tmCount now holds the negated        */
	/* unexpired time in microseconds, so MAX_TIME + tmCount is the  */
	/* elapsed time - confirm against the Time Manager docs.         */
	mgrClock += (MAX_TIME + mgrTimer.tmCount)*1.0e-6;
 } else {
	/* Only mark the timer usable if the exit handler was installed. */
	if( _atexit( &Remove_timer ) == 0 ) mgrInited = true;
	mgrClock = 0.0;
}
	if( mgrInited ) {
		/* Clear the task record and re-arm it for MAX_TIME.         */
		mgrTimer.tmAddr = NULL;
		mgrTimer.tmCount = 0;
		mgrTimer.tmWakeUp = 0;
		mgrTimer.tmReserved = 0;
		InsTime( (QElemPtr)&mgrTimer );
		/* NOTE(review): the negative count presumably selects       */
		/* microsecond units - confirm.                              */
		PrimeTime( (QElemPtr)&mgrTimer, -MAX_TIME );
	}
	return( mgrClock );
}
#endif

/***********************************************************/
/*  Parsytec GCel timer.                                   */
/*  Provided by: Georg Wambach, gw@informatik.uni-koeln.de */
/***********************************************************/
#ifdef PARIX
#include <sys/time.h>

/*
 * Parsytec GCel timer: returns TimeNowHigh() divided by CLK_TCK_HIGH.
 */
REAL dtime()
{
 REAL ticks;

 ticks = (REAL) (TimeNowHigh());

 return ticks / (REAL) CLK_TCK_HIGH;
}
#endif

/************************************************/
/*  Sun Solaris POSIX dtime() routine           */
/*  Provided by: Case Larsen, CTLarsen.lbl.gov  */
/************************************************/
#ifdef POSIX
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/rusage.h>

#ifdef __hpux
#include <sys/syscall.h>
#endif

struct rusage rusage;

/*
 * Solaris POSIX timer: returns the ru_utime value reported by
 * getrusage(RUSAGE_SELF), expressed in seconds.  Here ru_utime carries
 * a nanosecond field (tv_nsec).  The file-scope 'rusage' record is
 * refreshed on every call.
 */
REAL dtime()
{
 getrusage(RUSAGE_SELF,&rusage);

 /* whole seconds plus the nanosecond part scaled to seconds */
 return (REAL)((REAL)(rusage.ru_utime.tv_sec)
             + (REAL)(rusage.ru_utime.tv_nsec) * 1.0e-09);
}
#endif


/****************************************************/
/*  Windows NT (32 bit) dtime() routine             */
/*  Provided by: Piers Haken, piersh@microsoft.com  */
/****************************************************/
#ifdef WIN32
#include <windows.h>

REAL dtime(void)
{
 REAL q;

 q = (REAL)GetTickCount() * 1.0e-03;
	
 return q;
}
#endif

/*****************************************************/
/* Time according to POSIX.1  -  <J.Pelan@qub.ac.uk> */
/* Ref: "POSIX Programmer's Guide"  O'Reilly & Assoc.*/
/*****************************************************/
#ifdef POSIX1
#define _POSIX_SOURCE 1
#include <unistd.h>
#include <limits.h>
#include <sys/times.h>

struct tms tms;

REAL dtime()
{
 REAL q;
 times(&tms);
 q = (REAL)tms.tms_utime / (REAL)CLK_TCK;
 return q;
}
#endif
/* lp.Sc - Compile with Scc -c -k -mK6-2 -O5 */
/*
 * SWAR kernel: adds s times x into y, where x and y are declared as
 * float:[VECTSIZE] vectors (Scc/SWARC syntax, not standard C).
 *
 * NOTE(review): presumably this is the vector form of the saxpy step
 * (y = y + s*x) over VECTSIZE elements - confirm against the Scc
 * language rules and the caller.
 */
void swar_saxpy(float:[VECTSIZE] x, float:[VECTSIZE] y, float s)
{
	/* NOTE(review): 'ss' is declared static with no type and is     */
	/* initialised from the parameter; how Scc treats this (type,    */
	/* and whether it is re-initialised per call) is not visible     */
	/* here - confirm.                                               */
	static ss = s;

	y += (ss * x);
}

/*
 * SWAR kernel: adds the product of the float:[VECTSIZE] vectors x and
 * y into s (Scc/SWARC syntax, not standard C).
 *
 * NOTE(review): s is a by-value parameter and the function returns
 * void, so under plain C rules the accumulated value would not reach
 * the caller; how Scc handles the vector-to-scalar assignment and the
 * result is not visible here - confirm.
 */
void swar_sdot(float:[VECTSIZE] x, float:[VECTSIZE] y, float s)
{
	s += (x * y);
}

/*
 * SWAR kernel: multiplies the float:[VECTSIZE] vector x by the scalar
 * s and assigns the result back to x (Scc/SWARC syntax, not standard C).
 *
 * NOTE(review): presumably the caller sees the scaled vector through
 * x - confirm how Scc passes float:[VECTSIZE] parameters.
 */
void swar_sscal(float:[VECTSIZE] x, float s)
{
	x = x * s;
}

