/*

				 INTRODUCTION

This is a simple attempt to test raw FPU speed of x86 machines under Linux.

If you have comments, please send Email to:
    mann@cs.toronto.edu (Richard Mann)

The tests are based on repeated calls of the simple operation, DAXPY (see
routine below) that computes Y=Y+aX, where X and Y are vectors and a is a
scalar.  For the tests here, one million computations were performed, each
using a vector of length 128.  This code is adapted from the LINPACK
benchmark.

Optimizations were performed to attempt to take advantage of the floating
point pipelines (if possible).  First I tried a simple unrolling of loops
(UNROLL4 and UNROLL8 below), but this was not too successful.  Next I tried a
more explicit partition (PIPE4 and PIPE8 below) which separated the multiplies
from the adds and stores to reduce dependencies.  I verified this code by
looking at the assembler output (from GCC).

The speed is reported in MegaMACs/s (millions of multiply/accumulates per
second).


				 COMPILATION

To compile (linux):
        gcc -DSP -O fputest.c
        gcc -DSP -DUNROLL4 -O fputest.c
        gcc -DSP -DUNROLL8 -O fputest.c
        gcc -DSP -DPIPE4 -O fputest.c
        gcc -DSP -DPIPE8 -O fputest.c
        gcc -DDP -O fputest.c
        gcc -DDP -DUNROLL4 -O fputest.c
        gcc -DDP -DUNROLL8 -O fputest.c
        gcc -DDP -DPIPE4 -O fputest.c
        gcc -DDP -DPIPE8 -O fputest.c
Then run the file "a.out".

To get assembler code (for diagnosing the pipelining, etc):
	gcc -S -DSP -O fputest.c
	... etc ...

This code was compiled with gcc 2.7.2.


REVISION HISTORY

18 Jul 97 Original posting to comp.sys.ibm.ps.hardware.chips
22 Jul 97 Added statements to print compilation options
          (from "David Parsons" <orc@pell.chi.il.us>)
29 Jul 97 Changed printout of "MFLOPS" to "MegaMAC/s" (millions of multiply
          accumulate cycles/second) to avoid confusion with other
          benchmarks.  The test remains the same.

*/

#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#include <sys/resource.h>

#ifdef DP
#define REAL double
#elif SP
#define REAL float
#endif


void daxpy(int n, REAL da, REAL* dx, REAL* dy)
{
 int i,m;
 REAL t0, t1, t2, t3, t4, t5, t6, t7;

#ifdef UNROLL4
 m = n % 4;
 if ( m != 0) {
  for (i = 0; i < m; i++) 
  dy[i] = dy[i] + da*dx[i];
  if (n < 4) return;
 }
 for (i = m; i < n; i = i + 4) {
  dy[i] = dy[i] + da*dx[i];
  dy[i+1] = dy[i+1] + da*dx[i+1];
  dy[i+2] = dy[i+2] + da*dx[i+2];
  dy[i+3] = dy[i+3] + da*dx[i+3];
 }
#elif UNROLL8
 m = n % 8;
 if ( m != 0) {
  for (i = 0; i < m; i++) 
  dy[i] = dy[i] + da*dx[i];
  if (n < 8) return;
 }
 for (i = m; i < n; i = i + 8) {
  dy[i] = dy[i] + da*dx[i];
  dy[i+1] = dy[i+1] + da*dx[i+1];
  dy[i+2] = dy[i+2] + da*dx[i+2];
  dy[i+3] = dy[i+3] + da*dx[i+3];
  dy[i+4] = dy[i+4] + da*dx[i+4];
  dy[i+5] = dy[i+5] + da*dx[i+5];
  dy[i+6] = dy[i+6] + da*dx[i+6];
  dy[i+7] = dy[i+7] + da*dx[i+7];
 }
#elif PIPE4
 m = n % 4;
 if ( m != 0) {
  for (i = 0; i < m; i++) 
  dy[i] = dy[i] + da*dx[i];
  if (n < 4) return;
 }
 for (i = m; i < n; i = i + 4) {
  t0 = da*dx[i];
  t1 = da*dx[i+1];
  t2 = da*dx[i+2];
  t3 = da*dx[i+3];
  t0 = dy[i] + t0;
  t1 = dy[i+1] + t1;
  t2 = dy[i+2] + t2;
  t3 = dy[i+3] + t3;
  dy[i] = t0;
  dy[i+1] = t1;
  dy[i+2] = t2;
  dy[i+3] = t3;
 }
#elif PIPE8
 m = n % 8;
 if ( m != 0) {
  for (i = 0; i < m; i++) 
  dy[i] = dy[i] + da*dx[i];
  if (n < 8) return;
 }
 for (i = m; i < n; i = i + 8) {
  t0 = da*dx[i];
  t1 = da*dx[i+1];
  t2 = da*dx[i+2];
  t3 = da*dx[i+3];
  t4 = da*dx[i+4];
  t5 = da*dx[i+5];
  t6 = da*dx[i+6];
  t7 = da*dx[i+7];
  t0 = dy[i] + t0;
  t1 = dy[i+1] + t1;
  t2 = dy[i+2] + t2;
  t3 = dy[i+3] + t3;
  t4 = dy[i+4] + t4;
  t5 = dy[i+5] + t5;
  t6 = dy[i+6] + t6;
  t7 = dy[i+7] + t7;
  dy[i] = t0;
  dy[i+1] = t1;
  dy[i+2] = t2;
  dy[i+3] = t3;
  dy[i+4] = t4;
  dy[i+5] = t5;
  dy[i+6] = t6;
  dy[i+7] = t7;
 }
#else
 for (i = 0;i < n; i++) {
  dy[i] += da*dx[i];
 }
#endif
}

float second()
{
 struct rusage ru;
 float t;
 getrusage(RUSAGE_SELF,&ru) ;
 t = (float) (ru.ru_utime.tv_sec+ru.ru_stime.tv_sec) + 
 ((float) (ru.ru_utime.tv_usec+ru.ru_stime.tv_usec))/1.0e6 ;
 return t ;
}

int main()
{
#define NREPS 1000000
#define NDIMS 128
 float t1, t2;
 REAL x[NDIMS], y[NDIMS];
 REAL a=0.25; /* arbitrary small nonzero number */
 int i;

#if 0
    printf("fputest...\n");
#else
    printf("fputest");
#ifdef SP
    printf(" SP");
#endif
#ifdef DP
    printf(" DP");
#endif
#ifdef UNROLL4
    printf(" UNROLL4");
#endif
#ifdef UNROLL8
    printf(" UNROLL8");
#endif
#ifdef PIPE4
    printf(" PIPE4");
#endif
#ifdef PIPE8
    printf(" PIPE8");
#endif
    putchar('\n');
#endif

 for (i=0; i<NDIMS; i++) {
  x[i]=2.0; /* arbitrary number */
  y[i]=0.0;
 }
 t1=second();
 for (i=0; i<NREPS; i++) {
  daxpy((int)NDIMS,a,x,y);
 }
 t2=second();
 printf("time: %.2f, MACs (multiply/accumulates): %d, ",
        (t2-t1), NREPS*NDIMS);
 printf("MegaMACs/sec: %.2f\n",
        ((float)NREPS*(float)NDIMS)/(t2-t1)/1e6);
 return(0);
}
