perf/ffbench.c

   1 // This small program computes a Fast Fourier Transform.  It tests
   2 // Valgrind's handling of FP operations.  It is representative of all
   3 // programs that do a lot of FP operations.
   4
   5 // Licensing: This program is closely based on the one of the same name from
   6 // http://www.fourmilab.ch/.  The front page of that site says:
   7 //
   8 //   "Except for a few clearly-marked exceptions, all the material on this
   9 //   site is in the public domain and may be used in any manner without
  10 //   permission, restriction, attribution, or compensation."
  11
  12 /*
  13
  14         Two-dimensional FFT benchmark
  15
  16         Designed and implemented by John Walker in April of 1989.
  17
  18         This  benchmark executes a specified number of passes (default
  19         20) through a loop in which each  iteration  performs  a  fast
  20         Fourier transform of a square matrix (default size 256x256) of
  21         complex numbers (default precision double),  followed  by  the
  22         inverse  transform.   After  all loop iterations are performed
  23         the results are checked against known correct values.
  24
  25         This benchmark is intended for use on C implementations  which
  26         define  "int"  as  32 bits or longer and permit allocation and
  27         direct addressing of arrays larger than one megabyte.
  28
  29         If CAPOUT is defined,  the  result  after  all  iterations  is
  30         written  as  a  CA  Lab  pattern  file.   This is intended for
  31         debugging in case horribly wrong results  are  obtained  on  a
  32         given machine.
  33
  34         Archival  timings  are  run  with the definitions below set as
  35         follows: Float = double, Asize = 256, Passes = 20, CAPOUT  not
  36         defined.
  37
  38         Time (seconds)              System
  39
  40             2393.93       Sun 3/260, SunOS 3.4, C, "-f68881 -O".
  41                           (John Walker).
  42
  43             1928          Macintosh IIx, MPW C 3.0, "-mc68020
  44                           -mc68881 -elems881 -m".  (Hugh Hoover).
  45
  46             1636.1        Sun 4/110, "cc -O3 -lm".  (Michael McClary).
  47                           The suspicion is that this is software
  48                           floating point.
  49
  50             1556.7        Macintosh II, A/UX, "cc -O -lm"
  51                           (Michael McClary).
  52
  53             1388.8        Sun 386i/250, SunOS 4.0.1 C
  54                           "-O /usr/lib/trig.il".  (James Carrington).
  55
  56             1331.93       Sun 3/60, SunOS 4.0.1, C,
  57                           "-O4 -f68881 /usr/lib/libm.il"
  58                           (Bob Elman).
  59
  60             1204.0        Apollo Domain DN4000, C, "-cpu 3000 -opt 4".
  61                           (Sam Crupi).
  62
  63             1174.66       Compaq 386/25, SCO Xenix 386 C.
  64                           (Peter Shieh).
  65
  66             1068          Compaq 386/25, SCO Xenix 386,
  67                           Metaware High C.  (Robert Wenig).
  68
  69             1064.0        Sun 3/80, SunOS 4.0.3 Beta C
  70                           "-O3 -f68881 /usr/lib/libm.il".  (James Carrington).
  71
  72             1061.4        Compaq 386/25, SCO Xenix, High C 1.4.
  73                           (James Carrington).
  74
  75             1059.79       Compaq 386/25, 387/25, High C 1.4,
  76                           DOS|Extender 2.2, 387 inline code
  77                           generation.  (Nathan Bender).
  78
  79              777.14       Compaq 386/25, IIT 3C87-25 (387 Compatible),
  80                           High C 1.5, DOS|Extender 2.2, 387 inline
  81                           code generation.  (Nathan Bender).
  82
  83              751          Compaq DeskPro 386/33, High C 1.5 + DOS|Extender,
  84                           387 code generation.  (James Carrington).
  85
  86              431.44       Compaq 386/25, Weitek 3167-25, DOS 3.31,
  87                           High C 1.4, DOS|Extender, Weitek code generation.
  88                           (Nathan Bender).
  89
  90              344.9        Compaq 486/25, Metaware High C 1.6, Phar Lap
  91                           DOS|Extender, in-line floating point.  (Nathan
  92                           Bender).
  93
  94              324.2        Data General Motorola 88000, 16 Mhz, Gnu C.
  95
  96              323.1        Sun 4/280, C, "-O4".  (Eric Hill).
  97
  98              254          Compaq SystemPro 486/33, High C 1.5 + DOS|Extender,
  99                           387 code generation.  (James Carrington).
 100
 101              242.8        Silicon Graphics Personal IRIS, MIPS R2000A,
 102                           12.5 Mhz, "-O3" (highest level optimisation).
 103                           (Mike Zentner).
 104
 105              233.0        Sun SPARCStation 1, C, "-O4", SunOS 4.0.3.
 106                           (Nathan Bender).
 107
 108              187.30       DEC PMAX 3100, MIPS 2000 chip.
 109                           (Robert Wenig).
 110
 111              120.46       Sun SparcStation 2, C, "-O4", SunOS 4.1.1.
 112                           (John Walker).
 113
 114              120.21       DEC 3MAX, MIPS 3000, "-O4".
 115
 116               98.0        Intel i860 experimental environment,
 117                           OS/2, data caching disabled.  (Kern
 118                           Sibbald).
 119
 120               34.9        Silicon Graphics Indigo², MIPS R4400,
 121                           175 Mhz, IRIX 5.2, "-O".
 122
 123               32.4        Pentium 133, Windows NT, Microsoft Visual
 124                           C++ 4.0.
 125
 126               17.25       Silicon Graphics Indigo², MIPS R4400,
 127                           175 Mhz, IRIX 6.5, "-O3".
 128
 129               14.10       Dell Dimension XPS R100, Pentium II 400 MHz,
 130                           Windows 98, Microsoft Visual C 5.0.
 131
 132               10.7        Hewlett-Packard Kayak XU 450Mhz Pentium II,
 133                           Microsoft Visual C++ 6.0, Windows NT 4.0sp3.  (Nathan Bender).
 134
 135                5.09       Sun Ultra 2, UltraSPARC V9, 300 MHz, gcc -O3.
 136
 137                0.846      Dell Inspiron 9100, Pentium 4, 3.4 GHz, gcc -O3.
 138
 139 */
 140
 141 #include <stdio.h>
 142 #include <stdlib.h>
 143 #include <math.h>
 144 #include <string.h>
 145
 146 /*  The  program  may  be  run  with  Float defined as either float or
 147     double.  With IEEE arithmetic, the same answers are generated  for
 148     either floating point mode.  */
 149
 150 #define Float    double            /* Floating point type used in FFT */
 151
 152 #define Asize    256               /* Array edge size */
 153 #define Passes   20                /* Number of FFT/Inverse passes */
 154
 155 #define max(a,b) ((a)>(b)?(a):(b))
 156 #define min(a,b) ((a)<=(b)?(a):(b))
 157
 158 /*
 159
 160         Multi-dimensional fast Fourier transform
 161
 162         Adapted from Press et al., "Numerical Recipes in C".
 163
 164 */
 165
 166 #define SWAP(a,b) tempr=(a); (a)=(b); (b)=tempr
 167
 168 static void fourn(Float data[], int nn[], int ndim, int isign)
 169 {
 170         register int i1, i2, i3;
 171         int i2rev, i3rev, ip1, ip2, ip3, ifp1, ifp2;
 172         int ibit, idim, k1, k2, n, nprev, nrem, ntot;
 173         Float tempi, tempr;
 174         double theta, wi, wpi, wpr, wr, wtemp;
 175
 176         ntot = 1;
 177         for (idim = 1; idim <= ndim; idim++)
 178            ntot *= nn[idim];
 179         nprev = 1;
 180         for (idim = ndim; idim >= 1; idim--) {
 181            n = nn[idim];
 182            nrem = ntot / (n * nprev);
 183            ip1 = nprev << 1;
 184            ip2 = ip1 * n;
 185            ip3 = ip2 * nrem;
 186            i2rev = 1;
 187            for (i2 = 1; i2 <= ip2; i2 += ip1) {
 188               if (i2 < i2rev) {
 189                  for (i1 = i2; i1 <= i2 + ip1 - 2; i1 += 2) {
 190                     for (i3 = i1; i3 <= ip3; i3 += ip2) {
 191                        i3rev = i2rev + i3 - i2;
 192                        SWAP(data[i3], data[i3rev]);
 193                        SWAP(data[i3 + 1], data[i3rev + 1]);
 194                     }
 195                  }
 196               }
 197               ibit = ip2 >> 1;
 198               while (ibit >= ip1 && i2rev > ibit) {
 199                  i2rev -= ibit;
 200                  ibit >>= 1;
 201               }
 202               i2rev += ibit;
 203            }
 204            ifp1 = ip1;
 205            while (ifp1 < ip2) {
 206               ifp2 = ifp1 << 1;
 207               theta = isign * 6.28318530717959 / (ifp2 / ip1);
 208               wtemp = sin(0.5 * theta);
 209               wpr = -2.0 * wtemp * wtemp;
 210               wpi = sin(theta);
 211               wr = 1.0;
 212               wi = 0.0;
 213               for (i3 = 1; i3 <= ifp1; i3 += ip1) {
 214                  for (i1 = i3; i1 <= i3 + ip1 - 2; i1 += 2) {
 215                     for (i2 = i1; i2 <= ip3; i2 += ifp2) {
 216                        k1 = i2;
 217                        k2 = k1 + ifp1;
 218                        tempr = wr * data[k2] - wi * data[k2 + 1];
 219                        tempi = wr * data[k2 + 1] + wi * data[k2];
 220                        data[k2] = data[k1] - tempr;
 221                        data[k2 + 1] = data[k1 + 1] - tempi;
 222                        data[k1] += tempr;
 223                        data[k1 + 1] += tempi;
 224                     }
 225                  }
 226                  wr = (wtemp = wr) * wpr - wi * wpi + wr;
 227                  wi = wi * wpr + wtemp * wpi + wi;
 228               }
 229               ifp1 = ifp2;
 230            }
 231            nprev *= n;
 232         }
 233 }
 234 #undef SWAP
 235
 236 int main()
 237 {
 238         int i, j, k, l, m, npasses = Passes, faedge;
 239         Float *fdata /* , *fd */ ;
 240         static int nsize[] = {0, 0, 0};
 241         long fanum, fasize;
 242         double mapbase, mapscale, /* x, */ rmin, rmax, imin, imax;
 243
 244         faedge = Asize;            /* FFT array edge size */
 245         fanum = faedge * faedge;   /* Elements in FFT array */
 246         fasize = ((fanum + 1) * 2 * sizeof(Float)); /* FFT array size */
 247         nsize[1] = nsize[2] = faedge;
 248
 249         fdata = (Float *) malloc(fasize);
 250         if (fdata == NULL) {
 251            fprintf(stdout, "Can't allocate data array.\n");
 252            exit(1);
 253         }
 254
 255         /*  Generate data array to process.  */
 256
 257 #define Re(x,y) fdata[1 + (faedge * (x) + (y)) * 2]
 258 #define Im(x,y) fdata[2 + (faedge * (x) + (y)) * 2]
 259
 260         memset(fdata, 0, fasize);
 261         for (i = 0; i < faedge; i++) {
 262            for (j = 0; j < faedge; j++) {
 263               if (((i & 15) == 8) || ((j & 15) == 8))
 264                  Re(i, j) = 128.0;
 265            }
 266         }
 267
 268         for (i = 0; i < npasses; i++) {
 269 /*printf("Pass %d\n", i);*/
 270            /* Transform image to frequency domain. */
 271            fourn(fdata, nsize, 2, 1);
 272
 273            /*  Back-transform to image. */
 274            fourn(fdata, nsize, 2, -1);
 275         }
 276
 277         {
 278            double r, ij, ar, ai;
 279            rmin = 1e10; rmax = -1e10;
 280            imin = 1e10; imax = -1e10;
 281            ar = 0;
 282            ai = 0;
 283
 284            for (i = 1; i <= fanum; i += 2) {
 285               r = fdata[i];
 286               ij = fdata[i + 1];
 287               ar += r;
 288               ai += ij;
 289               rmin = min(r, rmin);
 290               rmax = max(r, rmax);
 291               imin = min(ij, imin);
 292               imax = max(ij, imax);
 293            }
 294 #ifdef DEBUG
 295            printf("Real min %.4g, max %.4g.  Imaginary min %.4g, max %.4g.\n",
 296               rmin, rmax, imin, imax);
 297            printf("Average real %.4g, imaginary %.4g.\n",
 298               ar / fanum, ai / fanum);
 299 #endif
 300            mapbase = rmin;
 301            mapscale = 255 / (rmax - rmin);
 302         }
 303
 304         /* See if we got the right answers. */
 305
 306         m = 0;
 307         for (i = 0; i < faedge; i++) {
 308            for (j = 0; j < faedge; j++) {
 309               k = (Re(i, j) - mapbase) * mapscale;
 310               l = (((i & 15) == 8) || ((j & 15) == 8)) ? 255 : 0;
 311               if (k != l) {
 312                  m++;
 313                  fprintf(stdout,
 314                     "Wrong answer at (%d,%d)!  Expected %d, got %d.\n",
 315                     i, j, l, k);
 316               }
 317            }
 318         }
 319         if (m == 0) {
 320            fprintf(stdout, "%d passes.  No errors in results.\n", npasses);
 321         } else {
 322            fprintf(stdout, "%d passes.  %d errors in results.\n",
 323               npasses, m);
 324         }
 325
 326 #ifdef CAPOUT
 327
 328         /* Output the result of the transform as a CA Lab pattern
 329            file for debugging. */
 330
 331         {
 332 #define SCRX  322
 333 #define SCRY  200
 334 #define SCRN  (SCRX * SCRY)
 335            unsigned char patarr[SCRY][SCRX];
 336            FILE *fp;
 337
 338 /*  Map user external state numbers to internal state index  */
 339
 340 #define UtoI(x)     (((((x) >> 1) & 0x7F) | ((x) << 7)) & 0xFF)
 341
 342            /* Copy data from FFT buffer to map. */
 343
 344            memset(patarr, 0, sizeof patarr);
 345            l = (SCRX - faedge) / 2;
 346            m = (faedge > SCRY) ? 0 : ((SCRY - faedge) / 2);
 347            for (i = 1; i < faedge; i++) {
 348               for (j = 0; j < min(SCRY, faedge); j++) {
 349                  k = (Re(i, j) - mapbase) * mapscale;
 350                  patarr[j + m][i + l] = UtoI(k);
 351               }
 352            }
 353
 354            /* Dump pattern map to file. */
 355
 356            fp = fopen("fft.cap", "w");
 357            if (fp == NULL) {
 358               fprintf(stdout, "Cannot open output file.\n");
 359               exit(0);
 360            }
 361            putc(':', fp);
 362            putc(1, fp);
 363            fwrite(patarr, SCRN, 1, fp);
 364            putc(6, fp);
 365            fclose(fp);
 366         }
 367 #endif
 368
 369         return 0;
 370 }