tests/fp/fp-bench.c

   1 /*
   2  * fp-bench.c - A collection of simple floating point microbenchmarks.
   3  *
   4  * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
   5  *
   6  * License: GNU GPL, version 2 or later.
   7  *   See the COPYING file in the top-level directory.
   8  */
   9 #ifndef HW_POISON_H
  10 #error Must define HW_POISON_H to work around TARGET_* poisoning
  11 #endif
  12
  13 #include "qemu/osdep.h"
  14 #include <math.h>
  15 #include <fenv.h>
  16 #include "qemu/timer.h"
  17 #include "qemu/int128.h"
  18 #include "fpu/softfloat.h"
  19
  20 /* amortize the computation of random inputs */
  21 #define OPS_PER_ITER     50000
  22
  23 #define MAX_OPERANDS 3
  24
  25 #define SEED_A 0xdeadfacedeadface
  26 #define SEED_B 0xbadc0feebadc0fee
  27 #define SEED_C 0xbeefdeadbeefdead
  28
  29 enum op {
  30     OP_ADD,
  31     OP_SUB,
  32     OP_MUL,
  33     OP_DIV,
  34     OP_FMA,
  35     OP_SQRT,
  36     OP_CMP,
  37     OP_MAX_NR,
  38 };
  39
  40 static const char * const op_names[] = {
  41     [OP_ADD] = "add",
  42     [OP_SUB] = "sub",
  43     [OP_MUL] = "mul",
  44     [OP_DIV] = "div",
  45     [OP_FMA] = "mulAdd",
  46     [OP_SQRT] = "sqrt",
  47     [OP_CMP] = "cmp",
  48     [OP_MAX_NR] = NULL,
  49 };
  50
  51 enum precision {
  52     PREC_SINGLE,
  53     PREC_DOUBLE,
  54     PREC_QUAD,
  55     PREC_FLOAT32,
  56     PREC_FLOAT64,
  57     PREC_FLOAT128,
  58     PREC_MAX_NR,
  59 };
  60
  61 enum rounding {
  62     ROUND_EVEN,
  63     ROUND_ZERO,
  64     ROUND_DOWN,
  65     ROUND_UP,
  66     ROUND_TIEAWAY,
  67     N_ROUND_MODES,
  68 };
  69
  70 static const char * const round_names[] = {
  71     [ROUND_EVEN] = "even",
  72     [ROUND_ZERO] = "zero",
  73     [ROUND_DOWN] = "down",
  74     [ROUND_UP] = "up",
  75     [ROUND_TIEAWAY] = "tieaway",
  76 };
  77
  78 enum tester {
  79     TESTER_SOFT,
  80     TESTER_HOST,
  81     TESTER_MAX_NR,
  82 };
  83
  84 static const char * const tester_names[] = {
  85     [TESTER_SOFT] = "soft",
  86     [TESTER_HOST] = "host",
  87     [TESTER_MAX_NR] = NULL,
  88 };
  89
  90 union fp {
  91     float f;
  92     double d;
  93     float32 f32;
  94     float64 f64;
  95     float128 f128;
  96     uint64_t u64;
  97 };
  98
  99 struct op_state;
 100
 101 typedef float (*float_func_t)(const struct op_state *s);
 102 typedef double (*double_func_t)(const struct op_state *s);
 103
 104 union fp_func {
 105     float_func_t float_func;
 106     double_func_t double_func;
 107 };
 108
 109 typedef void (*bench_func_t)(void);
 110
 111 struct op_desc {
 112     const char * const name;
 113 };
 114
 115 #define DEFAULT_DURATION_SECS 1
 116
 117 static uint64_t random_ops[MAX_OPERANDS] = {
 118     SEED_A, SEED_B, SEED_C,
 119 };
 120
 121 static float128 random_quad_ops[MAX_OPERANDS] = {
 122     {SEED_A, SEED_B}, {SEED_B, SEED_C}, {SEED_C, SEED_A},
 123 };
 124 static float_status soft_status;
 125 static enum precision precision;
 126 static enum op operation;
 127 static enum tester tester;
 128 static uint64_t n_completed_ops;
 129 static unsigned int duration = DEFAULT_DURATION_SECS;
 130 static int64_t ns_elapsed;
 131 /* disable optimizations with volatile */
 132 static volatile union fp res;
 133
 134 /*
 135  * From: https://en.wikipedia.org/wiki/Xorshift
 136  * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only
 137  * guaranteed to be >= INT_MAX).
 138  */
 139 static uint64_t xorshift64star(uint64_t x)
 140 {
 141     x ^= x >> 12; /* a */
 142     x ^= x << 25; /* b */
 143     x ^= x >> 27; /* c */
 144     return x * UINT64_C(2685821657736338717);
 145 }
 146
 147 static void update_random_ops(int n_ops, enum precision prec)
 148 {
 149     int i;
 150
 151     for (i = 0; i < n_ops; i++) {
 152
 153         switch (prec) {
 154         case PREC_SINGLE:
 155         case PREC_FLOAT32:
 156         {
 157             uint64_t r = random_ops[i];
 158             do {
 159                 r = xorshift64star(r);
 160             } while (!float32_is_normal(r));
 161             random_ops[i] = r;
 162             break;
 163         }
 164         case PREC_DOUBLE:
 165         case PREC_FLOAT64:
 166         {
 167             uint64_t r = random_ops[i];
 168             do {
 169                 r = xorshift64star(r);
 170             } while (!float64_is_normal(r));
 171             random_ops[i] = r;
 172             break;
 173         }
 174         case PREC_QUAD:
 175         case PREC_FLOAT128:
 176         {
 177             float128 r = random_quad_ops[i];
 178             uint64_t hi = r.high;
 179             uint64_t lo = r.low;
 180             do {
 181                 hi = xorshift64star(hi);
 182                 lo = xorshift64star(lo);
 183                 r = make_float128(hi, lo);
 184             } while (!float128_is_normal(r));
 185             random_quad_ops[i] = r;
 186             break;
 187         }
 188         default:
 189             g_assert_not_reached();
 190         }
 191     }
 192 }
 193
 194 static void fill_random(union fp *ops, int n_ops, enum precision prec,
 195                         bool no_neg)
 196 {
 197     int i;
 198
 199     for (i = 0; i < n_ops; i++) {
 200         switch (prec) {
 201         case PREC_SINGLE:
 202         case PREC_FLOAT32:
 203             ops[i].f32 = make_float32(random_ops[i]);
 204             if (no_neg && float32_is_neg(ops[i].f32)) {
 205                 ops[i].f32 = float32_chs(ops[i].f32);
 206             }
 207             break;
 208         case PREC_DOUBLE:
 209         case PREC_FLOAT64:
 210             ops[i].f64 = make_float64(random_ops[i]);
 211             if (no_neg && float64_is_neg(ops[i].f64)) {
 212                 ops[i].f64 = float64_chs(ops[i].f64);
 213             }
 214             break;
 215         case PREC_QUAD:
 216         case PREC_FLOAT128:
 217             ops[i].f128 = random_quad_ops[i];
 218             if (no_neg && float128_is_neg(ops[i].f128)) {
 219                 ops[i].f128 = float128_chs(ops[i].f128);
 220             }
 221             break;
 222         default:
 223             g_assert_not_reached();
 224         }
 225     }
 226 }
 227
 228 /*
 229  * The main benchmark function. Instead of (ab)using macros, we rely
 230  * on the compiler to unfold this at compile-time.
 231  */
 232 static void bench(enum precision prec, enum op op, int n_ops, bool no_neg)
 233 {
 234     int64_t tf = get_clock() + duration * 1000000000LL;
 235
 236     while (get_clock() < tf) {
 237         union fp ops[MAX_OPERANDS];
 238         int64_t t0;
 239         int i;
 240
 241         update_random_ops(n_ops, prec);
 242         switch (prec) {
 243         case PREC_SINGLE:
 244             fill_random(ops, n_ops, prec, no_neg);
 245             t0 = get_clock();
 246             for (i = 0; i < OPS_PER_ITER; i++) {
 247                 float a = ops[0].f;
 248                 float b = ops[1].f;
 249                 float c = ops[2].f;
 250
 251                 switch (op) {
 252                 case OP_ADD:
 253                     res.f = a + b;
 254                     break;
 255                 case OP_SUB:
 256                     res.f = a - b;
 257                     break;
 258                 case OP_MUL:
 259                     res.f = a * b;
 260                     break;
 261                 case OP_DIV:
 262                     res.f = a / b;
 263                     break;
 264                 case OP_FMA:
 265                     res.f = fmaf(a, b, c);
 266                     break;
 267                 case OP_SQRT:
 268                     res.f = sqrtf(a);
 269                     break;
 270                 case OP_CMP:
 271                     res.u64 = isgreater(a, b);
 272                     break;
 273                 default:
 274                     g_assert_not_reached();
 275                 }
 276             }
 277             break;
 278         case PREC_DOUBLE:
 279             fill_random(ops, n_ops, prec, no_neg);
 280             t0 = get_clock();
 281             for (i = 0; i < OPS_PER_ITER; i++) {
 282                 double a = ops[0].d;
 283                 double b = ops[1].d;
 284                 double c = ops[2].d;
 285
 286                 switch (op) {
 287                 case OP_ADD:
 288                     res.d = a + b;
 289                     break;
 290                 case OP_SUB:
 291                     res.d = a - b;
 292                     break;
 293                 case OP_MUL:
 294                     res.d = a * b;
 295                     break;
 296                 case OP_DIV:
 297                     res.d = a / b;
 298                     break;
 299                 case OP_FMA:
 300                     res.d = fma(a, b, c);
 301                     break;
 302                 case OP_SQRT:
 303                     res.d = sqrt(a);
 304                     break;
 305                 case OP_CMP:
 306                     res.u64 = isgreater(a, b);
 307                     break;
 308                 default:
 309                     g_assert_not_reached();
 310                 }
 311             }
 312             break;
 313         case PREC_FLOAT32:
 314             fill_random(ops, n_ops, prec, no_neg);
 315             t0 = get_clock();
 316             for (i = 0; i < OPS_PER_ITER; i++) {
 317                 float32 a = ops[0].f32;
 318                 float32 b = ops[1].f32;
 319                 float32 c = ops[2].f32;
 320
 321                 switch (op) {
 322                 case OP_ADD:
 323                     res.f32 = float32_add(a, b, &soft_status);
 324                     break;
 325                 case OP_SUB:
 326                     res.f32 = float32_sub(a, b, &soft_status);
 327                     break;
 328                 case OP_MUL:
 329                     res.f = float32_mul(a, b, &soft_status);
 330                     break;
 331                 case OP_DIV:
 332                     res.f32 = float32_div(a, b, &soft_status);
 333                     break;
 334                 case OP_FMA:
 335                     res.f32 = float32_muladd(a, b, c, 0, &soft_status);
 336                     break;
 337                 case OP_SQRT:
 338                     res.f32 = float32_sqrt(a, &soft_status);
 339                     break;
 340                 case OP_CMP:
 341                     res.u64 = float32_compare_quiet(a, b, &soft_status);
 342                     break;
 343                 default:
 344                     g_assert_not_reached();
 345                 }
 346             }
 347             break;
 348         case PREC_FLOAT64:
 349             fill_random(ops, n_ops, prec, no_neg);
 350             t0 = get_clock();
 351             for (i = 0; i < OPS_PER_ITER; i++) {
 352                 float64 a = ops[0].f64;
 353                 float64 b = ops[1].f64;
 354                 float64 c = ops[2].f64;
 355
 356                 switch (op) {
 357                 case OP_ADD:
 358                     res.f64 = float64_add(a, b, &soft_status);
 359                     break;
 360                 case OP_SUB:
 361                     res.f64 = float64_sub(a, b, &soft_status);
 362                     break;
 363                 case OP_MUL:
 364                     res.f = float64_mul(a, b, &soft_status);
 365                     break;
 366                 case OP_DIV:
 367                     res.f64 = float64_div(a, b, &soft_status);
 368                     break;
 369                 case OP_FMA:
 370                     res.f64 = float64_muladd(a, b, c, 0, &soft_status);
 371                     break;
 372                 case OP_SQRT:
 373                     res.f64 = float64_sqrt(a, &soft_status);
 374                     break;
 375                 case OP_CMP:
 376                     res.u64 = float64_compare_quiet(a, b, &soft_status);
 377                     break;
 378                 default:
 379                     g_assert_not_reached();
 380                 }
 381             }
 382             break;
 383         case PREC_FLOAT128:
 384             fill_random(ops, n_ops, prec, no_neg);
 385             t0 = get_clock();
 386             for (i = 0; i < OPS_PER_ITER; i++) {
 387                 float128 a = ops[0].f128;
 388                 float128 b = ops[1].f128;
 389                 float128 c = ops[2].f128;
 390
 391                 switch (op) {
 392                 case OP_ADD:
 393                     res.f128 = float128_add(a, b, &soft_status);
 394                     break;
 395                 case OP_SUB:
 396                     res.f128 = float128_sub(a, b, &soft_status);
 397                     break;
 398                 case OP_MUL:
 399                     res.f128 = float128_mul(a, b, &soft_status);
 400                     break;
 401                 case OP_DIV:
 402                     res.f128 = float128_div(a, b, &soft_status);
 403                     break;
 404                 case OP_FMA:
 405                     res.f128 = float128_muladd(a, b, c, 0, &soft_status);
 406                     break;
 407                 case OP_SQRT:
 408                     res.f128 = float128_sqrt(a, &soft_status);
 409                     break;
 410                 case OP_CMP:
 411                     res.u64 = float128_compare_quiet(a, b, &soft_status);
 412                     break;
 413                 default:
 414                     g_assert_not_reached();
 415                 }
 416             }
 417             break;
 418         default:
 419             g_assert_not_reached();
 420         }
 421         ns_elapsed += get_clock() - t0;
 422         n_completed_ops += OPS_PER_ITER;
 423     }
 424 }
 425
 426 #define GEN_BENCH(name, type, prec, op, n_ops)          \
 427     static void __attribute__((flatten)) name(void)     \
 428     {                                                   \
 429         bench(prec, op, n_ops, false);                  \
 430     }
 431
 432 #define GEN_BENCH_NO_NEG(name, type, prec, op, n_ops)   \
 433     static void __attribute__((flatten)) name(void)     \
 434     {                                                   \
 435         bench(prec, op, n_ops, true);                   \
 436     }
 437
 438 #define GEN_BENCH_ALL_TYPES(opname, op, n_ops)                          \
 439     GEN_BENCH(bench_ ## opname ## _float, float, PREC_SINGLE, op, n_ops) \
 440     GEN_BENCH(bench_ ## opname ## _double, double, PREC_DOUBLE, op, n_ops) \
 441     GEN_BENCH(bench_ ## opname ## _float32, float32, PREC_FLOAT32, op, n_ops) \
 442     GEN_BENCH(bench_ ## opname ## _float64, float64, PREC_FLOAT64, op, n_ops) \
 443     GEN_BENCH(bench_ ## opname ## _float128, float128, PREC_FLOAT128, op, n_ops)
 444
 445 GEN_BENCH_ALL_TYPES(add, OP_ADD, 2)
 446 GEN_BENCH_ALL_TYPES(sub, OP_SUB, 2)
 447 GEN_BENCH_ALL_TYPES(mul, OP_MUL, 2)
 448 GEN_BENCH_ALL_TYPES(div, OP_DIV, 2)
 449 GEN_BENCH_ALL_TYPES(fma, OP_FMA, 3)
 450 GEN_BENCH_ALL_TYPES(cmp, OP_CMP, 2)
 451 #undef GEN_BENCH_ALL_TYPES
 452
 453 #define GEN_BENCH_ALL_TYPES_NO_NEG(name, op, n)                         \
 454     GEN_BENCH_NO_NEG(bench_ ## name ## _float, float, PREC_SINGLE, op, n) \
 455     GEN_BENCH_NO_NEG(bench_ ## name ## _double, double, PREC_DOUBLE, op, n) \
 456     GEN_BENCH_NO_NEG(bench_ ## name ## _float32, float32, PREC_FLOAT32, op, n) \
 457     GEN_BENCH_NO_NEG(bench_ ## name ## _float64, float64, PREC_FLOAT64, op, n) \
 458     GEN_BENCH_NO_NEG(bench_ ## name ## _float128, float128, PREC_FLOAT128, op, n)
 459
 460 GEN_BENCH_ALL_TYPES_NO_NEG(sqrt, OP_SQRT, 1)
 461 #undef GEN_BENCH_ALL_TYPES_NO_NEG
 462
 463 #undef GEN_BENCH_NO_NEG
 464 #undef GEN_BENCH
 465
 466 #define GEN_BENCH_FUNCS(opname, op)                             \
 467     [op] = {                                                    \
 468         [PREC_SINGLE]    = bench_ ## opname ## _float,          \
 469         [PREC_DOUBLE]    = bench_ ## opname ## _double,         \
 470         [PREC_FLOAT32]   = bench_ ## opname ## _float32,        \
 471         [PREC_FLOAT64]   = bench_ ## opname ## _float64,        \
 472         [PREC_FLOAT128]   = bench_ ## opname ## _float128,      \
 473     }
 474
 475 static const bench_func_t bench_funcs[OP_MAX_NR][PREC_MAX_NR] = {
 476     GEN_BENCH_FUNCS(add, OP_ADD),
 477     GEN_BENCH_FUNCS(sub, OP_SUB),
 478     GEN_BENCH_FUNCS(mul, OP_MUL),
 479     GEN_BENCH_FUNCS(div, OP_DIV),
 480     GEN_BENCH_FUNCS(fma, OP_FMA),
 481     GEN_BENCH_FUNCS(sqrt, OP_SQRT),
 482     GEN_BENCH_FUNCS(cmp, OP_CMP),
 483 };
 484
 485 #undef GEN_BENCH_FUNCS
 486
 487 static void run_bench(void)
 488 {
 489     bench_func_t f;
 490
 491     f = bench_funcs[operation][precision];
 492     g_assert(f);
 493     f();
 494 }
 495
 496 /* @arr must be NULL-terminated */
 497 static int find_name(const char * const *arr, const char *name)
 498 {
 499     int i;
 500
 501     for (i = 0; arr[i] != NULL; i++) {
 502         if (strcmp(name, arr[i]) == 0) {
 503             return i;
 504         }
 505     }
 506     return -1;
 507 }
 508
 509 static void usage_complete(int argc, char *argv[])
 510 {
 511     gchar *op_list = g_strjoinv(", ", (gchar **)op_names);
 512     gchar *tester_list = g_strjoinv(", ", (gchar **)tester_names);
 513
 514     fprintf(stderr, "Usage: %s [options]\n", argv[0]);
 515     fprintf(stderr, "options:\n");
 516     fprintf(stderr, " -d = duration, in seconds. Default: %d\n",
 517             DEFAULT_DURATION_SECS);
 518     fprintf(stderr, " -h = show this help message.\n");
 519     fprintf(stderr, " -o = floating point operation (%s). Default: %s\n",
 520             op_list, op_names[0]);
 521     fprintf(stderr, " -p = floating point precision (single, double, quad[soft only]). "
 522             "Default: single\n");
 523     fprintf(stderr, " -r = rounding mode (even, zero, down, up, tieaway). "
 524             "Default: even\n");
 525     fprintf(stderr, " -t = tester (%s). Default: %s\n",
 526             tester_list, tester_names[0]);
 527     fprintf(stderr, " -z = flush inputs to zero (soft tester only). "
 528             "Default: disabled\n");
 529     fprintf(stderr, " -Z = flush output to zero (soft tester only). "
 530             "Default: disabled\n");
 531
 532     g_free(tester_list);
 533     g_free(op_list);
 534 }
 535
 536 static int round_name_to_mode(const char *name)
 537 {
 538     int i;
 539
 540     for (i = 0; i < N_ROUND_MODES; i++) {
 541         if (!strcmp(round_names[i], name)) {
 542             return i;
 543         }
 544     }
 545     return -1;
 546 }
 547
 548 static void QEMU_NORETURN die_host_rounding(enum rounding rounding)
 549 {
 550     fprintf(stderr, "fatal: '%s' rounding not supported on this host\n",
 551             round_names[rounding]);
 552     exit(EXIT_FAILURE);
 553 }
 554
 555 static void set_host_precision(enum rounding rounding)
 556 {
 557     int rhost;
 558
 559     switch (rounding) {
 560     case ROUND_EVEN:
 561         rhost = FE_TONEAREST;
 562         break;
 563     case ROUND_ZERO:
 564         rhost = FE_TOWARDZERO;
 565         break;
 566     case ROUND_DOWN:
 567         rhost = FE_DOWNWARD;
 568         break;
 569     case ROUND_UP:
 570         rhost = FE_UPWARD;
 571         break;
 572     case ROUND_TIEAWAY:
 573         die_host_rounding(rounding);
 574         return;
 575     default:
 576         g_assert_not_reached();
 577     }
 578
 579     if (fesetround(rhost)) {
 580         die_host_rounding(rounding);
 581     }
 582 }
 583
 584 static void set_soft_precision(enum rounding rounding)
 585 {
 586     signed char mode;
 587
 588     switch (rounding) {
 589     case ROUND_EVEN:
 590         mode = float_round_nearest_even;
 591         break;
 592     case ROUND_ZERO:
 593         mode = float_round_to_zero;
 594         break;
 595     case ROUND_DOWN:
 596         mode = float_round_down;
 597         break;
 598     case ROUND_UP:
 599         mode = float_round_up;
 600         break;
 601     case ROUND_TIEAWAY:
 602         mode = float_round_ties_away;
 603         break;
 604     default:
 605         g_assert_not_reached();
 606     }
 607     soft_status.float_rounding_mode = mode;
 608 }
 609
 610 static void parse_args(int argc, char *argv[])
 611 {
 612     int c;
 613     int val;
 614     int rounding = ROUND_EVEN;
 615
 616     for (;;) {
 617         c = getopt(argc, argv, "d:ho:p:r:t:zZ");
 618         if (c < 0) {
 619             break;
 620         }
 621         switch (c) {
 622         case 'd':
 623             duration = atoi(optarg);
 624             break;
 625         case 'h':
 626             usage_complete(argc, argv);
 627             exit(EXIT_SUCCESS);
 628         case 'o':
 629             val = find_name(op_names, optarg);
 630             if (val < 0) {
 631                 fprintf(stderr, "Unsupported op '%s'\n", optarg);
 632                 exit(EXIT_FAILURE);
 633             }
 634             operation = val;
 635             break;
 636         case 'p':
 637             if (!strcmp(optarg, "single")) {
 638                 precision = PREC_SINGLE;
 639             } else if (!strcmp(optarg, "double")) {
 640                 precision = PREC_DOUBLE;
 641             } else if (!strcmp(optarg, "quad")) {
 642                 precision = PREC_QUAD;
 643             } else {
 644                 fprintf(stderr, "Unsupported precision '%s'\n", optarg);
 645                 exit(EXIT_FAILURE);
 646             }
 647             break;
 648         case 'r':
 649             rounding = round_name_to_mode(optarg);
 650             if (rounding < 0) {
 651                 fprintf(stderr, "fatal: invalid rounding mode '%s'\n", optarg);
 652                 exit(EXIT_FAILURE);
 653             }
 654             break;
 655         case 't':
 656             val = find_name(tester_names, optarg);
 657             if (val < 0) {
 658                 fprintf(stderr, "Unsupported tester '%s'\n", optarg);
 659                 exit(EXIT_FAILURE);
 660             }
 661             tester = val;
 662             break;
 663         case 'z':
 664             soft_status.flush_inputs_to_zero = 1;
 665             break;
 666         case 'Z':
 667             soft_status.flush_to_zero = 1;
 668             break;
 669         }
 670     }
 671
 672     /* set precision and rounding mode based on the tester */
 673     switch (tester) {
 674     case TESTER_HOST:
 675         set_host_precision(rounding);
 676         break;
 677     case TESTER_SOFT:
 678         set_soft_precision(rounding);
 679         switch (precision) {
 680         case PREC_SINGLE:
 681             precision = PREC_FLOAT32;
 682             break;
 683         case PREC_DOUBLE:
 684             precision = PREC_FLOAT64;
 685             break;
 686         case PREC_QUAD:
 687             precision = PREC_FLOAT128;
 688             break;
 689         default:
 690             g_assert_not_reached();
 691         }
 692         break;
 693     default:
 694         g_assert_not_reached();
 695     }
 696 }
 697
 698 static void pr_stats(void)
 699 {
 700     printf("%.2f MFlops\n", (double)n_completed_ops / ns_elapsed * 1e3);
 701 }
 702
 703 int main(int argc, char *argv[])
 704 {
 705     parse_args(argc, argv);
 706     run_bench();
 707     pr_stats();
 708     return 0;
 709 }