libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c

   1 /* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
   2    vector_length.  */
   3
   4 #include <limits.h>
   5 #include <openacc.h>
   6 #include <gomp-constants.h>
   7
   8 /* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
   9    not behaving as expected for -O0.  */
     /* Return the gang index of the caller: 0 when executing on the host, the
        value of __builtin_goacc_parlevel_id (GOMP_DIM_GANG) when executing on
        an nvidia device; abort on any other device.  Declared as a sequential
        OpenACC routine and always compiled with the "O2" optimize attribute.  */
  10 #pragma acc routine seq
  11 static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
  12 {
  13   if (acc_on_device ((int) acc_device_host))
  14     return 0;
  15   else if (acc_on_device ((int) acc_device_nvidia))
  16     return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
  17   else
  18     __builtin_abort ();
  19 }
  20
     /* Return the worker index of the caller: 0 when executing on the host,
        the value of __builtin_goacc_parlevel_id (GOMP_DIM_WORKER) when
        executing on an nvidia device; abort on any other device.  Same shape
        as acc_gang, including the "(int)" casts on the device constants.  */
  21 #pragma acc routine seq
  22 static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
  23 {
  24   if (acc_on_device ((int) acc_device_host))
  25     return 0;
  26   else if (acc_on_device ((int) acc_device_nvidia))
  27     return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
  28   else
  29     __builtin_abort ();
  30 }
  31
     /* Return the vector index of the caller: 0 when executing on the host,
        the value of __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR) when
        executing on an nvidia device; abort on any other device.  Same shape
        as acc_gang, including the "(int)" casts on the device constants.  */
  32 #pragma acc routine seq
  33 static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
  34 {
  35   if (acc_on_device ((int) acc_device_host))
  36     return 0;
  37   else if (acc_on_device ((int) acc_device_nvidia))
  38     return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
  39   else
  40     __builtin_abort ();
  41 }
  42
  43
     /* Run a sequence of OpenACC compute regions.  Each case records, in
        gangs_min/gangs_max, workers_min/workers_max and
        vectors_min/vectors_max, the smallest and largest index returned by
        acc_gang, acc_worker and acc_vector inside the region.  After the
        region, the case aborts unless every minimum is 0 and every maximum is
        one less than the size the region is expected to have run with in that
        dimension (so 0 for a dimension of size 1).  Returns 0 if no case
        aborted.

        Abbreviations used in the per-case comments (OpenACC execution modes):
        GR/GP = gang-redundant/gang-partitioned, WS/WP = worker-single/
        worker-partitioned, VS/VP = vector-single/vector-partitioned.  */
  44 int main ()
  45 {
  46   acc_init (acc_device_default);
  47
  48   /* Non-positive value.  */
  49
  50   /* GR, WS, VS.  */
  51   {
  52 #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
  53     int gangs_actual = GANGS;
  54     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
         /* Start the minima at INT_MAX and the maxima at INT_MIN so that any
            observed index narrows them.  The same setup is repeated in every
            case below.  */
  55     gangs_min = workers_min = vectors_min = INT_MAX;
  56     gangs_max = workers_max = vectors_max = INT_MIN;
  57 #pragma acc parallel copy (gangs_actual) \
  58   reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
  59   num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
  60     {
  61       /* We're actually executing with num_gangs (1).  */
  62       gangs_actual = 1;
  63       for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
  64         {
  65           /* <https://gcc.gnu.org/PR80547>.  */
               /* The plain-assignment form is compiled out; the enabled branch
                  folds each observed index into the minimum/maximum by hand.  */
  66 #if 0
  67           gangs_min = gangs_max = acc_gang ();
  68           workers_min = workers_max = acc_worker ();
  69           vectors_min = vectors_max = acc_vector ();
  70 #else
  71           int gangs = acc_gang ();
  72           gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
  73           gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
  74           int workers = acc_worker ();
  75           workers_min = (workers_min < workers) ? workers_min : workers;
  76           workers_max = (workers_max > workers) ? workers_max : workers;
  77           int vectors = acc_vector ();
  78           vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
  79           vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
  80 #endif
  81         }
  82     }
         /* The copied-back gangs_actual must be 1, and with it every observed
            index must have been 0.  */
  83     if (gangs_actual != 1)
  84       __builtin_abort ();
  85     if (gangs_min != 0 || gangs_max != gangs_actual - 1
  86         || workers_min != 0 || workers_max != 0
  87         || vectors_min != 0 || vectors_max != 0)
  88       __builtin_abort ();
  89 #undef GANGS
  90   }
  91
  92   /* GP, WS, VS.  */
  93   {
  94 #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
  95     int gangs_actual = GANGS;
  96     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
  97     gangs_min = workers_min = vectors_min = INT_MAX;
  98     gangs_max = workers_max = vectors_max = INT_MIN;
  99 #pragma acc parallel copy (gangs_actual) \
 100   num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
 101     {
 102       /* We're actually executing with num_gangs (1).  */
 103       gangs_actual = 1;
 104 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
 105       for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
 106         {
 107           gangs_min = gangs_max = acc_gang ();
 108           workers_min = workers_max = acc_worker ();
 109           vectors_min = vectors_max = acc_vector ();
 110         }
 111     }
 112     if (gangs_actual != 1)
 113       __builtin_abort ();
 114     if (gangs_min != 0 || gangs_max != gangs_actual - 1
 115         || workers_min != 0 || workers_max != 0
 116         || vectors_min != 0 || vectors_max != 0)
 117       __builtin_abort ();
 118 #undef GANGS
 119   }
 120
 121   /* GR, WP, VS.  */
 122   {
 123 #define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
 124     int workers_actual = WORKERS;
 125     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
 126     gangs_min = workers_min = vectors_min = INT_MAX;
 127     gangs_max = workers_max = vectors_max = INT_MIN;
 128 #pragma acc parallel copy (workers_actual) \
 129   num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
 130     {
 131       /* We're actually executing with num_workers (1).  */
 132       workers_actual = 1;
 133 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
 134       for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
 135         {
 136           gangs_min = gangs_max = acc_gang ();
 137           workers_min = workers_max = acc_worker ();
 138           vectors_min = vectors_max = acc_vector ();
 139         }
 140     }
 141     if (workers_actual != 1)
 142       __builtin_abort ();
 143     if (gangs_min != 0 || gangs_max != 0
 144         || workers_min != 0 || workers_max != workers_actual - 1
 145         || vectors_min != 0 || vectors_max != 0)
 146       __builtin_abort ();
 147 #undef WORKERS
 148   }
 149
 150   /* GR, WS, VP.  */
 151   {
 152 #define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
 153     int vectors_actual = VECTORS;
 154     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
 155     gangs_min = workers_min = vectors_min = INT_MAX;
 156     gangs_max = workers_max = vectors_max = INT_MIN;
 157 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured } } */ \
 158   vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
 159     {
 160       /* We're actually executing with vector_length (1), just the GCC nvptx
 161          back end enforces vector_length (32).  */
 162       if (acc_on_device (acc_device_nvidia))
 163         vectors_actual = 32;
 164       else
 165         vectors_actual = 1;
 166 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
 167       for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
 168         {
 169           gangs_min = gangs_max = acc_gang ();
 170           workers_min = workers_max = acc_worker ();
 171           vectors_min = vectors_max = acc_vector ();
 172         }
 173     }
         /* Outside the region the expectation is selected with
            acc_get_device_type: 32 for nvidia, 1 otherwise.  */
 174     if (acc_get_device_type () == acc_device_nvidia)
 175       {
 176         if (vectors_actual != 32)
 177           __builtin_abort ();
 178       }
 179     else
 180       if (vectors_actual != 1)
 181         __builtin_abort ();
 182     if (gangs_min != 0 || gangs_max != 0
 183         || workers_min != 0 || workers_max != 0
 184         || vectors_min != 0 || vectors_max != vectors_actual - 1)
 185       __builtin_abort ();
 186 #undef VECTORS
 187   }
 188
 189
 190   /* High value.  */
 191
 192   /* GR, WS, VS.  */
 193   {
 194     /* There is no actual limit for the number of gangs, so we try with a
 195        rather high value.  */
 196     int gangs = 12345;
 197     int gangs_actual = gangs;
 198     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
 199     gangs_min = workers_min = vectors_min = INT_MAX;
 200     gangs_max = workers_max = vectors_max = INT_MIN;
 201 #pragma acc parallel copy (gangs_actual) \
 202   reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
 203   num_gangs (gangs)
 204     {
           /* gangs_actual is only overwritten on the host; otherwise it keeps
              the requested value.  */
 205       if (acc_on_device (acc_device_host))
 206         {
 207           /* We're actually executing with num_gangs (1).  */
 208           gangs_actual = 1;
 209         }
 210       /* As we're executing GR not GP, don't multiply with a "gangs_actual"
 211          factor.  */
 212       for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i)
 213         {
 214           gangs_min = gangs_max = acc_gang ();
 215           workers_min = workers_max = acc_worker ();
 216           vectors_min = vectors_max = acc_vector ();
 217         }
 218     }
         /* Only a lower bound is checked on gangs_actual itself; the observed
            gang indices must still span 0 .. gangs_actual - 1.  */
 219     if (gangs_actual < 1)
 220       __builtin_abort ();
 221     if (gangs_min != 0 || gangs_max != gangs_actual - 1
 222         || workers_min != 0 || workers_max != 0
 223         || vectors_min != 0 || vectors_max != 0)
 224       __builtin_abort ();
 225   }
 226
 227   /* GP, WS, VS.  */
 228   {
 229     /* There is no actual limit for the number of gangs, so we try with a
 230        rather high value.  */
 231     int gangs = 12345;
 232     int gangs_actual = gangs;
 233     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
 234     gangs_min = workers_min = vectors_min = INT_MAX;
 235     gangs_max = workers_max = vectors_max = INT_MIN;
 236 #pragma acc parallel copy (gangs_actual) \
 237   num_gangs (gangs)
 238     {
 239       if (acc_on_device (acc_device_host))
 240         {
 241           /* We're actually executing with num_gangs (1).  */
 242           gangs_actual = 1;
 243         }
 244 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
 245       for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
 246         {
 247           gangs_min = gangs_max = acc_gang ();
 248           workers_min = workers_max = acc_worker ();
 249           vectors_min = vectors_max = acc_vector ();
 250         }
 251     }
 252     if (gangs_actual < 1)
 253       __builtin_abort ();
 254     if (gangs_min != 0 || gangs_max != gangs_actual - 1
 255         || workers_min != 0 || workers_max != 0
 256         || vectors_min != 0 || vectors_max != 0)
 257       __builtin_abort ();
 258   }
 259
 260   /* GR, WP, VS.  */
 261   {
 262     /* We try with an outrageously large value. */
         /* 2 << 20 is 2097152, the number quoted in the diagnostic expected on
            the parallel construct below.  */
 263 #define WORKERS 2 << 20
 264     int workers_actual = WORKERS;
 265     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
 266     gangs_min = workers_min = vectors_min = INT_MAX;
 267     gangs_max = workers_max = vectors_max = INT_MIN;
 268 #pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
 269   num_workers (WORKERS)
 270     {
 271       if (acc_on_device (acc_device_host))
 272         {
 273           /* We're actually executing with num_workers (1).  */
 274           workers_actual = 1;
 275         }
 276       else if (acc_on_device (acc_device_nvidia))
 277         {
 278           /* The GCC nvptx back end enforces num_workers (32).  */
 279           workers_actual = 32;
 280         }
 281       else
 282         __builtin_abort ();
 283 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
 284       for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
 285         {
 286           gangs_min = gangs_max = acc_gang ();
 287           workers_min = workers_max = acc_worker ();
 288           vectors_min = vectors_max = acc_vector ();
 289         }
 290     }
 291     if (workers_actual < 1)
 292       __builtin_abort ();
 293     if (gangs_min != 0 || gangs_max != 0
 294         || workers_min != 0 || workers_max != workers_actual - 1
 295         || vectors_min != 0 || vectors_max != 0)
 296       __builtin_abort ();
 297 #undef WORKERS
 298   }
 299
 300   /* GR, WP, VS.  */
 301   {
         /* Same as the previous case, but the worker count is a runtime
            variable rather than a macro constant.  */
 302     /* We try with an outrageously large value. */
 303     int workers = 2 << 20;
 304     /* For nvptx offloading, this one will not result in "using num_workers
 305        (32), ignoring runtime setting", and will in fact try to launch with
 306        "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
 307        error: invalid argument".  So, limit ourselves here.  */
 308     if (acc_get_device_type () == acc_device_nvidia)
 309       workers = 32;
 310     int workers_actual = workers;
 311     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
 312     gangs_min = workers_min = vectors_min = INT_MAX;
 313     gangs_max = workers_max = vectors_max = INT_MIN;
 314 #pragma acc parallel copy (workers_actual) \
 315   num_workers (workers)
 316     {
 317       if (acc_on_device (acc_device_host))
 318         {
 319           /* We're actually executing with num_workers (1).  */
 320           workers_actual = 1;
 321         }
 322       else if (acc_on_device (acc_device_nvidia))
 323         {
 324           /* We're actually executing with num_workers (32).  */
 325           /* workers_actual = 32; */
 326         }
 327       else
 328         __builtin_abort ();
 329 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
 330       for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
 331         {
 332           gangs_min = gangs_max = acc_gang ();
 333           workers_min = workers_max = acc_worker ();
 334           vectors_min = vectors_max = acc_vector ();
 335         }
 336     }
 337     if (workers_actual < 1)
 338       __builtin_abort ();
 339     if (gangs_min != 0 || gangs_max != 0
 340         || workers_min != 0 || workers_max != workers_actual - 1
 341         || vectors_min != 0 || vectors_max != 0)
 342       __builtin_abort ();
 343   }
 344
 345   /* GR, WS, VP.  */
 346   {
 347     /* We try with an outrageously large value. */
         /* 2 << 20 is 2097152, the number quoted in the diagnostic expected on
            the parallel construct below.  */
 348 #define VECTORS 2 << 20
 349     int vectors_actual = VECTORS;
 350     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
 351     gangs_min = workers_min = vectors_min = INT_MAX;
 352     gangs_max = workers_max = vectors_max = INT_MIN;
 353 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
 354   vector_length (VECTORS)
 355     {
 356       if (acc_on_device (acc_device_host))
 357         {
 358           /* We're actually executing with vector_length (1).  */
 359           vectors_actual = 1;
 360         }
 361       else if (acc_on_device (acc_device_nvidia))
 362         {
 363           /* The GCC nvptx back end enforces vector_length (32).  */
 364           vectors_actual = 32;
 365         }
 366       else
 367         __builtin_abort ();
 368 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
 369       for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
 370         {
 371           gangs_min = gangs_max = acc_gang ();
 372           workers_min = workers_max = acc_worker ();
 373           vectors_min = vectors_max = acc_vector ();
 374         }
 375     }
 376     if (vectors_actual < 1)
 377       __builtin_abort ();
 378     if (gangs_min != 0 || gangs_max != 0
 379         || workers_min != 0 || workers_max != 0
 380         || vectors_min != 0 || vectors_max != vectors_actual - 1)
 381       __builtin_abort ();
 382 #undef VECTORS
 383   }
 384
 385   /* GR, WS, VP.  */
 386   {
         /* Same as the previous case, but the vector length is a runtime
            variable rather than a macro constant.  */
 387     /* We try with an outrageously large value. */
 388     int vectors = 2 << 20;
 389     int vectors_actual = vectors;
 390     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
 391     gangs_min = workers_min = vectors_min = INT_MAX;
 392     gangs_max = workers_max = vectors_max = INT_MIN;
 393 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_configured } } */ \
 394   vector_length (vectors)
 395     {
 396       if (acc_on_device (acc_device_host))
 397         {
 398           /* We're actually executing with vector_length (1).  */
 399           vectors_actual = 1;
 400         }
 401       else if (acc_on_device (acc_device_nvidia))
 402         {
 403           /* The GCC nvptx back end enforces vector_length (32).  */
 404           vectors_actual = 32;
 405         }
 406       else
 407         __builtin_abort ();
 408 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
 409       for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
 410         {
 411           gangs_min = gangs_max = acc_gang ();
 412           workers_min = workers_max = acc_worker ();
 413           vectors_min = vectors_max = acc_vector ();
 414         }
 415     }
 416     if (vectors_actual < 1)
 417       __builtin_abort ();
 418     if (gangs_min != 0 || gangs_max != 0
 419         || workers_min != 0 || workers_max != 0
 420         || vectors_min != 0 || vectors_max != vectors_actual - 1)
 421       __builtin_abort ();
 422   }
 423
 424
 425   /* Composition of GP, WP, VP.  */
 426   {
 427     int gangs = 12345;
 428     /* With nvptx offloading, multi-level reductions apparently are very slow
 429        in the following case.  So, limit ourselves here.  */
 430     if (acc_get_device_type () == acc_device_nvidia)
 431       gangs = 3;
 432     int gangs_actual = gangs;
 433 #define WORKERS 3
 434     int workers_actual = WORKERS;
 435 #define VECTORS 11
 436     int vectors_actual = VECTORS;
 437     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
 438     gangs_min = workers_min = vectors_min = INT_MAX;
 439     gangs_max = workers_max = vectors_max = INT_MIN;
 440 #pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_configured } } */ \
 441   num_gangs (gangs) \
 442   num_workers (WORKERS) \
 443   vector_length (VECTORS)
 444     {
 445       if (acc_on_device (acc_device_host))
 446         {
 447           /* We're actually executing with num_gangs (1), num_workers (1),
 448              vector_length (1).  */
 449           gangs_actual = 1;
 450           workers_actual = 1;
 451           vectors_actual = 1;
 452         }
 453       else if (acc_on_device (acc_device_nvidia))
 454         {
 455           /* The GCC nvptx back end enforces vector_length (32).  */
 456           vectors_actual = 32;
 457         }
 458       else
 459         __builtin_abort ();
 460 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
 461       for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
 462 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
 463         for (int j = 100 * workers_actual; j > -100 * workers_actual; --j)
 464 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
 465           for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k)
 466             {
 467               gangs_min = gangs_max = acc_gang ();
 468               workers_min = workers_max = acc_worker ();
 469               vectors_min = vectors_max = acc_vector ();
 470             }
 471     }
         /* All three dimensions are under test here, so each maximum is
            compared against its own "actual" size minus one.  */
 472     if (gangs_min != 0 || gangs_max != gangs_actual - 1
 473         || workers_min != 0 || workers_max != workers_actual - 1
 474         || vectors_min != 0 || vectors_max != vectors_actual - 1)
 475       __builtin_abort ();
 476 #undef VECTORS
 477 #undef WORKERS
 478   }
 479
 480
 481   /* We can't test parallelized OpenACC kernels constructs in this way: use of
 482      the acc_gang, acc_worker, acc_vector functions will make the construct
 483      unparallelizable.  */
 484
 485
 486   /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
 487      kernels.  */
 488   {
 489     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
 490     gangs_min = workers_min = vectors_min = INT_MAX;
 491     gangs_max = workers_max = vectors_max = INT_MIN;
 492 #pragma acc kernels
 493     {
 494       /* This is to make the OpenACC kernels construct unparallelizable.  */
 495       asm volatile ("" : : : "memory");
 496
 497 #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
 498       for (int i = 100; i > -100; --i)
 499         {
 500           gangs_min = gangs_max = acc_gang ();
 501           workers_min = workers_max = acc_worker ();
 502           vectors_min = vectors_max = acc_vector ();
 503         }
 504     }
         /* "1 - 1" spells out the expectation: one gang, one worker, one
            vector, hence a largest index of 0 in each dimension.  */
 505     if (gangs_min != 0 || gangs_max != 1 - 1
 506         || workers_min != 0 || workers_max != 1 - 1
 507         || vectors_min != 0 || vectors_max != 1 - 1)
 508       __builtin_abort ();
 509   }
 510
 511
 512   /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
 513      kernels even when there are explicit num_gangs, num_workers, or
 514      vector_length clauses.  */
 515   {
 516     int gangs = 5;
 517 #define WORKERS 5
 518 #define VECTORS 13
 519     int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
 520     gangs_min = workers_min = vectors_min = INT_MAX;
 521     gangs_max = workers_max = vectors_max = INT_MIN;
 522 #pragma acc kernels \
 523   num_gangs (gangs) \
 524   num_workers (WORKERS) \
 525   vector_length (VECTORS)
 526     {
 527       /* This is to make the OpenACC kernels construct unparallelizable.  */
 528       asm volatile ("" : : : "memory");
 529
 530 #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
 531       for (int i = 100; i > -100; --i)
 532         {
 533           gangs_min = gangs_max = acc_gang ();
 534           workers_min = workers_max = acc_worker ();
 535           vectors_min = vectors_max = acc_vector ();
 536         }
 537     }
         /* The requested sizes (5, 5, 13) are expected to have no effect: the
            largest index must still be 0 in each dimension.  */
 538     if (gangs_min != 0 || gangs_max != 1 - 1
 539         || workers_min != 0 || workers_max != 1 - 1
 540         || vectors_min != 0 || vectors_max != 1 - 1)
 541       __builtin_abort ();
 542 #undef VECTORS
 543 #undef WORKERS
 544   }
 545
 546
 547   return 0;
 548 }