PR c/81417
[official-gcc.git] / libgomp / testsuite / libgomp.oacc-c-c++-common / parallel-dims.c
blob8308f7c541f80eb7a21e0d9e253019bc6269fde8
1 /* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
2 vector_length. */
4 #include <limits.h>
5 #include <openacc.h>
7 /* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
8 not behaving as expected for -O0. */
9 #pragma acc routine seq
10 static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
12 if (acc_on_device ((int) acc_device_host))
13 return 0;
14 else if (acc_on_device ((int) acc_device_nvidia))
16 unsigned int r;
17 asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
18 return r;
20 else
21 __builtin_abort ();
24 #pragma acc routine seq
25 static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
27 if (acc_on_device ((int) acc_device_host))
28 return 0;
29 else if (acc_on_device ((int) acc_device_nvidia))
31 unsigned int r;
32 asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
33 return r;
35 else
36 __builtin_abort ();
39 #pragma acc routine seq
40 static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
42 if (acc_on_device ((int) acc_device_host))
43 return 0;
44 else if (acc_on_device ((int) acc_device_nvidia))
46 unsigned int r;
47 asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));
48 return r;
50 else
51 __builtin_abort ();
55 int main ()
57 acc_init (acc_device_default);
59 /* Non-positive value. */
61 /* GR, WS, VS. */
63 #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
64 int gangs_actual = GANGS;
65 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
66 gangs_min = workers_min = vectors_min = INT_MAX;
67 gangs_max = workers_max = vectors_max = INT_MIN;
68 #pragma acc parallel copy (gangs_actual) \
69 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
70 num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
72 /* We're actually executing with num_gangs (1). */
73 gangs_actual = 1;
74 for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
76 /* <https://gcc.gnu.org/PR80547>. */
77 #if 0
78 gangs_min = gangs_max = acc_gang ();
79 workers_min = workers_max = acc_worker ();
80 vectors_min = vectors_max = acc_vector ();
81 #else
82 int gangs = acc_gang ();
83 gangs_min = (gangs_min < gangs) ? gangs_min : gangs;
84 gangs_max = (gangs_max > gangs) ? gangs_max : gangs;
85 int workers = acc_worker ();
86 workers_min = (workers_min < workers) ? workers_min : workers;
87 workers_max = (workers_max > workers) ? workers_max : workers;
88 int vectors = acc_vector ();
89 vectors_min = (vectors_min < vectors) ? vectors_min : vectors;
90 vectors_max = (vectors_max > vectors) ? vectors_max : vectors;
91 #endif
94 if (gangs_actual != 1)
95 __builtin_abort ();
96 if (gangs_min != 0 || gangs_max != gangs_actual - 1
97 || workers_min != 0 || workers_max != 0
98 || vectors_min != 0 || vectors_max != 0)
99 __builtin_abort ();
100 #undef GANGS
103 /* GP, WS, VS. */
105 #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
106 int gangs_actual = GANGS;
107 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
108 gangs_min = workers_min = vectors_min = INT_MAX;
109 gangs_max = workers_max = vectors_max = INT_MIN;
110 #pragma acc parallel copy (gangs_actual) \
111 num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
113 /* We're actually executing with num_gangs (1). */
114 gangs_actual = 1;
115 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
116 for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
118 gangs_min = gangs_max = acc_gang ();
119 workers_min = workers_max = acc_worker ();
120 vectors_min = vectors_max = acc_vector ();
123 if (gangs_actual != 1)
124 __builtin_abort ();
125 if (gangs_min != 0 || gangs_max != gangs_actual - 1
126 || workers_min != 0 || workers_max != 0
127 || vectors_min != 0 || vectors_max != 0)
128 __builtin_abort ();
129 #undef GANGS
132 /* GR, WP, VS. */
134 #define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
135 int workers_actual = WORKERS;
136 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
137 gangs_min = workers_min = vectors_min = INT_MAX;
138 gangs_max = workers_max = vectors_max = INT_MIN;
139 #pragma acc parallel copy (workers_actual) \
140 num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
142 /* We're actually executing with num_workers (1). */
143 workers_actual = 1;
144 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
145 for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
147 gangs_min = gangs_max = acc_gang ();
148 workers_min = workers_max = acc_worker ();
149 vectors_min = vectors_max = acc_vector ();
152 if (workers_actual != 1)
153 __builtin_abort ();
154 if (gangs_min != 0 || gangs_max != 0
155 || workers_min != 0 || workers_max != workers_actual - 1
156 || vectors_min != 0 || vectors_max != 0)
157 __builtin_abort ();
158 #undef WORKERS
161 /* GR, WS, VP. */
163 #define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
164 int vectors_actual = VECTORS;
165 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
166 gangs_min = workers_min = vectors_min = INT_MAX;
167 gangs_max = workers_max = vectors_max = INT_MIN;
168 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured } } */ \
169 vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
171 /* We're actually executing with vector_length (1), just the GCC nvptx
172 back end enforces vector_length (32). */
173 if (acc_on_device (acc_device_nvidia))
174 vectors_actual = 32;
175 else
176 vectors_actual = 1;
177 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
178 for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
180 gangs_min = gangs_max = acc_gang ();
181 workers_min = workers_max = acc_worker ();
182 vectors_min = vectors_max = acc_vector ();
185 if (acc_get_device_type () == acc_device_nvidia)
187 if (vectors_actual != 32)
188 __builtin_abort ();
190 else
191 if (vectors_actual != 1)
192 __builtin_abort ();
193 if (gangs_min != 0 || gangs_max != 0
194 || workers_min != 0 || workers_max != 0
195 || vectors_min != 0 || vectors_max != vectors_actual - 1)
196 __builtin_abort ();
197 #undef VECTORS
201 /* High value. */
203 /* GR, WS, VS. */
205 /* There is no actual limit for the number of gangs, so we try with a
206 rather high value. */
207 int gangs = 12345;
208 int gangs_actual = gangs;
209 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
210 gangs_min = workers_min = vectors_min = INT_MAX;
211 gangs_max = workers_max = vectors_max = INT_MIN;
212 #pragma acc parallel copy (gangs_actual) \
213 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
214 num_gangs (gangs)
216 if (acc_on_device (acc_device_host))
218 /* We're actually executing with num_gangs (1). */
219 gangs_actual = 1;
221 /* As we're executing GR not GP, don't multiply with a "gangs_actual"
222 factor. */
223 for (int i = 100 /* * gangs_actual */; i > -100 /* * gangs_actual */; --i)
225 gangs_min = gangs_max = acc_gang ();
226 workers_min = workers_max = acc_worker ();
227 vectors_min = vectors_max = acc_vector ();
230 if (gangs_actual < 1)
231 __builtin_abort ();
232 if (gangs_min != 0 || gangs_max != gangs_actual - 1
233 || workers_min != 0 || workers_max != 0
234 || vectors_min != 0 || vectors_max != 0)
235 __builtin_abort ();
238 /* GP, WS, VS. */
240 /* There is no actual limit for the number of gangs, so we try with a
241 rather high value. */
242 int gangs = 12345;
243 int gangs_actual = gangs;
244 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
245 gangs_min = workers_min = vectors_min = INT_MAX;
246 gangs_max = workers_max = vectors_max = INT_MIN;
247 #pragma acc parallel copy (gangs_actual) \
248 num_gangs (gangs)
250 if (acc_on_device (acc_device_host))
252 /* We're actually executing with num_gangs (1). */
253 gangs_actual = 1;
255 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
256 for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
258 gangs_min = gangs_max = acc_gang ();
259 workers_min = workers_max = acc_worker ();
260 vectors_min = vectors_max = acc_vector ();
263 if (gangs_actual < 1)
264 __builtin_abort ();
265 if (gangs_min != 0 || gangs_max != gangs_actual - 1
266 || workers_min != 0 || workers_max != 0
267 || vectors_min != 0 || vectors_max != 0)
268 __builtin_abort ();
271 /* GR, WP, VS. */
273 /* We try with an outrageously large value. */
274 #define WORKERS 2 << 20
275 int workers_actual = WORKERS;
276 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
277 gangs_min = workers_min = vectors_min = INT_MAX;
278 gangs_max = workers_max = vectors_max = INT_MIN;
279 #pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
280 num_workers (WORKERS)
282 if (acc_on_device (acc_device_host))
284 /* We're actually executing with num_workers (1). */
285 workers_actual = 1;
287 else if (acc_on_device (acc_device_nvidia))
289 /* The GCC nvptx back end enforces num_workers (32). */
290 workers_actual = 32;
292 else
293 __builtin_abort ();
294 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
295 for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
297 gangs_min = gangs_max = acc_gang ();
298 workers_min = workers_max = acc_worker ();
299 vectors_min = vectors_max = acc_vector ();
302 if (workers_actual < 1)
303 __builtin_abort ();
304 if (gangs_min != 0 || gangs_max != 0
305 || workers_min != 0 || workers_max != workers_actual - 1
306 || vectors_min != 0 || vectors_max != 0)
307 __builtin_abort ();
308 #undef WORKERS
311 /* GR, WP, VS. */
313 /* We try with an outrageously large value. */
314 int workers = 2 << 20;
315 /* For nvptx offloading, this one will not result in "using num_workers
316 (32), ignoring runtime setting", and will in fact try to launch with
317 "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
318 error: invalid argument". So, limit ourselves here. */
319 if (acc_get_device_type () == acc_device_nvidia)
320 workers = 32;
321 int workers_actual = workers;
322 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
323 gangs_min = workers_min = vectors_min = INT_MAX;
324 gangs_max = workers_max = vectors_max = INT_MIN;
325 #pragma acc parallel copy (workers_actual) \
326 num_workers (workers)
328 if (acc_on_device (acc_device_host))
330 /* We're actually executing with num_workers (1). */
331 workers_actual = 1;
333 else if (acc_on_device (acc_device_nvidia))
335 /* We're actually executing with num_workers (32). */
336 /* workers_actual = 32; */
338 else
339 __builtin_abort ();
340 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
341 for (int i = 100 * workers_actual; i > -100 * workers_actual; --i)
343 gangs_min = gangs_max = acc_gang ();
344 workers_min = workers_max = acc_worker ();
345 vectors_min = vectors_max = acc_vector ();
348 if (workers_actual < 1)
349 __builtin_abort ();
350 if (gangs_min != 0 || gangs_max != 0
351 || workers_min != 0 || workers_max != workers_actual - 1
352 || vectors_min != 0 || vectors_max != 0)
353 __builtin_abort ();
356 /* GR, WS, VP. */
358 /* We try with an outrageously large value. */
359 #define VECTORS 2 << 20
360 int vectors_actual = VECTORS;
361 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
362 gangs_min = workers_min = vectors_min = INT_MAX;
363 gangs_max = workers_max = vectors_max = INT_MIN;
364 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
365 vector_length (VECTORS)
367 if (acc_on_device (acc_device_host))
369 /* We're actually executing with vector_length (1). */
370 vectors_actual = 1;
372 else if (acc_on_device (acc_device_nvidia))
374 /* The GCC nvptx back end enforces vector_length (32). */
375 vectors_actual = 32;
377 else
378 __builtin_abort ();
379 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
380 for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
382 gangs_min = gangs_max = acc_gang ();
383 workers_min = workers_max = acc_worker ();
384 vectors_min = vectors_max = acc_vector ();
387 if (vectors_actual < 1)
388 __builtin_abort ();
389 if (gangs_min != 0 || gangs_max != 0
390 || workers_min != 0 || workers_max != 0
391 || vectors_min != 0 || vectors_max != vectors_actual - 1)
392 __builtin_abort ();
393 #undef VECTORS
396 /* GR, WS, VP. */
398 /* We try with an outrageously large value. */
399 int vectors = 2 << 20;
400 int vectors_actual = vectors;
401 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
402 gangs_min = workers_min = vectors_min = INT_MAX;
403 gangs_max = workers_max = vectors_max = INT_MIN;
404 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_configured } } */ \
405 vector_length (vectors)
407 if (acc_on_device (acc_device_host))
409 /* We're actually executing with vector_length (1). */
410 vectors_actual = 1;
412 else if (acc_on_device (acc_device_nvidia))
414 /* The GCC nvptx back end enforces vector_length (32). */
415 vectors_actual = 32;
417 else
418 __builtin_abort ();
419 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
420 for (int i = 100 * vectors_actual; i > -100 * vectors_actual; --i)
422 gangs_min = gangs_max = acc_gang ();
423 workers_min = workers_max = acc_worker ();
424 vectors_min = vectors_max = acc_vector ();
427 if (vectors_actual < 1)
428 __builtin_abort ();
429 if (gangs_min != 0 || gangs_max != 0
430 || workers_min != 0 || workers_max != 0
431 || vectors_min != 0 || vectors_max != vectors_actual - 1)
432 __builtin_abort ();
436 /* Composition of GP, WP, VP. */
438 int gangs = 12345;
439 /* With nvptx offloading, multi-level reductions apparently are very slow
440 in the following case. So, limit ourselves here. */
441 if (acc_get_device_type () == acc_device_nvidia)
442 gangs = 3;
443 int gangs_actual = gangs;
444 #define WORKERS 3
445 int workers_actual = WORKERS;
446 #define VECTORS 11
447 int vectors_actual = VECTORS;
448 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
449 gangs_min = workers_min = vectors_min = INT_MAX;
450 gangs_max = workers_max = vectors_max = INT_MIN;
451 #pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_configured } } */ \
452 num_gangs (gangs) \
453 num_workers (WORKERS) \
454 vector_length (VECTORS)
456 if (acc_on_device (acc_device_host))
458 /* We're actually executing with num_gangs (1), num_workers (1),
459 vector_length (1). */
460 gangs_actual = 1;
461 workers_actual = 1;
462 vectors_actual = 1;
464 else if (acc_on_device (acc_device_nvidia))
466 /* The GCC nvptx back end enforces vector_length (32). */
467 vectors_actual = 32;
469 else
470 __builtin_abort ();
471 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
472 for (int i = 100 * gangs_actual; i > -100 * gangs_actual; --i)
473 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
474 for (int j = 100 * workers_actual; j > -100 * workers_actual; --j)
475 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
476 for (int k = 100 * vectors_actual; k > -100 * vectors_actual; --k)
478 gangs_min = gangs_max = acc_gang ();
479 workers_min = workers_max = acc_worker ();
480 vectors_min = vectors_max = acc_vector ();
483 if (gangs_min != 0 || gangs_max != gangs_actual - 1
484 || workers_min != 0 || workers_max != workers_actual - 1
485 || vectors_min != 0 || vectors_max != vectors_actual - 1)
486 __builtin_abort ();
487 #undef VECTORS
488 #undef WORKERS
492 /* We can't test parallelized OpenACC kernels constructs in this way: use of
493 the acc_gang, acc_worker, acc_vector functions will make the construct
494 unparallelizable. */
497 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
498 kernels. */
500 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
501 gangs_min = workers_min = vectors_min = INT_MAX;
502 gangs_max = workers_max = vectors_max = INT_MIN;
503 #pragma acc kernels
505 /* This is to make the OpenACC kernels construct unparallelizable. */
506 asm volatile ("" : : : "memory");
508 #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
509 for (int i = 100; i > -100; --i)
511 gangs_min = gangs_max = acc_gang ();
512 workers_min = workers_max = acc_worker ();
513 vectors_min = vectors_max = acc_vector ();
516 if (gangs_min != 0 || gangs_max != 1 - 1
517 || workers_min != 0 || workers_max != 1 - 1
518 || vectors_min != 0 || vectors_max != 1 - 1)
519 __builtin_abort ();
523 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
524 kernels even when there are explicit num_gangs, num_workers, or
525 vector_length clauses. */
527 int gangs = 5;
528 #define WORKERS 5
529 #define VECTORS 13
530 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
531 gangs_min = workers_min = vectors_min = INT_MAX;
532 gangs_max = workers_max = vectors_max = INT_MIN;
533 #pragma acc kernels \
534 num_gangs (gangs) \
535 num_workers (WORKERS) \
536 vector_length (VECTORS)
538 /* This is to make the OpenACC kernels construct unparallelizable. */
539 asm volatile ("" : : : "memory");
541 #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
542 for (int i = 100; i > -100; --i)
544 gangs_min = gangs_max = acc_gang ();
545 workers_min = workers_max = acc_worker ();
546 vectors_min = vectors_max = acc_vector ();
549 if (gangs_min != 0 || gangs_max != 1 - 1
550 || workers_min != 0 || workers_max != 1 - 1
551 || vectors_min != 0 || vectors_max != 1 - 1)
552 __builtin_abort ();
553 #undef VECTORS
554 #undef WORKERS
558 return 0;