1 /* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
6 #include <gomp-constants.h>
8 /* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
9 not behaving as expected for -O0. */
10 #pragma acc routine seq
11 static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
13 if (acc_on_device ((int) acc_device_host
))
15 else if (acc_on_device ((int) acc_device_nvidia
))
16 return __builtin_goacc_parlevel_id (GOMP_DIM_GANG
);
21 #pragma acc routine seq
22 static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
24 if (acc_on_device ((int) acc_device_host
))
26 else if (acc_on_device ((int) acc_device_nvidia
))
27 return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER
);
32 #pragma acc routine seq
33 static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
35 if (acc_on_device ((int) acc_device_host
))
37 else if (acc_on_device ((int) acc_device_nvidia
))
38 return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR
);
46 acc_init (acc_device_default
);
48 /* Non-positive value. */
52 #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
53 int gangs_actual
= GANGS
;
54 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
55 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
56 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
57 #pragma acc parallel copy (gangs_actual) \
58 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
59 num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
61 /* We're actually executing with num_gangs (1). */
63 for (int i
= 100 * gangs_actual
; i
> -100 * gangs_actual
; --i
)
65 /* <https://gcc.gnu.org/PR80547>. */
67 gangs_min
= gangs_max
= acc_gang ();
68 workers_min
= workers_max
= acc_worker ();
69 vectors_min
= vectors_max
= acc_vector ();
71 int gangs
= acc_gang ();
72 gangs_min
= (gangs_min
< gangs
) ? gangs_min
: gangs
;
73 gangs_max
= (gangs_max
> gangs
) ? gangs_max
: gangs
;
74 int workers
= acc_worker ();
75 workers_min
= (workers_min
< workers
) ? workers_min
: workers
;
76 workers_max
= (workers_max
> workers
) ? workers_max
: workers
;
77 int vectors
= acc_vector ();
78 vectors_min
= (vectors_min
< vectors
) ? vectors_min
: vectors
;
79 vectors_max
= (vectors_max
> vectors
) ? vectors_max
: vectors
;
83 if (gangs_actual
!= 1)
85 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
86 || workers_min
!= 0 || workers_max
!= 0
87 || vectors_min
!= 0 || vectors_max
!= 0)
94 #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
95 int gangs_actual
= GANGS
;
96 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
97 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
98 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
99 #pragma acc parallel copy (gangs_actual) \
100 num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
102 /* We're actually executing with num_gangs (1). */
104 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
105 for (int i
= 100 * gangs_actual
; i
> -100 * gangs_actual
; --i
)
107 gangs_min
= gangs_max
= acc_gang ();
108 workers_min
= workers_max
= acc_worker ();
109 vectors_min
= vectors_max
= acc_vector ();
112 if (gangs_actual
!= 1)
114 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
115 || workers_min
!= 0 || workers_max
!= 0
116 || vectors_min
!= 0 || vectors_max
!= 0)
123 #define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
124 int workers_actual
= WORKERS
;
125 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
126 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
127 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
128 #pragma acc parallel copy (workers_actual) \
129 num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
131 /* We're actually executing with num_workers (1). */
133 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
134 for (int i
= 100 * workers_actual
; i
> -100 * workers_actual
; --i
)
136 gangs_min
= gangs_max
= acc_gang ();
137 workers_min
= workers_max
= acc_worker ();
138 vectors_min
= vectors_max
= acc_vector ();
141 if (workers_actual
!= 1)
143 if (gangs_min
!= 0 || gangs_max
!= 0
144 || workers_min
!= 0 || workers_max
!= workers_actual
- 1
145 || vectors_min
!= 0 || vectors_max
!= 0)
152 #define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
153 int vectors_actual
= VECTORS
;
154 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
155 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
156 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
157 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured } } */ \
158 vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
160 /* We're actually executing with vector_length (1), just the GCC nvptx
161 back end enforces vector_length (32). */
162 if (acc_on_device (acc_device_nvidia
))
166 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
167 for (int i
= 100 * vectors_actual
; i
> -100 * vectors_actual
; --i
)
169 gangs_min
= gangs_max
= acc_gang ();
170 workers_min
= workers_max
= acc_worker ();
171 vectors_min
= vectors_max
= acc_vector ();
174 if (acc_get_device_type () == acc_device_nvidia
)
176 if (vectors_actual
!= 32)
180 if (vectors_actual
!= 1)
182 if (gangs_min
!= 0 || gangs_max
!= 0
183 || workers_min
!= 0 || workers_max
!= 0
184 || vectors_min
!= 0 || vectors_max
!= vectors_actual
- 1)
194 /* There is no actual limit for the number of gangs, so we try with a
195 rather high value. */
197 int gangs_actual
= gangs
;
198 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
199 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
200 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
201 #pragma acc parallel copy (gangs_actual) \
202 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
205 if (acc_on_device (acc_device_host
))
207 /* We're actually executing with num_gangs (1). */
210 /* As we're executing GR not GP, don't multiply with a "gangs_actual"
212 for (int i
= 100 /* * gangs_actual */; i
> -100 /* * gangs_actual */; --i
)
214 gangs_min
= gangs_max
= acc_gang ();
215 workers_min
= workers_max
= acc_worker ();
216 vectors_min
= vectors_max
= acc_vector ();
219 if (gangs_actual
< 1)
221 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
222 || workers_min
!= 0 || workers_max
!= 0
223 || vectors_min
!= 0 || vectors_max
!= 0)
229 /* There is no actual limit for the number of gangs, so we try with a
230 rather high value. */
232 int gangs_actual
= gangs
;
233 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
234 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
235 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
236 #pragma acc parallel copy (gangs_actual) \
239 if (acc_on_device (acc_device_host
))
241 /* We're actually executing with num_gangs (1). */
244 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
245 for (int i
= 100 * gangs_actual
; i
> -100 * gangs_actual
; --i
)
247 gangs_min
= gangs_max
= acc_gang ();
248 workers_min
= workers_max
= acc_worker ();
249 vectors_min
= vectors_max
= acc_vector ();
252 if (gangs_actual
< 1)
254 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
255 || workers_min
!= 0 || workers_max
!= 0
256 || vectors_min
!= 0 || vectors_max
!= 0)
262 /* We try with an outrageously large value. */
263 #define WORKERS 2 << 20
264 int workers_actual
= WORKERS
;
265 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
266 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
267 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
268 #pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
269 num_workers (WORKERS)
271 if (acc_on_device (acc_device_host
))
273 /* We're actually executing with num_workers (1). */
276 else if (acc_on_device (acc_device_nvidia
))
278 /* The GCC nvptx back end enforces num_workers (32). */
283 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
284 for (int i
= 100 * workers_actual
; i
> -100 * workers_actual
; --i
)
286 gangs_min
= gangs_max
= acc_gang ();
287 workers_min
= workers_max
= acc_worker ();
288 vectors_min
= vectors_max
= acc_vector ();
291 if (workers_actual
< 1)
293 if (gangs_min
!= 0 || gangs_max
!= 0
294 || workers_min
!= 0 || workers_max
!= workers_actual
- 1
295 || vectors_min
!= 0 || vectors_max
!= 0)
302 /* We try with an outrageously large value. */
303 int workers
= 2 << 20;
304 /* For nvptx offloading, this one will not result in "using num_workers
305 (32), ignoring runtime setting", and will in fact try to launch with
306 "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
307 error: invalid argument". So, limit ourselves here. */
308 if (acc_get_device_type () == acc_device_nvidia
)
310 int workers_actual
= workers
;
311 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
312 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
313 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
314 #pragma acc parallel copy (workers_actual) \
315 num_workers (workers)
317 if (acc_on_device (acc_device_host
))
319 /* We're actually executing with num_workers (1). */
322 else if (acc_on_device (acc_device_nvidia
))
324 /* We're actually executing with num_workers (32). */
325 /* workers_actual = 32; */
329 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
330 for (int i
= 100 * workers_actual
; i
> -100 * workers_actual
; --i
)
332 gangs_min
= gangs_max
= acc_gang ();
333 workers_min
= workers_max
= acc_worker ();
334 vectors_min
= vectors_max
= acc_vector ();
337 if (workers_actual
< 1)
339 if (gangs_min
!= 0 || gangs_max
!= 0
340 || workers_min
!= 0 || workers_max
!= workers_actual
- 1
341 || vectors_min
!= 0 || vectors_max
!= 0)
347 /* We try with an outrageously large value. */
348 #define VECTORS 2 << 20
349 int vectors_actual
= VECTORS
;
350 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
351 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
352 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
353 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
354 vector_length (VECTORS)
356 if (acc_on_device (acc_device_host
))
358 /* We're actually executing with vector_length (1). */
361 else if (acc_on_device (acc_device_nvidia
))
363 /* The GCC nvptx back end enforces vector_length (32). */
368 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
369 for (int i
= 100 * vectors_actual
; i
> -100 * vectors_actual
; --i
)
371 gangs_min
= gangs_max
= acc_gang ();
372 workers_min
= workers_max
= acc_worker ();
373 vectors_min
= vectors_max
= acc_vector ();
376 if (vectors_actual
< 1)
378 if (gangs_min
!= 0 || gangs_max
!= 0
379 || workers_min
!= 0 || workers_max
!= 0
380 || vectors_min
!= 0 || vectors_max
!= vectors_actual
- 1)
387 /* We try with an outrageously large value. */
388 int vectors
= 2 << 20;
389 int vectors_actual
= vectors
;
390 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
391 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
392 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
393 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_configured } } */ \
394 vector_length (vectors)
396 if (acc_on_device (acc_device_host
))
398 /* We're actually executing with vector_length (1). */
401 else if (acc_on_device (acc_device_nvidia
))
403 /* The GCC nvptx back end enforces vector_length (32). */
408 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
409 for (int i
= 100 * vectors_actual
; i
> -100 * vectors_actual
; --i
)
411 gangs_min
= gangs_max
= acc_gang ();
412 workers_min
= workers_max
= acc_worker ();
413 vectors_min
= vectors_max
= acc_vector ();
416 if (vectors_actual
< 1)
418 if (gangs_min
!= 0 || gangs_max
!= 0
419 || workers_min
!= 0 || workers_max
!= 0
420 || vectors_min
!= 0 || vectors_max
!= vectors_actual
- 1)
425 /* Composition of GP, WP, VP. */
428 /* With nvptx offloading, multi-level reductions apparently are very slow
429 in the following case. So, limit ourselves here. */
430 if (acc_get_device_type () == acc_device_nvidia
)
432 int gangs_actual
= gangs
;
434 int workers_actual
= WORKERS
;
436 int vectors_actual
= VECTORS
;
437 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
438 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
439 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
440 #pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_configured } } */ \
442 num_workers (WORKERS) \
443 vector_length (VECTORS)
445 if (acc_on_device (acc_device_host
))
447 /* We're actually executing with num_gangs (1), num_workers (1),
448 vector_length (1). */
453 else if (acc_on_device (acc_device_nvidia
))
455 /* The GCC nvptx back end enforces vector_length (32). */
460 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
461 for (int i
= 100 * gangs_actual
; i
> -100 * gangs_actual
; --i
)
462 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
463 for (int j
= 100 * workers_actual
; j
> -100 * workers_actual
; --j
)
464 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
465 for (int k
= 100 * vectors_actual
; k
> -100 * vectors_actual
; --k
)
467 gangs_min
= gangs_max
= acc_gang ();
468 workers_min
= workers_max
= acc_worker ();
469 vectors_min
= vectors_max
= acc_vector ();
472 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
473 || workers_min
!= 0 || workers_max
!= workers_actual
- 1
474 || vectors_min
!= 0 || vectors_max
!= vectors_actual
- 1)
481 /* We can't test parallelized OpenACC kernels constructs in this way: use of
482 the acc_gang, acc_worker, acc_vector functions will make the construct
486 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
489 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
490 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
491 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
494 /* This is to make the OpenACC kernels construct unparallelizable. */
495 asm volatile ("" : : : "memory");
497 #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
498 for (int i
= 100; i
> -100; --i
)
500 gangs_min
= gangs_max
= acc_gang ();
501 workers_min
= workers_max
= acc_worker ();
502 vectors_min
= vectors_max
= acc_vector ();
505 if (gangs_min
!= 0 || gangs_max
!= 1 - 1
506 || workers_min
!= 0 || workers_max
!= 1 - 1
507 || vectors_min
!= 0 || vectors_max
!= 1 - 1)
512 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
513 kernels even when there are explicit num_gangs, num_workers, or
514 vector_length clauses. */
519 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
520 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
521 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
522 #pragma acc kernels \
524 num_workers (WORKERS) \
525 vector_length (VECTORS)
527 /* This is to make the OpenACC kernels construct unparallelizable. */
528 asm volatile ("" : : : "memory");
530 #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
531 for (int i
= 100; i
> -100; --i
)
533 gangs_min
= gangs_max
= acc_gang ();
534 workers_min
= workers_max
= acc_worker ();
535 vectors_min
= vectors_max
= acc_vector ();
538 if (gangs_min
!= 0 || gangs_max
!= 1 - 1
539 || workers_min
!= 0 || workers_max
!= 1 - 1
540 || vectors_min
!= 0 || vectors_max
!= 1 - 1)