Daily bump.
[official-gcc.git] / libgomp / testsuite / libgomp.oacc-c-c++-common / parallel-dims.c
blob6798e23ef7012c35bed1264218223b687c50934f
1 /* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
2 vector_length. */
4 /* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
6 /* { dg-additional-options "--param=openacc-kernels=decompose" } */
8 /* { dg-additional-options "-fopt-info-all-omp" }
9 { dg-additional-options "-foffload=-fopt-info-all-omp" } */
11 /* { dg-additional-options "--param=openacc-privatization=noisy" }
12 { dg-additional-options "-foffload=--param=openacc-privatization=noisy" }
13 Prune a few: uninteresting, and potentially varying depending on GCC configuration (data types):
14 { dg-prune-output {note: variable 'D\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} } */
16 /* It's only with Tcl 8.5 (released in 2007) that "the variable 'varName'
17 passed to 'incr' may be unset, and in that case, it will be set to [...]",
18 so to maintain compatibility with earlier Tcl releases, we manually
19 initialize counter variables:
20 { dg-line l_dummy[variable c_compute 0 c_loop_i 0 c_loop_j 0 c_loop_k 0] }
21 { dg-message dummy {} { target iN-VAl-Id } l_dummy } to avoid
22 "WARNING: dg-line var l_dummy defined, but not used". */
24 /* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
25 aspects of that functionality. */
27 /* See also '../libgomp.oacc-fortran/parallel-dims.f90'. */
29 #include <limits.h>
30 #include <openacc.h>
31 #include <gomp-constants.h>
/* Return what '__builtin_goacc_parlevel_id' reports for the gang dimension
   ('GOMP_DIM_GANG').  Declared as an OpenACC 'seq' routine and marked
   'always_inline'; 'main' uses the result to track the minimum and maximum
   gang position seen inside each compute construct.  */
33 #pragma acc routine seq
34 inline __attribute__ ((always_inline))
35 static int acc_gang ()
37 return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
/* Return what '__builtin_goacc_parlevel_id' reports for the worker dimension
   ('GOMP_DIM_WORKER').  Declared as an OpenACC 'seq' routine and marked
   'always_inline'; 'main' uses the result to track the minimum and maximum
   worker position seen inside each compute construct.  */
40 #pragma acc routine seq
41 inline __attribute__ ((always_inline))
42 static int acc_worker ()
44 return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
/* Return what '__builtin_goacc_parlevel_id' reports for the vector dimension
   ('GOMP_DIM_VECTOR').  Declared as an OpenACC 'seq' routine and marked
   'always_inline'; 'main' uses the result to track the minimum and maximum
   vector position seen inside each compute construct.  */
47 #pragma acc routine seq
48 inline __attribute__ ((always_inline))
49 static int acc_vector ()
51 return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
/* 'N' scales the trip counts of the test loops in 'main' (they iterate from
   'N * x' down to '-N * x').  It is doubled when 'EXPENSIVE' is defined,
   which the 'dg-additional-options "-DEXPENSIVE"' directive at the top of the
   file does for 'run_expensive_tests' targets.  */
54 #ifdef EXPENSIVE
55 #define N 100
56 #else
57 #define N 50
58 #endif
60 int main ()
62 acc_init (acc_device_default);
64 /* OpenACC parallel construct. */
66 /* Non-positive value. */
68 /* GR, WS, VS. */
70 #define GANGS 0
71 /* { dg-warning {'num_gangs' value must be positive} {} { target c } .-1 } */
72 int gangs_actual = GANGS;
73 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
74 gangs_min = workers_min = vectors_min = INT_MAX;
75 gangs_max = workers_max = vectors_max = INT_MIN;
76 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
77 copy (gangs_actual) \
78 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
79 num_gangs (GANGS)
80 /* { dg-note {in expansion of macro 'GANGS'} {} { target c } .-1 } */
81 /* { dg-warning {'num_gangs' value must be positive} {} { target c++ } .-2 } */
82 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
84 /* We're actually executing with num_gangs (1). */
85 gangs_actual = 1;
86 for (int i = N * gangs_actual; i > -N * gangs_actual; --i)
88 gangs_min = gangs_max = acc_gang ();
89 workers_min = workers_max = acc_worker ();
90 vectors_min = vectors_max = acc_vector ();
93 if (gangs_actual != 1)
94 __builtin_abort ();
95 if (gangs_min != 0 || gangs_max != gangs_actual - 1
96 || workers_min != 0 || workers_max != 0
97 || vectors_min != 0 || vectors_max != 0)
98 __builtin_abort ();
99 #undef GANGS
102 /* GP, WS, VS. */
104 #define GANGS 0
105 /* { dg-warning {'num_gangs' value must be positive} {} { target c } .-1 } */
106 int gangs_actual = GANGS;
107 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
108 gangs_min = workers_min = vectors_min = INT_MAX;
109 gangs_max = workers_max = vectors_max = INT_MIN;
110 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
111 copy (gangs_actual) \
112 num_gangs (GANGS)
113 /* { dg-note {in expansion of macro 'GANGS'} {} { target c } .-1 } */
114 /* { dg-warning {'num_gangs' value must be positive} {} { target c++ } .-2 } */
115 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
116 /* { dg-warning {region contains gang partitioned code but is not gang partitioned} {} { target *-*-* } l_compute$c_compute } */
118 /* We're actually executing with num_gangs (1). */
119 gangs_actual = 1;
120 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
121 gang \
122 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
123 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
124 /* { dg-optimized {assigned OpenACC gang loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
125 for (int i = N * gangs_actual; i > -N * gangs_actual; --i)
127 gangs_min = gangs_max = acc_gang ();
128 workers_min = workers_max = acc_worker ();
129 vectors_min = vectors_max = acc_vector ();
132 if (gangs_actual != 1)
133 __builtin_abort ();
134 if (gangs_min != 0 || gangs_max != gangs_actual - 1
135 || workers_min != 0 || workers_max != 0
136 || vectors_min != 0 || vectors_max != 0)
137 __builtin_abort ();
138 #undef GANGS
141 /* GR, WP, VS. */
143 #define WORKERS 0
144 /* { dg-warning {'num_workers' value must be positive} {} { target c } .-1 } */
145 int workers_actual = WORKERS;
146 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
147 gangs_min = workers_min = vectors_min = INT_MAX;
148 gangs_max = workers_max = vectors_max = INT_MIN;
149 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
150 copy (workers_actual) \
151 num_workers (WORKERS)
152 /* { dg-note {in expansion of macro 'WORKERS'} {} { target c } .-1 } */
153 /* { dg-warning {'num_workers' value must be positive} {} { target c++ } .-2 } */
154 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
155 /* { dg-warning {region contains worker partitioned code but is not worker partitioned} {} { target *-*-* } l_compute$c_compute } */
157 /* We're actually executing with num_workers (1). */
158 workers_actual = 1;
159 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
160 worker \
161 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
162 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
163 /* { dg-optimized {assigned OpenACC worker loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
164 for (int i = N * workers_actual; i > -N * workers_actual; --i)
166 gangs_min = gangs_max = acc_gang ();
167 workers_min = workers_max = acc_worker ();
168 vectors_min = vectors_max = acc_vector ();
171 if (workers_actual != 1)
172 __builtin_abort ();
173 if (gangs_min != 0 || gangs_max != 0
174 || workers_min != 0 || workers_max != workers_actual - 1
175 || vectors_min != 0 || vectors_max != 0)
176 __builtin_abort ();
177 #undef WORKERS
180 /* GR, WS, VP. */
182 #define VECTORS 0
183 /* { dg-warning {'vector_length' value must be positive} {} { target c } .-1 } */
184 int vectors_actual = VECTORS;
185 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
186 gangs_min = workers_min = vectors_min = INT_MAX;
187 gangs_max = workers_max = vectors_max = INT_MIN;
188 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
189 copy (vectors_actual) \
190 vector_length (VECTORS)
191 /* { dg-note {in expansion of macro 'VECTORS'} {} { target c } .-1 } */
192 /* { dg-warning {'vector_length' value must be positive} {} { target c++ } .-2 } */
193 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
194 /* { dg-warning {region contains vector partitioned code but is not vector partitioned} {} { target *-*-* } l_compute$c_compute } */
195 /* { dg-warning {using 'vector_length \(32\)', ignoring 1} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
197 /* We're actually executing with vector_length (1), just the GCC nvptx
198 back end enforces vector_length (32). */
199 if (acc_on_device (acc_device_nvidia))
200 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
201 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
202 vectors_actual = 32;
203 else
204 vectors_actual = 1;
205 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
206 vector \
207 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
208 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
209 /* { dg-optimized {assigned OpenACC vector loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
210 for (int i = N * vectors_actual; i > -N * vectors_actual; --i)
212 gangs_min = gangs_max = acc_gang ();
213 workers_min = workers_max = acc_worker ();
214 vectors_min = vectors_max = acc_vector ();
217 if (acc_get_device_type () == acc_device_nvidia)
219 if (vectors_actual != 32)
220 __builtin_abort ();
222 else
223 if (vectors_actual != 1)
224 __builtin_abort ();
225 if (gangs_min != 0 || gangs_max != 0
226 || workers_min != 0 || workers_max != 0
227 || vectors_min != 0 || vectors_max != vectors_actual - 1)
228 __builtin_abort ();
229 #undef VECTORS
233 /* High value. */
235 /* GR, WS, VS. */
237 /* There is no actual limit for the number of gangs, so we try with a
238 rather high value. */
239 int gangs = 12345;
240 int gangs_actual = gangs;
241 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
242 gangs_min = workers_min = vectors_min = INT_MAX;
243 gangs_max = workers_max = vectors_max = INT_MIN;
244 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
245 copy (gangs_actual) \
246 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
247 num_gangs (gangs)
248 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
249 /* { dg-bogus {warning: region is gang partitioned but does not contain gang partitioned code} {TODO 'reduction'} { xfail *-*-* } l_compute$c_compute } */
251 if (acc_on_device (acc_device_host))
252 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
253 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
255 /* We're actually executing with num_gangs (1). */
256 gangs_actual = 1;
258 /* As we're executing GR not GP, don't multiply with a "gangs_actual"
259 factor. */
260 for (int i = N /* * gangs_actual */; i > -N /* * gangs_actual */; --i)
262 gangs_min = gangs_max = acc_gang ();
263 workers_min = workers_max = acc_worker ();
264 vectors_min = vectors_max = acc_vector ();
267 if (gangs_actual < 1)
268 __builtin_abort ();
269 if (gangs_min != 0 || gangs_max != gangs_actual - 1
270 || workers_min != 0 || workers_max != 0
271 || vectors_min != 0 || vectors_max != 0)
272 __builtin_abort ();
275 /* GP, WS, VS. */
277 /* There is no actual limit for the number of gangs, so we try with a
278 rather high value. */
279 int gangs = 12345;
280 int gangs_actual = gangs;
281 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
282 gangs_min = workers_min = vectors_min = INT_MAX;
283 gangs_max = workers_max = vectors_max = INT_MIN;
284 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
285 copy (gangs_actual) \
286 num_gangs (gangs)
287 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
289 if (acc_on_device (acc_device_host))
290 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
291 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
293 /* We're actually executing with num_gangs (1). */
294 gangs_actual = 1;
296 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
297 gang \
298 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
299 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
300 /* { dg-optimized {assigned OpenACC gang loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
301 for (int i = N * gangs_actual; i > -N * gangs_actual; --i)
303 gangs_min = gangs_max = acc_gang ();
304 workers_min = workers_max = acc_worker ();
305 vectors_min = vectors_max = acc_vector ();
308 if (gangs_actual < 1)
309 __builtin_abort ();
310 if (gangs_min != 0 || gangs_max != gangs_actual - 1
311 || workers_min != 0 || workers_max != 0
312 || vectors_min != 0 || vectors_max != 0)
313 __builtin_abort ();
316 /* GR, WP, VS. */
318 /* We try with an outrageously large value. */
319 #define WORKERS 2 << 20
320 int workers_actual = WORKERS;
321 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
322 gangs_min = workers_min = vectors_min = INT_MAX;
323 gangs_max = workers_max = vectors_max = INT_MIN;
324 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
325 copy (workers_actual) \
326 num_workers (WORKERS)
327 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
328 /* { dg-warning {using 'num_workers \(32\)', ignoring 2097152} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
330 if (acc_on_device (acc_device_host))
331 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
332 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
334 /* We're actually executing with num_workers (1). */
335 workers_actual = 1;
337 else if (acc_on_device (acc_device_nvidia))
338 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
339 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
341 /* The GCC nvptx back end enforces num_workers (32). */
342 workers_actual = 32;
344 else if (acc_on_device (acc_device_radeon))
345 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
346 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
348 /* The GCC GCN back end is limited to num_workers (16). */
349 workers_actual = 16;
351 else
352 __builtin_abort ();
353 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
354 worker \
355 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
356 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
357 /* { dg-optimized {assigned OpenACC worker loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
358 for (int i = N * workers_actual; i > -N * workers_actual; --i)
360 gangs_min = gangs_max = acc_gang ();
361 workers_min = workers_max = acc_worker ();
362 vectors_min = vectors_max = acc_vector ();
365 if (workers_actual < 1)
366 __builtin_abort ();
367 if (gangs_min != 0 || gangs_max != 0
368 || workers_min != 0 || workers_max != workers_actual - 1
369 || vectors_min != 0 || vectors_max != 0)
370 __builtin_abort ();
371 #undef WORKERS
374 /* GR, WP, VS. */
376 /* We try with an outrageously large value. */
377 int workers = 2 << 20;
378 /* For nvptx offloading, this one will not result in "using num_workers
379 (32), ignoring runtime setting", and will in fact try to launch with
380 "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
381 error: invalid argument". So, limit ourselves here. */
382 if (acc_get_device_type () == acc_device_nvidia)
383 workers = 32;
384 int workers_actual = workers;
385 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
386 gangs_min = workers_min = vectors_min = INT_MAX;
387 gangs_max = workers_max = vectors_max = INT_MIN;
388 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
389 copy (workers_actual) \
390 num_workers (workers)
391 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
393 if (acc_on_device (acc_device_host))
394 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
395 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
397 /* We're actually executing with num_workers (1). */
398 workers_actual = 1;
400 else if (acc_on_device (acc_device_nvidia))
401 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
402 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
404 /* We're actually executing with num_workers (32). */
405 /* workers_actual = 32; */
407 else if (acc_on_device (acc_device_radeon))
408 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
409 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
411 /* The GCC GCN back end is limited to num_workers (16). */
412 workers_actual = 16;
414 else
415 __builtin_abort ();
416 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
417 worker \
418 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
419 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
420 /* { dg-optimized {assigned OpenACC worker loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
421 for (int i = N * workers_actual; i > -N * workers_actual; --i)
423 gangs_min = gangs_max = acc_gang ();
424 workers_min = workers_max = acc_worker ();
425 vectors_min = vectors_max = acc_vector ();
428 if (workers_actual < 1)
429 __builtin_abort ();
430 if (gangs_min != 0 || gangs_max != 0
431 || workers_min != 0 || workers_max != workers_actual - 1
432 || vectors_min != 0 || vectors_max != 0)
433 __builtin_abort ();
436 /* GR, WS, VP. */
438 /* We try with an outrageously large value. */
439 #define VECTORS 2 << 20
440 int vectors_actual = VECTORS;
441 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
442 gangs_min = workers_min = vectors_min = INT_MAX;
443 gangs_max = workers_max = vectors_max = INT_MIN;
444 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
445 copy (vectors_actual) \
446 vector_length (VECTORS)
447 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
448 /* { dg-warning {using 'vector_length \(1024\)', ignoring 2097152} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
450 if (acc_on_device (acc_device_host))
451 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
452 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
454 /* We're actually executing with vector_length (1). */
455 vectors_actual = 1;
457 else if (acc_on_device (acc_device_nvidia))
458 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
459 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
461 /* The GCC nvptx back end reduces to vector_length (1024). */
462 vectors_actual = 1024;
464 else if (acc_on_device (acc_device_radeon))
465 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
466 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
468 /* The GCC GCN back end enforces vector_length (1): autovectorize. */
469 vectors_actual = 1;
471 else
472 __builtin_abort ();
473 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
474 vector \
475 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
476 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
477 /* { dg-optimized {assigned OpenACC vector loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
478 for (int i = N * vectors_actual; i > -N * vectors_actual; --i)
480 gangs_min = gangs_max = acc_gang ();
481 workers_min = workers_max = acc_worker ();
482 vectors_min = vectors_max = acc_vector ();
485 if (vectors_actual < 1)
486 __builtin_abort ();
487 if (gangs_min != 0 || gangs_max != 0
488 || workers_min != 0 || workers_max != 0
489 || vectors_min != 0 || vectors_max != vectors_actual - 1)
490 __builtin_abort ();
491 #undef VECTORS
494 /* GR, WS, VP. */
496 /* We try with an outrageously large value. */
497 int vectors = 2 << 20;
498 int vectors_actual = vectors;
499 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
500 gangs_min = workers_min = vectors_min = INT_MAX;
501 gangs_max = workers_max = vectors_max = INT_MIN;
502 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
503 copy (vectors_actual) \
504 vector_length (vectors)
505 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
506 /* { dg-warning {using 'vector_length \(32\)', ignoring runtime setting} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
508 if (acc_on_device (acc_device_host))
509 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
510 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
512 /* We're actually executing with vector_length (1). */
513 vectors_actual = 1;
515 else if (acc_on_device (acc_device_nvidia))
516 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
517 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
519 /* The GCC nvptx back end enforces vector_length (32). */
520 vectors_actual = 32;
522 else if (acc_on_device (acc_device_radeon))
523 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
524 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
526 /* Because of the way vectors are implemented for GCN, a vector loop
527 containing a seq routine call will not vectorize calls to that
528 routine. Hence, we'll only get one "vector". */
529 vectors_actual = 1;
531 else
532 __builtin_abort ();
533 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
534 vector \
535 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
536 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
537 /* { dg-optimized {assigned OpenACC vector loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
538 for (int i = N * vectors_actual; i > -N * vectors_actual; --i)
540 gangs_min = gangs_max = acc_gang ();
541 workers_min = workers_max = acc_worker ();
542 vectors_min = vectors_max = acc_vector ();
545 if (vectors_actual < 1)
546 __builtin_abort ();
547 if (gangs_min != 0 || gangs_max != 0
548 || workers_min != 0 || workers_max != 0
549 || vectors_min != 0 || vectors_max != vectors_actual - 1)
550 __builtin_abort ();
554 /* Composition of GP, WP, VP. */
556 int gangs = 12345;
557 /* With nvptx offloading, multi-level reductions apparently are very slow
558 in the following case. So, limit ourselves here. */
559 if (acc_get_device_type () == acc_device_nvidia)
560 gangs = 3;
561 /* Similar appears to be true for GCN. */
562 if (acc_get_device_type () == acc_device_radeon)
563 gangs = 3;
564 int gangs_actual = gangs;
565 #define WORKERS 3
566 int workers_actual = WORKERS;
567 #define VECTORS 11
568 int vectors_actual = VECTORS;
569 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
570 gangs_min = workers_min = vectors_min = INT_MAX;
571 gangs_max = workers_max = vectors_max = INT_MIN;
572 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
573 copy (gangs_actual, workers_actual, vectors_actual) \
574 num_gangs (gangs) \
575 num_workers (WORKERS) \
576 vector_length (VECTORS)
577 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
578 /* { dg-warning {using 'vector_length \(32\)', ignoring 11} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
580 if (acc_on_device (acc_device_host))
581 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
582 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
584 /* We're actually executing with num_gangs (1), num_workers (1),
585 vector_length (1). */
586 gangs_actual = 1;
587 workers_actual = 1;
588 vectors_actual = 1;
590 else if (acc_on_device (acc_device_nvidia))
591 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
592 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
594 /* The GCC nvptx back end enforces vector_length (32). */
595 vectors_actual = 32;
597 else if (acc_on_device (acc_device_radeon))
598 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
599 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
601 /* See above comments about GCN vectors_actual. */
602 vectors_actual = 1;
604 else
605 __builtin_abort ();
606 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
607 gang \
608 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
609 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
610 /* { dg-note {variable 'j' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
611 /* { dg-optimized {assigned OpenACC gang loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
612 for (int i = N * gangs_actual; i > -N * gangs_actual; --i)
613 #pragma acc loop /* { dg-line l_loop_j[incr c_loop_j] } */ \
614 worker \
615 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
616 /* { dg-note {variable 'j' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_j$c_loop_j } */
617 /* { dg-note {variable 'k' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_j$c_loop_j } */
618 /* { dg-optimized {assigned OpenACC worker loop parallelism} {} { target *-*-* } l_loop_j$c_loop_j } */
619 for (int j = N * workers_actual; j > -N * workers_actual; --j)
620 #pragma acc loop /* { dg-line l_loop_k[incr c_loop_k] } */ \
621 vector \
622 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
623 /* { dg-note {variable 'k' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_k$c_loop_k } */
624 /* { dg-optimized {assigned OpenACC vector loop parallelism} {} { target *-*-* } l_loop_k$c_loop_k } */
625 for (int k = N * vectors_actual; k > -N * vectors_actual; --k)
627 gangs_min = gangs_max = acc_gang ();
628 workers_min = workers_max = acc_worker ();
629 vectors_min = vectors_max = acc_vector ();
632 if (gangs_min != 0 || gangs_max != gangs_actual - 1
633 || workers_min != 0 || workers_max != workers_actual - 1
634 || vectors_min != 0 || vectors_max != vectors_actual - 1)
635 __builtin_abort ();
636 #undef VECTORS
637 #undef WORKERS
641 /* OpenACC kernels construct. */
643 /* We can't test parallelized OpenACC kernels constructs in this way: use of
644 the acc_gang, acc_worker, acc_vector functions will make the construct
645 unparallelizable. */
648 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
649 kernels. */
651 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
652 gangs_min = workers_min = vectors_min = INT_MAX;
653 gangs_max = workers_max = vectors_max = INT_MIN;
654 #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */
655 /* { dg-note {OpenACC 'kernels' decomposition: variable 'vectors_max' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
656 { dg-note {variable 'vectors_max' made addressable} {} { target *-*-* } l_compute$c_compute } */
657 /* { dg-note {OpenACC 'kernels' decomposition: variable 'vectors_min' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
658 { dg-note {variable 'vectors_min' made addressable} {} { target *-*-* } l_compute$c_compute } */
659 /* { dg-note {OpenACC 'kernels' decomposition: variable 'workers_max' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
660 { dg-note {variable 'workers_max' made addressable} {} { target *-*-* } l_compute$c_compute } */
661 /* { dg-note {OpenACC 'kernels' decomposition: variable 'workers_min' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
662 { dg-note {variable 'workers_min' made addressable} {} { target *-*-* } l_compute$c_compute } */
663 /* { dg-note {OpenACC 'kernels' decomposition: variable 'gangs_max' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
664 { dg-note {variable 'gangs_max' made addressable} {} { target *-*-* } l_compute$c_compute } */
665 /* { dg-note {OpenACC 'kernels' decomposition: variable 'gangs_min' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
666 { dg-note {variable 'gangs_min' made addressable} {} { target *-*-* } l_compute$c_compute } */
668 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
669 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
670 /* { dg-note {forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis} {} { target *-*-* } l_loop_i$c_loop_i } */
671 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
672 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
673 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
674 for (int i = N; i > -N; --i)
676 /* This is to make the loop unparallelizable. */
677 asm volatile ("" : : : "memory");
679 gangs_min = gangs_max = acc_gang ();
680 workers_min = workers_max = acc_worker ();
681 vectors_min = vectors_max = acc_vector ();
684 if (gangs_min != 0 || gangs_max != 1 - 1
685 || workers_min != 0 || workers_max != 1 - 1
686 || vectors_min != 0 || vectors_max != 1 - 1)
687 __builtin_abort ();
691 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
692 kernels even when there are explicit num_gangs, num_workers, or
693 vector_length clauses. */
695 int gangs = 5;
696 #define WORKERS 5
697 #define VECTORS 13
698 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
699 gangs_min = workers_min = vectors_min = INT_MAX;
700 gangs_max = workers_max = vectors_max = INT_MIN;
701 #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */ \
702 num_gangs (gangs) \
703 num_workers (WORKERS) \
704 vector_length (VECTORS)
705 /* { dg-note {OpenACC 'kernels' decomposition: variable 'vectors_max' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
706 { dg-note {variable 'vectors_max' made addressable} {} { target *-*-* } l_compute$c_compute } */
707 /* { dg-note {OpenACC 'kernels' decomposition: variable 'vectors_min' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
708 { dg-note {variable 'vectors_min' made addressable} {} { target *-*-* } l_compute$c_compute } */
709 /* { dg-note {OpenACC 'kernels' decomposition: variable 'workers_max' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
710 { dg-note {variable 'workers_max' made addressable} {} { target *-*-* } l_compute$c_compute } */
711 /* { dg-note {OpenACC 'kernels' decomposition: variable 'workers_min' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
712 { dg-note {variable 'workers_min' made addressable} {} { target *-*-* } l_compute$c_compute } */
713 /* { dg-note {OpenACC 'kernels' decomposition: variable 'gangs_max' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
714 { dg-note {variable 'gangs_max' made addressable} {} { target *-*-* } l_compute$c_compute } */
715 /* { dg-note {OpenACC 'kernels' decomposition: variable 'gangs_min' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
716 { dg-note {variable 'gangs_min' made addressable} {} { target *-*-* } l_compute$c_compute } */
718 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
719 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
720 /* { dg-note {forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis} {} { target *-*-* } l_loop_i$c_loop_i } */
721 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
722 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
723 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
724 for (int i = N; i > -N; --i)
726 /* This is to make the loop unparallelizable. */
727 asm volatile ("" : : : "memory");
729 gangs_min = gangs_max = acc_gang ();
730 workers_min = workers_max = acc_worker ();
731 vectors_min = vectors_max = acc_vector ();
734 if (gangs_min != 0 || gangs_max != 1 - 1
735 || workers_min != 0 || workers_max != 1 - 1
736 || vectors_min != 0 || vectors_max != 1 - 1)
737 __builtin_abort ();
738 #undef VECTORS
739 #undef WORKERS
743 /* OpenACC serial construct. */
745 /* GR, WS, VS. */
747 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
748 gangs_min = workers_min = vectors_min = INT_MAX;
749 gangs_max = workers_max = vectors_max = INT_MIN;
750 #pragma acc serial /* { dg-line l_compute[incr c_compute] } */ \
751 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
752 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
753 /* { dg-warning {using 'vector_length \(32\)', ignoring 1} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
755 for (int i = N; i > -N; i--)
757 gangs_min = gangs_max = acc_gang ();
758 workers_min = workers_max = acc_worker ();
759 vectors_min = vectors_max = acc_vector ();
762 if (gangs_min != 0 || gangs_max != 1 - 1
763 || workers_min != 0 || workers_max != 1 - 1
764 || vectors_min != 0 || vectors_max != 1 - 1)
765 __builtin_abort ();
768 /* Composition of GP, WP, VP. */
770 int vectors_actual = 1; /* Implicit 'vector_length (1)' clause. */
771 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max;
772 gangs_min = workers_min = vectors_min = INT_MAX;
773 gangs_max = workers_max = vectors_max = INT_MIN;
774 #pragma acc serial /* { dg-line l_compute[incr c_compute] } */ \
775 copy (vectors_actual) \
776 copy (gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max)
777 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
778 /* { dg-bogus {warning: region contains gang partitioned code but is not gang partitioned} {TODO 'serial'} { xfail *-*-* } l_compute$c_compute }
779 { dg-bogus {warning: region contains worker partitioned code but is not worker partitioned} {TODO 'serial'} { xfail *-*-* } l_compute$c_compute }
780 { dg-bogus {warning: region contains vector partitioned code but is not vector partitioned} {TODO 'serial'} { xfail *-*-* } l_compute$c_compute } */
781 /* { dg-warning {using 'vector_length \(32\)', ignoring 1} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
783 if (acc_on_device (acc_device_nvidia))
784 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
785 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
787 /* The GCC nvptx back end enforces vector_length (32). */
788 /* It's unclear if that's actually permissible here;
789 <https://github.com/OpenACC/openacc-spec/issues/238> "OpenACC
790 'serial' construct might not actually be serial". */
791 vectors_actual = 32;
793 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
794 gang \
795 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
796 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
797 /* { dg-note {variable 'j' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
798 /* { dg-optimized {assigned OpenACC gang loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
799 for (int i = N; i > -N; i--)
800 #pragma acc loop /* { dg-line l_loop_j[incr c_loop_j] } */ \
801 worker \
802 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
803 /* { dg-note {variable 'j' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_j$c_loop_j } */
804 /* { dg-note {variable 'k' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_j$c_loop_j } */
805 /* { dg-optimized {assigned OpenACC worker loop parallelism} {} { target *-*-* } l_loop_j$c_loop_j } */
806 for (int j = N; j > -N; j--)
807 #pragma acc loop /* { dg-line l_loop_k[incr c_loop_k] } */ \
808 vector \
809 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
810 /* { dg-note {variable 'k' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_k$c_loop_k } */
811 /* { dg-optimized {assigned OpenACC vector loop parallelism} {} { target *-*-* } l_loop_k$c_loop_k } */
812 for (int k = N * vectors_actual; k > -N * vectors_actual; k--)
814 gangs_min = gangs_max = acc_gang ();
815 workers_min = workers_max = acc_worker ();
816 vectors_min = vectors_max = acc_vector ();
819 if (acc_get_device_type () == acc_device_nvidia)
821 if (vectors_actual != 32)
822 __builtin_abort ();
824 else
825 if (vectors_actual != 1)
826 __builtin_abort ();
827 if (gangs_min != 0 || gangs_max != 1 - 1
828 || workers_min != 0 || workers_max != 1 - 1
829 || vectors_min != 0 || vectors_max != vectors_actual - 1)
830 __builtin_abort ();
834 return 0;