1 /* AMD GCN does not use 32-lane vectors, so the expected use counts mismatch.
2 { dg-skip-if "unsuitable dimensions" { openacc_radeon_accel_selected } { "*" } { "" } } */
4 /* { dg-additional-options "-fopenacc-dim=32" } */
8 #include <gomp-constants.h>
/* Tally how often each gang, worker and vector id occurs in ARY (SIZE
   elements) and print a diagnostic for every id whose use count looks
   wrong.

   Each element of ARY is treated as a packed value: bits 0-7 hold the
   vector id, bits 8-15 the worker id and bits 16-23 the gang id.

   NOTE(review): this excerpt is missing several source lines (braces,
   the declaration of ix, the conditions that pair with the `else if'
   branches, and the return statement).  GP, WP and VP presumably select
   whether the gang, worker and vector levels are expected to be used --
   confirm against the complete file.  */
10 static int check (const int *ary
, int size
, int gp
, int wp
, int vp
)
/* One use counter per possible id at each level; 32 entries each.  */
14 int gangs
[32], workers
[32], vectors
[32];
/* Reset all three counter arrays.  */
16 for (ix
= 0; ix
< 32; ix
++)
17 gangs
[ix
] = workers
[ix
] = vectors
[ix
] = 0;
/* Unpack every element and bump the matching counters.
   NOTE(review): the index is masked with 0xff but the arrays have only
   32 entries, and no bounds check is visible here; this relies on every
   decoded id being below 32.  */
19 for (ix
= 0; ix
< size
; ix
++)
21 vectors
[ary
[ix
] & 0xff]++;
22 workers
[(ary
[ix
] >> 8) & 0xff]++;
23 gangs
[(ary
[ix
] >> 16) & 0xff]++;
/* Validate the counters for each of the 32 ids.  */
26 for (ix
= 0; ix
< 32; ix
++)
/* Gangs: the count of id 0 is the reference; every id must match it.  */
30 int expect
= gangs
[0];
31 if (gangs
[ix
] != expect
)
34 printf ("gang %d not used %d times\n", ix
, expect
);
/* Alternative branch: any non-zero gang id with a non-zero count is
   reported as unexpected.  */
37 else if (ix
&& gangs
[ix
])
40 printf ("gang %d unexpectedly used\n", ix
);
/* Workers: same scheme as for gangs.  */
45 int expect
= workers
[0];
46 if (workers
[ix
] != expect
)
49 printf ("worker %d not used %d times\n", ix
, expect
);
52 else if (ix
&& workers
[ix
])
55 printf ("worker %d unexpectedly used\n", ix
);
/* Vectors: same scheme as for gangs.  */
60 int expect
= vectors
[0];
61 if (vectors
[ix
] != expect
)
64 printf ("vector %d not used %d times\n", ix
, expect
);
67 else if (ix
&& vectors
[ix
])
70 printf ("vector %d unexpectedly used\n", ix
);
/* Compute a packed identifier for the current position in the OpenACC
   gang/worker/vector hierarchy: (gang << 16) | (worker << 8) | vector.
   Declared as a sequential OpenACC routine and marked noinline.

   NOTE(review): the declarations of g, w, v and r, the path taken when
   not running on a non-host device, and the return statement are not
   visible in this excerpt.  */
77 #pragma acc routine seq
78 static int __attribute__((noinline
)) place ()
/* Only query the parallel-level ids when executing on a non-host
   device.  */
82 if (acc_on_device (acc_device_not_host
))
/* Presumably each builtin yields the caller's index at the given
   parallelism level -- confirm against the GCC documentation.  */
86 g
= __builtin_goacc_parlevel_id (GOMP_DIM_GANG
);
87 w
= __builtin_goacc_parlevel_id (GOMP_DIM_WORKER
);
88 v
= __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR
);
/* Pack the three ids into one int, one byte-aligned field each.  */
89 r
= (g
<< 16) | (w
<< 8) | v
;
/* Walk all SIZE elements of ARY.
   NOTE(review): the loop body is not visible in this excerpt; judging by
   the name it presumably resets each element -- confirm against the
   complete file.  */
94 static void clear (int *ary
, int size
)
98 for (ix
= 0; ix
< size
; ix
++)
/* Single tiled loop partitioned over gangs and vectors.
   Runs a parallel region with 32 gangs and a vector length of 32, copies
   ARY in and out, and applies a one-dimensional tile(128) to a loop over
   all SIZE elements.  Returns the result of check () called with
   gp = 1, wp = 0, vp = 1.

   NOTE(review): the loop body is not visible in this excerpt.  */
102 int gang_vector_1 (int *ary
, int size
)
105 #pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size)
107 #pragma acc loop tile(128) gang vector
108 for (int jx
= 0; jx
< size
; jx
++)
112 return check (ary
, size
, 1, 0, 1);
/* Two-dimensional tile(64, 64) over a loop nest partitioned over gangs
   and vectors (32 gangs, vector length 32).  The outer loop counts rows
   of 256 elements (jx runs to SIZE / 256) and the element index is
   jx * 256 + ix.  Each element receives the value of place ().  Returns
   the result of check () called with gp = 1, wp = 0, vp = 1.  */
115 int gang_vector_2a (int *ary
, int size
)
121 #pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size)
123 #pragma acc loop tile(64, 64) gang vector
124 for (int jx
= 0; jx
< size
/ 256; jx
++)
125 for (int ix
= 0; ix
< 256; ix
++)
126 ary
[jx
* 256 + ix
] = place ();
129 return check (ary
, size
, 1, 0, 1);
/* Variant of the two-dimensional gang/vector tile(64, 64) test in which
   the outer loop steps jx by 256 up to SIZE and the element index is
   jx + ix, i.e. the outer loop has a non-unit stride.  Each element
   receives the value of place ().  Returns the result of check () called
   with gp = 1, wp = 0, vp = 1.  */
132 int gang_vector_2b (int *ary
, int size
)
138 #pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size)
140 #pragma acc loop tile(64, 64) gang vector
141 for (int jx
= 0; jx
< size
; jx
+= 256)
142 for (int ix
= 0; ix
< 256; ix
++)
143 ary
[jx
+ ix
] = place ();
146 return check (ary
, size
, 1, 0, 1);
/* Two-dimensional tile(64, 64) over a loop nest partitioned over workers
   and vectors (32 workers, vector length 32; no num_gangs clause).  The
   outer loop counts rows of 256 elements and the element index is
   jx * 256 + ix.  Each element receives the value of place ().  Returns
   the result of check () called with gp = 0, wp = 1, vp = 1.  */
149 int worker_vector_2a (int *ary
, int size
)
155 #pragma acc parallel vector_length(32) num_workers (32) copy (ary[0:size]) firstprivate (size)
157 #pragma acc loop tile(64, 64) worker vector
158 for (int jx
= 0; jx
< size
/ 256; jx
++)
159 for (int ix
= 0; ix
< 256; ix
++)
160 ary
[jx
* 256 + ix
] = place ();
163 return check (ary
, size
, 0, 1, 1);
/* Variant of the two-dimensional worker/vector tile(64, 64) test in
   which the outer loop steps jx by 256 up to SIZE and the element index
   is jx + ix.  Each element receives the value of place ().  Returns the
   result of check () called with gp = 0, wp = 1, vp = 1.  */
166 int worker_vector_2b (int *ary
, int size
)
172 #pragma acc parallel vector_length(32) num_workers (32) copy (ary[0:size]) firstprivate (size)
174 #pragma acc loop tile(64, 64) worker vector
175 for (int jx
= 0; jx
< size
; jx
+= 256)
176 for (int ix
= 0; ix
< 256; ix
++)
177 ary
[jx
+ ix
] = place ();
180 return check (ary
, size
, 0, 1, 1);
/* Two-dimensional tile(32, 32) over a loop nest with no explicit
   gang/worker/vector clause on the loop directive; the parallel region
   requests 32 gangs, 32 workers and a vector length of 32.  The outer
   loop counts rows of 256 elements and the element index is
   jx * 256 + ix.  Each element receives the value of place ().  Returns
   the result of check () called with gp = 1, wp = 1, vp = 1, i.e. all
   three levels are expected to be used (presumably assigned by the
   compiler -- confirm).  */
183 int gang_worker_vector_2a (int *ary
, int size
)
188 #pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
190 #pragma acc loop tile(32, 32)
191 for (int jx
= 0; jx
< size
/ 256; jx
++)
192 for (int ix
= 0; ix
< 256; ix
++)
193 ary
[jx
* 256 + ix
] = place ();
196 return check (ary
, size
, 1, 1, 1);
/* Variant of the tile(32, 32) test without explicit partitioning clauses
   in which the outer loop steps jx by 256 up to SIZE and the element
   index is jx + ix.  Each element receives the value of place ().
   Returns the result of check () called with gp = 1, wp = 1, vp = 1.  */
199 int gang_worker_vector_2b (int *ary
, int size
)
204 #pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
206 #pragma acc loop tile(32, 32)
207 for (int jx
= 0; jx
< size
; jx
+= 256)
208 for (int ix
= 0; ix
< 256; ix
++)
209 ary
[jx
+ ix
] = place ();
212 return check (ary
, size
, 1, 1, 1);
/* Two-dimensional tile test using `*' for both tile sizes, so no tile
   size is fixed in the source, and with no explicit gang/worker/vector
   clause on the loop directive.  The parallel region requests 32 gangs,
   32 workers and a vector length of 32.  The outer loop counts rows of
   256 elements and the element index is jx * 256 + ix.  Each element
   receives the value of place ().  Returns the result of check () called
   with gp = 1, wp = 1, vp = 1.  */
215 int gang_worker_vector_star_2a (int *ary
, int size
)
221 #pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
223 #pragma acc loop tile(*, *)
224 for (int jx
= 0; jx
< size
/ 256; jx
++)
225 for (int ix
= 0; ix
< 256; ix
++)
226 ary
[jx
* 256 + ix
] = place ();
229 return check (ary
, size
, 1, 1, 1);
/* Variant of the `*' tile-size test in which the outer loop steps jx by
   256 up to SIZE and the element index is jx + ix.  Each element receives
   the value of place ().  Returns the result of check () called with
   gp = 1, wp = 1, vp = 1.  */
232 int gang_worker_vector_star_2b (int *ary
, int size
)
238 #pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
240 #pragma acc loop tile(*, *)
241 for (int jx
= 0; jx
< size
; jx
+=256)
242 for (int ix
= 0; ix
< 256; ix
++)
243 ary
[jx
+ ix
] = place ();
246 return check (ary
, size
, 1, 1, 1);
/* Number of array elements passed to every test below:
   32 * 32 * 32 * 8 = 262144, a multiple of 256.  */
249 #define N (32*32*32*8)
/* Test driver fragment.
   NOTE(review): the enclosing function header (presumably main), the
   declarations of ondev and ary, and the statements guarded by each `if'
   are not visible in this excerpt.  */
/* Record in ondev whether the parallel region runs on a non-host
   device.  */
254 #pragma acc parallel copy(ondev)
256 ondev
= acc_on_device (acc_device_not_host
);
/* Run each test on an N-element array; the condition is true when the
   test function returns non-zero.  The "a" variants run first, then the
   "b" variants.  */
262 if (gang_vector_1 (ary
, N
))
264 if (gang_vector_2a (ary
, N
))
266 if (worker_vector_2a (ary
, N
))
268 if (gang_worker_vector_2a (ary
, N
))
270 if (gang_worker_vector_star_2a (ary
, N
))
272 if (gang_vector_2b (ary
, N
))
274 if (worker_vector_2b (ary
, N
))
276 if (gang_worker_vector_2b (ary
, N
))
278 if (gang_worker_vector_star_2b (ary
, N
))