1 /* This code uses nvptx inline assembly guarded with acc_on_device, which is
2 not optimized away at -O0, and then confuses the target assembler.
3 { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
5 /* { dg-additional-options "-fopenacc-dim=32" } */
/* Decode the ids packed into ARY[0..SIZE) and report, per gang, worker and
   vector index, whether its usage count looks wrong.  Each element is
   read as three 8-bit fields: bits 0-7 vector, bits 8-15 worker,
   bits 16-23 gang.
   NOTE(review): the statements that consult GP, WP and VP, the declaration
   of IX and the return statement are not visible in this excerpt;
   presumably each flag selects between the "not used N times" and the
   "unexpectedly used" check for its level -- confirm against the full
   file.  */
10 int check (const int *ary
, int size
, int gp
, int wp
, int vp
)
/* One usage counter per possible index at each level.  */
14 int gangs
[32], workers
[32], vectors
[32];
/* Zero all three counter arrays.  */
16 for (ix
= 0; ix
< 32; ix
++)
17 gangs
[ix
] = workers
[ix
] = vectors
[ix
] = 0;
/* Count how often each vector, worker and gang index occurs in ARY.
   NOTE(review): the fields are masked with 0xff but the counters have only
   32 entries, so this relies on every field being below 32.  */
19 for (ix
= 0; ix
< size
; ix
++)
21 vectors
[ary
[ix
] & 0xff]++;
22 workers
[(ary
[ix
] >> 8) & 0xff]++;
23 gangs
[(ary
[ix
] >> 16) & 0xff]++;
/* Compare every index's count against the count recorded for index 0.  */
26 for (ix
= 0; ix
< 32; ix
++)
/* Gangs: either every index must match gangs[0] ...  */
30 int expect
= gangs
[0];
31 if (gangs
[ix
] != expect
)
34 printf ("gang %d not used %d times\n", ix
, expect
);
/* ... or no index other than 0 may have been used at all.  */
37 else if (ix
&& gangs
[ix
])
40 printf ("gang %d unexpectedly used\n", ix
);
/* Workers: same pair of checks.  */
45 int expect
= workers
[0];
46 if (workers
[ix
] != expect
)
49 printf ("worker %d not used %d times\n", ix
, expect
);
52 else if (ix
&& workers
[ix
])
55 printf ("worker %d unexpectedly used\n", ix
);
/* Vectors: same pair of checks.  */
60 int expect
= vectors
[0];
61 if (vectors
[ix
] != expect
)
64 printf ("vector %d not used %d times\n", ix
, expect
);
67 else if (ix
&& vectors
[ix
])
70 printf ("vector %d unexpectedly used\n", ix
);
/* Sequential OpenACC routine that, when acc_on_device reports an NVIDIA
   device, reads three PTX special registers and packs them into one
   integer as (g << 16) | (w << 8) | v.  This is the encoding that check
   decodes.  Marked noinline.
   NOTE(review): the declaration of R, its value when not on an NVIDIA
   device, and the return statement are not visible in this excerpt.  */
77 #pragma acc routine seq
78 static int __attribute__((noinline
)) place ()
/* The inline assembly below is nvptx-specific, hence the device guard.  */
82 if (acc_on_device (acc_device_nvidia
))
84 int g
= 0, w
= 0, v
= 0;
/* g <- %ctaid.x, w <- %tid.y, v <- %tid.x.  */
86 __asm__
volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g
));
87 __asm__
volatile ("mov.u32 %0,%%tid.y;" : "=r" (w
));
88 __asm__
volatile ("mov.u32 %0,%%tid.x;" : "=r" (v
));
/* Pack the three values into separate 8-bit-aligned fields.  */
89 r
= (g
<< 16) | (w
<< 8) | v
;
/* Walk ARY[0..SIZE) one element at a time.
   NOTE(review): the loop body and the declaration of IX are not visible in
   this excerpt; presumably each element is reset to a known value before
   a test runs -- confirm against the full file.  */
94 static void clear (int *ary
, int size
)
98 for (ix
= 0; ix
< size
; ix
++)
/* Run a parallel region (32 workers, vector length 32) with an explicit
   single-iteration gang loop around an "auto" loop over SIZE elements,
   then return check's result with gp=0, wp=1, vp=1.
   NOTE(review): the inner loop body is not visible in this excerpt;
   presumably it stores place () into ARY as the sibling tests do.  */
102 int vector_1 (int *ary
, int size
)
106 #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
/* The outer loop has one iteration and only pins the gang level.  */
108 #pragma acc loop gang
109 for (int jx
= 0; jx
< 1; jx
++)
110 #pragma acc loop auto
111 for (int ix
= 0; ix
< size
; ix
++)
115 return check (ary
, size
, 0, 1, 1);
/* Run a parallel region (32 workers, vector length 32) with an explicit
   worker loop over SIZE / 64 chunks and an inner "auto" loop over the 64
   elements of each chunk, storing place () into every element.  Returns
   check's result with gp=0, wp=1, vp=1.  */
118 int vector_2 (int *ary
, int size
)
122 #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
124 #pragma acc loop worker
125 for (int jx
= 0; jx
< size
/ 64; jx
++)
126 #pragma acc loop auto
127 for (int ix
= 0; ix
< 64; ix
++)
128 ary
[ix
+ jx
* 64] = place ();
131 return check (ary
, size
, 0, 1, 1);
/* Run a parallel region (32 workers, vector length 32) with three nested
   loops: an explicit single-iteration gang loop, an "auto" loop over
   SIZE / 64 chunks, and an explicit vector loop over the 64 elements of
   each chunk.  Every element receives place ().  Returns check's result
   with gp=0, wp=1, vp=1.  */
134 int worker_1 (int *ary
, int size
)
138 #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
/* The outer loop has one iteration and only pins the gang level.  */
140 #pragma acc loop gang
141 for (int kx
= 0; kx
< 1; kx
++)
142 #pragma acc loop auto
143 for (int jx
= 0; jx
< size
/ 64; jx
++)
144 #pragma acc loop vector
145 for (int ix
= 0; ix
< 64; ix
++)
146 ary
[ix
+ jx
* 64] = place ();
149 return check (ary
, size
, 0, 1, 1);
/* Run a parallel region (32 gangs, 32 workers, vector length 32) with an
   outer "auto" loop over SIZE / 64 chunks and an explicit worker loop
   over the 64 elements of each chunk, storing place () into every
   element.  Returns check's result with gp=1, wp=1, vp=0.  */
152 int gang_1 (int *ary
, int size
)
156 #pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
158 #pragma acc loop auto
159 for (int jx
= 0; jx
< size
/ 64; jx
++)
160 #pragma acc loop worker
161 for (int ix
= 0; ix
< 64; ix
++)
162 ary
[ix
+ jx
* 64] = place ();
165 return check (ary
, size
, 1, 1, 0);
/* Run a parallel region (32 gangs, 32 workers, vector length 32) with
   three nested "auto" loops of extent SIZE / (32 * 32), 32 and 32, which
   together index every element of ARY; each receives place ().  Returns
   check's result with gp=1, wp=1, vp=1.  */
168 int gang_2 (int *ary
, int size
)
172 #pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
174 #pragma acc loop auto
175 for (int kx
= 0; kx
< size
/ (32 * 32); kx
++)
176 #pragma acc loop auto
177 for (int jx
= 0; jx
< 32; jx
++)
178 #pragma acc loop auto
179 for (int ix
= 0; ix
< 32; ix
++)
180 ary
[ix
+ jx
* 32 + kx
* 32 * 32] = place ();
183 return check (ary
, size
, 1, 1, 1);
/* Run a parallel region (32 workers, vector length 32, no num_gangs
   clause) with two nested "auto" loops: SIZE / 64 chunks of 64 elements,
   each element receiving place ().  Returns check's result with gp=1,
   wp=1, vp=1.  */
186 int gang_3 (int *ary
, int size
)
190 #pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
192 #pragma acc loop auto
193 for (int jx
= 0; jx
< size
/ 64; jx
++)
194 #pragma acc loop auto
195 for (int ix
= 0; ix
< 64; ix
++)
196 ary
[ix
+ jx
* 64] = place ();
199 return check (ary
, size
, 1, 1, 1);
/* Run a parallel region that specifies only vector_length(32), with a
   single "auto" loop over all SIZE elements, then return check's result
   with gp=1, wp=0, vp=1.
   NOTE(review): the loop body is not visible in this excerpt; presumably
   it stores place () into ARY as the sibling tests do.  */
202 int gang_4 (int *ary
, int size
)
206 #pragma acc parallel vector_length(32) copy(ary[0:size]) firstprivate (size)
208 #pragma acc loop auto
209 for (int jx
= 0; jx
< size
; jx
++)
213 return check (ary
, size
, 1, 0, 1);
216 #define N (32*32*32*2)
221 #pragma acc parallel copy(ondev)
223 ondev
= acc_on_device (acc_device_not_host
);
230 if (vector_1 (ary
, N
))
232 if (vector_2 (ary
, N
))
235 if (worker_1 (ary
, N
))