1 /* This code uses nvptx inline assembly guarded with acc_on_device, which is
2 not optimized away at -O0, and then confuses the target assembler. */
3 /* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
4 /* { dg-additional-options "-fopenacc-dim=16:16" } */
/* Pack the caller's execution coordinates into one int.
   Only when acc_on_device (acc_device_nvidia) holds are the PTX special
   registers %ctaid.x, %tid.y and %tid.x read into g, w and v; the result
   is then laid out as
     bit 24      marker, always set on this path
     bits 16..   g
     bits 8..15  w
     bits 0..7   v
   which is the same layout check () decodes with its >> 24, >> 16, >> 8
   and >> 0 shifts.  */
12 static int __attribute__ ((noinline
)) coord ()
16 if (acc_on_device (acc_device_nvidia
))
18 int g
= 0, w
= 0, v
= 0;
/* One volatile asm per register; "=r" makes each a plain register
   output.  */
20 __asm__
volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g
));
21 __asm__
volatile ("mov.u32 %0,%%tid.y;" : "=r" (w
));
22 __asm__
volatile ("mov.u32 %0,%%tid.x;" : "=r" (v
));
/* NOTE(review): g, w and v are not masked here, so the fields only stay
   separate while w and v fit in 8 bits -- presumably guaranteed by the
   launch dimensions; confirm.  */
23 res
= (1 << 24) | (g
<< 16) | (w
<< 8) | v
;
/* Inspect SIZE packed coordinate values in ARY.
   Each entry is decoded as g = bits 16..23, w = bits 8..15 and
   v = bits 0..7, and its bits from 24 upwards are added to `offloaded'.
   GP, WP and VP are the exclusive upper bounds for g, w and v, and also
   the element counts of the gangs, workers and vectors arrays.

   Diagnostics printed by the visible code:
     - an entry whose g, w or v is out of range;
     - `offloaded' differing from SIZE;
     - any gangs[]/workers[]/vectors[] element differing from element 0
       of the same array.

   NOTE(review): the three arrays are presumably per-index use counters
   incremented for each decoded entry -- confirm.  The int result is
   OR-ed into `exit' by test_1.  */
29 int check (const int *ary
, int size
, int gp
, int wp
, int vp
)
/* Scratch arrays live on the stack (alloca) and are zeroed before use.  */
33 int *gangs
= (int *)alloca (gp
* sizeof (int));
34 int *workers
= (int *)alloca (wp
* sizeof (int));
35 int *vectors
= (int *)alloca (vp
* sizeof (int));
38 memset (gangs
, 0, gp
* sizeof (int));
39 memset (workers
, 0, wp
* sizeof (int));
40 memset (vectors
, 0, vp
* sizeof (int));
/* Decode every entry and range-check its three fields.  */
42 for (ix
= 0; ix
< size
; ix
++)
44 int g
= (ary
[ix
] >> 16) & 0xff;
45 int w
= (ary
[ix
] >> 8) & 0xff;
46 int v
= (ary
[ix
] >> 0) & 0xff;
48 if (g
>= gp
|| w
>= wp
|| v
>= vp
)
50 printf ("unexpected cpu %#x used\n", ary
[ix
]);
/* Bits 24 and up hold the marker set by coord () on the nvidia path.  */
59 offloaded
+= ary
[ix
] >> 24;
/* Every entry is expected to have contributed exactly 1 above.  */
65 if (offloaded
!= size
)
67 printf ("offloaded %d times, expected %d\n", offloaded
, size
);
/* Each array must be uniform: every element is compared against
   element 0.  */
71 for (ix
= 0; ix
< gp
; ix
++)
72 if (gangs
[ix
] != gangs
[0])
74 printf ("gang %d not used %d times\n", ix
, gangs
[0]);
78 for (ix
= 0; ix
< wp
; ix
++)
79 if (workers
[ix
] != workers
[0])
81 printf ("worker %d not used %d times\n", ix
, workers
[0]);
85 for (ix
= 0; ix
< vp
; ix
++)
86 if (vectors
[ix
] != vectors
[0])
88 printf ("vector %d not used %d times\n", ix
, vectors
[0]);
/* Run three OpenACC parallel regions, each copying out `ary' and each
   containing one loop of N iterations with a different partitioning
   clause: gang (static:1), then worker, then vector.  After each region
   the result of check () is OR-ed into `exit'; the call passes the
   dimension under test (GP, WP or VP) and 1 for the other two.

   NOTE(review): the loop bodies presumably store coord () into ary[ix]
   -- confirm.  */
97 int test_1 (int gp
, int wp
, int vp
)
/* Gang partitioning only: worker and vector bounds are 1.  */
102 #pragma acc parallel copyout (ary)
104 #pragma acc loop gang (static:1)
105 for (int ix
= 0; ix
< N
; ix
++)
109 exit
|= check (ary
, N
, gp
, 1, 1);
/* Worker partitioning only: gang and vector bounds are 1.  */
111 #pragma acc parallel copyout (ary)
113 #pragma acc loop worker
114 for (int ix
= 0; ix
< N
; ix
++)
118 exit
|= check (ary
, N
, 1, wp
, 1);
/* Vector partitioning only: gang and worker bounds are 1.  */
120 #pragma acc parallel copyout (ary)
122 #pragma acc loop vector
123 for (int ix
= 0; ix
< N
; ix
++)
127 exit
|= check (ary
, N
, 1, 1, vp
);
134 return test_1 (16, 16, 32);