1 /* This code uses nvptx inline assembly guarded with acc_on_device, which is
2 not optimized away at -O0, and then confuses the target assembler.
3 { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
/* Thread-id helper macros.  Each expansion is conditional on
   acc_on_device (acc_device_nvidia) and contains an inline-asm statement
   that moves a thread-id register into __r: %tid.y for WORK_ID, %tid.x in
   the second fragment below.
   NOTE(review): several continuation lines are not visible in this
   excerpt (the declaration of __r, the alternative used when not on an
   nvidia device, and the #define line of the second macro -- presumably
   VEC_ID, which WorkVec invokes); confirm against the complete file.  */
13 #define WORK_ID(I,N) \
14 (acc_on_device (acc_device_nvidia) \
16 __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (__r)); \
19 (acc_on_device (acc_device_nvidia) \
21 __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (__r)); \
/* Tag every element of the H-row by W-column grid addressed through PTR
   (row-major: element (row, col) lives at ptr[row * w + col]).

   The row loop carries the "acc loop worker" directive and the column
   loop the "acc loop vector" directive.  Each element is assigned
   (WORK_ID (row, nw) << 8) | VEC_ID (col, nv), i.e. the WORK_ID result in
   the bits above bit 7 OR-ed with the VEC_ID result.  */
#pragma acc routine worker
void __attribute__ ((noinline))
WorkVec (int *ptr, int w, int h, int nw, int nv)
{
#pragma acc loop worker
  for (int row = 0; row < h; row++)
    {
#pragma acc loop vector
      for (int col = 0; col < w; col++)
        {
          /* Flattened position of (row, col) in the row-major grid.  */
          const int slot = row * w + col;

          ptr[slot] = (WORK_ID (row, nw) << 8) | VEC_ID (col, nv);
        }
    }
}
/* Run one pass with NW workers and report the outcome on stdout.

   The local array ary (HEIGHT x WIDTH) is first filled with 0xdeadbeef,
   then passed to WorkVec inside an OpenACC parallel region using
   num_workers (nw), vector_length (NUM_VECTORS) and copy (ary).
   Afterwards every element is compared with
   ((ix % nw) << 8) | (jx % NUM_VECTORS); a mismatch is printed together
   with its indices, and the closing message is " failed" or " ok"
   depending on err.

   NOTE(review): the declaration and updates of err, the remaining
   arguments of the mismatch printf and the return statement are not
   visible in this excerpt; presumably err is what gets returned --
   confirm against the complete file.  */
35 int DoWorkVec (int nw
)
37 int ary
[HEIGHT
][WIDTH
];
/* Pre-fill so that elements left unwritten cannot match the expected value
   by accident (assumes 0xdeadbeef never equals an expected tag -- TODO
   confirm against the NUM_* constants).  */
40 for (int ix
= 0; ix
!= HEIGHT
; ix
++)
41 for (int jx
= 0; jx
!= WIDTH
; jx
++)
42 ary
[ix
][jx
] = 0xdeadbeef;
/* Flush right away so the progress message is emitted before the parallel
   region starts.  */
44 printf ("spawning %d ...", nw
); fflush (stdout
);
46 #pragma acc parallel num_workers(nw) vector_length (NUM_VECTORS) copy (ary)
48 WorkVec ((int *)ary
, WIDTH
, HEIGHT
, nw
, NUM_VECTORS
);
/* Verify each element against the value computed from its indices.  */
51 for (int ix
= 0; ix
!= HEIGHT
; ix
++)
52 for (int jx
= 0; jx
!= WIDTH
; jx
++)
54 int exp
= ((ix
% nw
) << 8) | (jx
% NUM_VECTORS
);
56 if (ary
[ix
][jx
] != exp
)
58 printf ("\nary[%d][%d] = %#x expected %#x", ix
, jx
,
63 printf (err
? " failed\n" : " ok\n");
72 for (int W
= 1; W
<= NUM_WORKERS
; W
<<= 1)