2 /* { dg-additional-options "-DUSE_CUDA_H" { target openacc_cuda } } */
3 /* { dg-additional-options "-lcuda" { target { openacc_nvidia_accel_selected && openacc_cuda } } } */
7 #if defined ACC_DEVICE_TYPE_nvidia && defined USE_CUDA_H
15 main (int argc
, char **argv
)
17 #if defined ACC_DEVICE_TYPE_nvidia && defined USE_CUDA_H
21 int N
= 128; //1024 * 1024;
22 float *a
, *b
, *c
, *d
, *e
;
26 #if defined ACC_DEVICE_TYPE_nvidia && defined USE_CUDA_H
27 acc_init (acc_device_nvidia
);
30 nbytes
= N
* sizeof (float);
32 a
= (float *) malloc (nbytes
);
33 b
= (float *) malloc (nbytes
);
34 c
= (float *) malloc (nbytes
);
35 d
= (float *) malloc (nbytes
);
36 e
= (float *) malloc (nbytes
);
38 for (i
= 0; i
< N
; i
++)
44 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
47 #pragma acc parallel async
51 for (ii
= 0; ii
< N
; ii
++)
59 for (i
= 0; i
< N
; i
++)
68 for (i
= 0; i
< N
; i
++)
74 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
77 #pragma acc parallel async (1)
81 for (ii
= 0; ii
< N
; ii
++)
89 for (i
= 0; i
< N
; i
++)
98 for (i
= 0; i
< N
; i
++)
106 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
109 #pragma acc parallel async (1)
113 for (ii
= 0; ii
< N
; ii
++)
114 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
117 #pragma acc parallel async (1)
121 for (ii
= 0; ii
< N
; ii
++)
122 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
126 #pragma acc parallel async (1)
130 for (ii
= 0; ii
< N
; ii
++)
131 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
138 for (i
= 0; i
< N
; i
++)
153 for (i
= 0; i
< N
; i
++)
162 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
165 #pragma acc parallel async (1)
169 for (ii
= 0; ii
< N
; ii
++)
170 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
173 #pragma acc parallel async (1)
177 for (ii
= 0; ii
< N
; ii
++)
178 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
181 #pragma acc parallel async (1)
185 for (ii
= 0; ii
< N
; ii
++)
186 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
189 #pragma acc parallel wait (1) async (1)
193 for (ii
= 0; ii
< N
; ii
++)
194 e
[ii
] = a
[ii
] + b
[ii
] + c
[ii
] + d
[ii
];
201 for (i
= 0; i
< N
; i
++)
220 #if defined ACC_DEVICE_TYPE_nvidia && defined USE_CUDA_H
221 r
= cuStreamCreate (&stream1
, CU_STREAM_NON_BLOCKING
);
222 if (r
!= CUDA_SUCCESS
)
224 fprintf (stderr
, "cuStreamCreate failed: %d\n", r
);
228 acc_set_cuda_stream (1, stream1
);
231 for (i
= 0; i
< N
; i
++)
237 #pragma acc data copy (a[0:N], b[0:N]) copyin (N)
240 #pragma acc parallel async (1)
244 for (ii
= 0; ii
< N
; ii
++)
252 for (i
= 0; i
< N
; i
++)
261 for (i
= 0; i
< N
; i
++)
269 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
272 #pragma acc parallel async (1)
276 for (ii
= 0; ii
< N
; ii
++)
277 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
280 #pragma acc parallel async (1)
284 for (ii
= 0; ii
< N
; ii
++)
285 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
288 #pragma acc parallel async (1)
292 for (ii
= 0; ii
< N
; ii
++)
293 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
300 for (i
= 0; i
< N
; i
++)
315 for (i
= 0; i
< N
; i
++)
324 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
327 #pragma acc parallel async (1)
331 for (ii
= 0; ii
< N
; ii
++)
332 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
335 #pragma acc parallel async (1)
339 for (ii
= 0; ii
< N
; ii
++)
340 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
343 #pragma acc parallel async (1)
347 for (ii
= 0; ii
< N
; ii
++)
348 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
351 #pragma acc parallel wait (1) async (1)
355 for (ii
= 0; ii
< N
; ii
++)
356 e
[ii
] = a
[ii
] + b
[ii
] + c
[ii
] + d
[ii
];
363 for (i
= 0; i
< N
; i
++)
381 for (i
= 0; i
< N
; i
++)
390 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
393 #pragma acc parallel async (1)
397 for (ii
= 0; ii
< N
; ii
++)
398 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
401 #pragma acc parallel async (1)
405 for (ii
= 0; ii
< N
; ii
++)
406 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
409 #pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (1)
413 for (i
= 0; i
< N
; i
++)
426 for (i
= 0; i
< N
; i
++)
435 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
438 #pragma acc parallel async (1)
442 for (ii
= 0; ii
< N
; ii
++)
443 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
446 #pragma acc parallel async (1)
450 for (ii
= 0; ii
< N
; ii
++)
451 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
454 #pragma acc update host (a[0:N], b[0:N], c[0:N]) async (1)
460 for (i
= 0; i
< N
; i
++)
472 for (i
= 0; i
< N
; i
++)
478 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
481 #pragma acc kernels async
485 for (ii
= 0; ii
< N
; ii
++)
493 for (i
= 0; i
< N
; i
++)
502 for (i
= 0; i
< N
; i
++)
508 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
511 #pragma acc kernels async (1)
515 for (ii
= 0; ii
< N
; ii
++)
523 for (i
= 0; i
< N
; i
++)
532 for (i
= 0; i
< N
; i
++)
540 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
543 #pragma acc kernels async (1)
547 for (ii
= 0; ii
< N
; ii
++)
548 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
551 #pragma acc kernels async (1)
555 for (ii
= 0; ii
< N
; ii
++)
556 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
560 #pragma acc kernels async (1)
564 for (ii
= 0; ii
< N
; ii
++)
565 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
572 for (i
= 0; i
< N
; i
++)
587 for (i
= 0; i
< N
; i
++)
596 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
599 #pragma acc kernels async (1)
603 for (ii
= 0; ii
< N
; ii
++)
604 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
607 #pragma acc kernels async (1)
611 for (ii
= 0; ii
< N
; ii
++)
612 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
615 #pragma acc kernels async (1)
619 for (ii
= 0; ii
< N
; ii
++)
620 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
623 #pragma acc kernels wait (1) async (1)
627 for (ii
= 0; ii
< N
; ii
++)
628 e
[ii
] = a
[ii
] + b
[ii
] + c
[ii
] + d
[ii
];
635 for (i
= 0; i
< N
; i
++)
654 #if defined ACC_DEVICE_TYPE_nvidia && defined USE_CUDA_H
655 r
= cuStreamCreate (&stream1
, CU_STREAM_NON_BLOCKING
);
656 if (r
!= CUDA_SUCCESS
)
658 fprintf (stderr
, "cuStreamCreate failed: %d\n", r
);
662 acc_set_cuda_stream (1, stream1
);
665 for (i
= 0; i
< N
; i
++)
671 #pragma acc data copy (a[0:N], b[0:N]) copyin (N)
674 #pragma acc kernels async (1)
678 for (ii
= 0; ii
< N
; ii
++)
686 for (i
= 0; i
< N
; i
++)
695 for (i
= 0; i
< N
; i
++)
703 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
706 #pragma acc kernels async (1)
710 for (ii
= 0; ii
< N
; ii
++)
711 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
714 #pragma acc kernels async (1)
718 for (ii
= 0; ii
< N
; ii
++)
719 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
722 #pragma acc kernels async (1)
726 for (ii
= 0; ii
< N
; ii
++)
727 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
734 for (i
= 0; i
< N
; i
++)
749 for (i
= 0; i
< N
; i
++)
758 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
761 #pragma acc kernels async (1)
765 for (ii
= 0; ii
< N
; ii
++)
766 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
769 #pragma acc kernels async (1)
773 for (ii
= 0; ii
< N
; ii
++)
774 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
777 #pragma acc kernels async (1)
781 for (ii
= 0; ii
< N
; ii
++)
782 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
785 #pragma acc kernels wait (1) async (1)
789 for (ii
= 0; ii
< N
; ii
++)
790 e
[ii
] = a
[ii
] + b
[ii
] + c
[ii
] + d
[ii
];
797 for (i
= 0; i
< N
; i
++)
815 for (i
= 0; i
< N
; i
++)
824 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
827 #pragma acc kernels async (1)
831 for (ii
= 0; ii
< N
; ii
++)
832 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
835 #pragma acc kernels async (1)
839 for (ii
= 0; ii
< N
; ii
++)
840 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
843 #pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (1)
847 for (i
= 0; i
< N
; i
++)
860 for (i
= 0; i
< N
; i
++)
869 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
872 #pragma acc kernels async (1)
876 for (ii
= 0; ii
< N
; ii
++)
877 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
880 #pragma acc kernels async (1)
884 for (ii
= 0; ii
< N
; ii
++)
885 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
888 #pragma acc update host (a[0:N], b[0:N], c[0:N]) async (1)
894 for (i
= 0; i
< N
; i
++)
906 #if defined ACC_DEVICE_TYPE_nvidia && defined USE_CUDA_H
907 acc_shutdown (acc_device_nvidia
);