1 /* { dg-do run { target openacc_nvidia_accel_selected } } */
2 /* { dg-additional-options "-lcuda -lcublas -lcudart" } */
7 #include <cuda_runtime_api.h>
/* saxpy: for every index i from 0 up to (but not including) n, stores
   a * x[i] + y[i] back into y[i].  x is only read; y is updated in
   place.  */
12 saxpy (int n
, float a
, float *x
, float *y
)
16 for (i
= 0; i
< n
; i
++)
18 y
[i
] = a
* x
[i
] + y
[i
];
/* context_check: fetches the current CUDA driver context with
   cuCtxGetCurrent (into ctx2) and the context reported by the OpenACC
   runtime via acc_get_current_cuda_context (into ctx3), and writes a
   diagnostic to stderr when something is wrong.
   NOTE(review): the diagnostics suggest both values are expected to
   equal CTX1, but the comparisons are not visible here -- confirm.  */
23 context_check (CUcontext ctx1
)
/* Ask the driver API which context is current.  */
28 r
= cuCtxGetCurrent (&ctx2
);
29 if (r
!= CUDA_SUCCESS
)
/* The query itself failed; report the driver status code.  */
31 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
/* NOTE(review): presumably reached when ctx2 differs from ctx1 -- confirm.  */
37 fprintf (stderr
, "new context established\n");
/* Ask the OpenACC runtime for its notion of the current CUDA context.  */
41 ctx3
= (CUcontext
) acc_get_current_cuda_context ();
/* NOTE(review): presumably reached when ctx3 differs from ctx1 -- confirm.  */
45 fprintf (stderr
, "acc_get_current_cuda_context returned wrong value\n");
/* main: "Test 4 - OpenACC creates, cuBLAS shares."
   Selects NVIDIA device 0 through OpenACC, computes
   h_Y2 = alpha * h_X + h_Y2 in the loop that follows the OpenACC
   parallel directive, computes the same update with cublasSaxpy on
   device copies of h_X and h_Y1, copies that result back into h_Y1 and
   compares the two results through a relative error norm.
   context_check (pctx) is called around the cuBLAS calls and the
   copy-back.
   NOTE(review): the guards around the diagnostics below are not
   visible here; the per-statement comments describe only what the
   visible statements do.  */
53 main (int argc
, char **argv
)
61 float *h_X
, *h_Y1
, *h_Y2
;
67 /* Test 4 - OpenACC creates, cuBLAS shares. */
/* Select NVIDIA device number 0 through the OpenACC runtime.  */
69 acc_set_device_num (0, acc_device_nvidia
);
/* Read the CUDA context that is current at this point into pctx.  */
71 r
= cuCtxGetCurrent (&pctx
);
72 if (r
!= CUDA_SUCCESS
)
74 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
/* Host buffers, N floats each; every allocation has its own diagnostic.  */
78 h_X
= (float *) malloc (N
* sizeof (float));
81 fprintf (stderr
, "malloc failed: for h_X\n");
85 h_Y1
= (float *) malloc (N
* sizeof (float));
88 fprintf (stderr
, "malloc failed: for h_Y1\n");
92 h_Y2
= (float *) malloc (N
* sizeof (float));
95 fprintf (stderr
, "malloc failed: for h_Y2\n");
/* Fill the inputs with rand () / RAND_MAX values.  h_Y1 and h_Y2 get the
   same value per element so both computations start from equal data.  */
99 for (i
= 0; i
< N
; i
++)
101 h_X
[i
] = rand () / (float) RAND_MAX
;
102 h_Y2
[i
] = h_Y1
[i
] = rand () / (float) RAND_MAX
;
/* OpenACC side: h_X is copied in, h_Y2 and alpha are copied in and out.  */
105 #pragma acc parallel copyin (h_X[0:N]), copy (h_Y2[0:N]) copy (alpha)
109 for (i
= 0; i
< N
; i
++)
111 h_Y2
[i
] = alpha
* h_X
[i
] + h_Y2
[i
];
/* Re-read the current context into pctx; this is the value handed to
   every context_check call below.  */
115 r
= cuCtxGetCurrent (&pctx
);
116 if (r
!= CUDA_SUCCESS
)
118 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
/* Copy h_X to the device; the returned pointer is later passed to
   cublasSaxpy as the x vector.  */
122 d_X
= (float *) acc_copyin (&h_X
[0], N
* sizeof (float));
/* Diagnostic for the h_X copyin above, so it names h_X.  */
125 fprintf (stderr
, "copyin error h_X\n");
/* Copy h_Y1 to the device; the returned pointer is later passed to
   cublasSaxpy as the y vector.  */
129 d_Y
= (float *) acc_copyin (&h_Y1
[0], N
* sizeof (float));
132 fprintf (stderr
, "copyin error h_Y1\n");
/* Create the cuBLAS handle used for the device-side SAXPY.  */
136 s
= cublasCreate (&h
);
137 if (s
!= CUBLAS_STATUS_SUCCESS
)
139 fprintf (stderr
, "cublasCreate failed: %d\n", s
);
143 context_check (pctx
);
/* cuBLAS side: SAXPY over N elements of d_X and d_Y with stride 1.  */
145 s
= cublasSaxpy (h
, N
, &alpha
, d_X
, 1, d_Y
, 1);
146 if (s
!= CUBLAS_STATUS_SUCCESS
)
148 fprintf (stderr
, "cublasSaxpy failed: %d\n", s
);
152 context_check (pctx
);
/* Bring the cuBLAS result back from d_Y into h_Y1.  */
154 acc_memcpy_from_device (&h_Y1
[0], d_Y
, N
* sizeof (float));
156 context_check (pctx
);
/* Accumulate the squared difference between the two results and the
   squared magnitude of the OpenACC result.  */
161 for (i
= 0; i
< N
; ++i
)
165 diff
= h_Y1
[i
] - h_Y2
[i
];
166 error_norm
+= diff
* diff
;
167 ref_norm
+= h_Y2
[i
] * h_Y2
[i
];
/* Turn both sums of squares into Euclidean norms.  */
170 error_norm
= (float) sqrt ((double) error_norm
);
171 ref_norm
= (float) sqrt ((double) ref_norm
);
/* Report "math error" when the reference norm is nearly zero or the
   relative error reaches 1e-6.  */
173 if ((fabs (ref_norm
) < 1e-7) || ((error_norm
/ ref_norm
) >= 1e-6f
))
175 fprintf (stderr
, "math error\n");
186 context_check (pctx
);
/* Release the cuBLAS handle.  */
188 s
= cublasDestroy (h
);
189 if (s
!= CUBLAS_STATUS_SUCCESS
)
191 fprintf (stderr
, "cublasDestroy failed: %d\n", s
);
195 context_check (pctx
);
/* Shut down the OpenACC runtime for the NVIDIA device type.  */
197 acc_shutdown (acc_device_nvidia
);
/* Query the current context once more after the shutdown.  */
199 r
= cuCtxGetCurrent (&pctx
);
200 if (r
!= CUDA_SUCCESS
)
202 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
/* NOTE(review): the test on pctx guarding this diagnostic is not visible
   here -- confirm what value is expected after acc_shutdown.  */
208 fprintf (stderr
, "Unexpected context\n");