1 /* { dg-do run { target openacc_nvidia_accel_selected } } */
2 /* { dg-additional-options "-lcuda -lcublas -lcudart" } */
3 /* { dg-require-effective-target openacc_cublas } */
4 /* { dg-require-effective-target openacc_cudart } */
9 #include <cuda_runtime_api.h>
10 #include <cublas_v2.h>
/* saxpy: for each index i in [0, n), overwrite y[i] with a * x[i] + y[i].

   n - number of elements processed by the loop
   a - scalar factor multiplied into each x[i]
   x - array that the loop only reads
   y - array that the loop both reads and overwrites in place

   NOTE(review): the return type and the declaration of the loop index
   `i' are not visible in this excerpt - confirm against the full file.  */
14 saxpy (int n
, float a
, float *x
, float *y
)
/* The loop body runs zero times when n <= 0.  */
18 for (i
= 0; i
< n
; i
++)
/* y[i] appears on both sides, so the previous value of y[i] is
   accumulated into the result.  */
20 y
[i
] = a
* x
[i
] + y
[i
];
/* context_check: query the current CUDA context through the driver API
   and through acc_get_current_cuda_context, reporting problems on
   stderr.

   ctx1 - a CUcontext supplied by the caller.

   NOTE(review): the declarations of ctx2, ctx3 and r, the comparisons
   that guard the second and third messages, and whatever follows each
   fprintf are not visible in this excerpt.  Presumably ctx1 is compared
   against ctx2 and ctx3 - confirm against the full file.  */
25 context_check (CUcontext ctx1
)
/* Store the context reported by cuCtxGetCurrent in ctx2; r holds the
   call's result code.  */
30 r
= cuCtxGetCurrent (&ctx2
);
/* Any result other than CUDA_SUCCESS is reported together with its
   numeric value.  */
31 if (r
!= CUDA_SUCCESS
)
33 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
/* NOTE(review): the condition guarding this message is not visible
   here.  */
39 fprintf (stderr
, "new context established\n");
/* Store the value returned by acc_get_current_cuda_context, cast to
   CUcontext, in ctx3.  */
43 ctx3
= (CUcontext
) acc_get_current_cuda_context ();
/* NOTE(review): the condition guarding this message is not visible
   here.  */
47 fprintf (stderr
, "acc_get_current_cuda_context returned wrong value\n");
/* main: driver for "Test 4 - OpenACC creates, cuBLAS shares".

   Visible sequence of operations:
     1. select NVIDIA device 0 with acc_set_device_num and read the
	current CUDA context into pctx;
     2. allocate three host arrays of N floats and fill them with
	pseudo-random values;
     3. compute h_Y2 = alpha * h_X + h_Y2 in an OpenACC parallel region;
     4. copy h_X and h_Y1 in with acc_copyin, create a cuBLAS handle and
	run cublasSaxpy on the returned pointers;
     5. copy the result back into h_Y1 and compare it against h_Y2;
     6. release the mappings, destroy the handle and shut OpenACC down.
   context_check (pctx) is called after the cuBLAS calls, after the
   copy back and around cublasDestroy.

   argc, argv - not referenced by any visible line.

   NOTE(review): N, alpha, and the declarations of i, r, s, h, pctx, d_X,
   d_Y, diff, error_norm and ref_norm are not visible in this excerpt,
   nor are the guards in front of several fprintf calls or what follows
   each message - confirm against the full file.  */
55 main (int argc
, char **argv
)
63 float *h_X
, *h_Y1
, *h_Y2
;
69 /* Test 4 - OpenACC creates, cuBLAS shares. */
71 acc_set_device_num (0, acc_device_nvidia
);
/* Record the current CUDA context in pctx.  */
73 r
= cuCtxGetCurrent (&pctx
);
74 if (r
!= CUDA_SUCCESS
)
76 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
/* Host buffers, N floats each.  NOTE(review): the checks guarding the
   three "malloc failed" messages are not visible here.  */
80 h_X
= (float *) malloc (N
* sizeof (float));
83 fprintf (stderr
, "malloc failed: for h_X\n");
87 h_Y1
= (float *) malloc (N
* sizeof (float));
90 fprintf (stderr
, "malloc failed: for h_Y1\n");
94 h_Y2
= (float *) malloc (N
* sizeof (float));
97 fprintf (stderr
, "malloc failed: for h_Y2\n");
/* Fill the inputs.  h_Y1[i] and h_Y2[i] receive the same value, so the
   two computations below start from identical data.  */
101 for (i
= 0; i
< N
; i
++)
103 h_X
[i
] = rand () / (float) RAND_MAX
;
104 h_Y2
[i
] = h_Y1
[i
] = rand () / (float) RAND_MAX
;
/* OpenACC version of the computation; its result is left in h_Y2.  */
107 #pragma acc parallel copyin (h_X[0:N]), copy (h_Y2[0:N]) copy (alpha)
111 for (i
= 0; i
< N
; i
++)
113 h_Y2
[i
] = alpha
* h_X
[i
] + h_Y2
[i
];
/* Read the current context into pctx again, overwriting the value
   obtained earlier.  */
117 r
= cuCtxGetCurrent (&pctx
);
118 if (r
!= CUDA_SUCCESS
)
120 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
/* Copy h_X in; the returned pointer is later handed to cublasSaxpy.  */
124 d_X
= (float *) acc_copyin (&h_X
[0], N
* sizeof (float));
/* This message belongs to the h_X copyin, so it names h_X.
   NOTE(review): the guard for this message is not visible here.  */
127 fprintf (stderr
, "copyin error h_X\n");
/* Copy h_Y1 in the same way.  */
131 d_Y
= (float *) acc_copyin (&h_Y1
[0], N
* sizeof (float));
134 fprintf (stderr
, "copyin error h_Y1\n");
/* Create the cuBLAS handle h; s holds the cuBLAS status.  */
138 s
= cublasCreate (&h
);
139 if (s
!= CUBLAS_STATUS_SUCCESS
)
141 fprintf (stderr
, "cublasCreate failed: %d\n", s
);
145 context_check (pctx
);
/* cuBLAS version of the computation, using d_X and d_Y with stride 1.  */
147 s
= cublasSaxpy (h
, N
, &alpha
, d_X
, 1, d_Y
, 1);
148 if (s
!= CUBLAS_STATUS_SUCCESS
)
150 fprintf (stderr
, "cublasSaxpy failed: %d\n", s
);
154 context_check (pctx
);
/* Bring the cuBLAS result back into h_Y1.  */
156 acc_memcpy_from_device (&h_Y1
[0], d_Y
, N
* sizeof (float));
158 context_check (pctx
);
/* Accumulate the squared difference between the two results and the
   squared magnitude of the OpenACC result.  */
163 for (i
= 0; i
< N
; ++i
)
167 diff
= h_Y1
[i
] - h_Y2
[i
];
168 error_norm
+= diff
* diff
;
169 ref_norm
+= h_Y2
[i
] * h_Y2
[i
];
172 error_norm
= (float) sqrt ((double) error_norm
);
173 ref_norm
= (float) sqrt ((double) ref_norm
);
/* Report a math error when the reference norm is below 1e-7 in
   magnitude or the relative error is at least 1e-6.  */
175 if ((fabs (ref_norm
) < 1e-7) || ((error_norm
/ ref_norm
) >= 1e-6f
))
177 fprintf (stderr
, "math error\n");
/* Undo the two acc_copyin calls above.  */
181 acc_delete (&h_X
[0], N
* sizeof (float));
182 acc_delete (&h_Y1
[0], N
* sizeof (float));
188 context_check (pctx
);
190 s
= cublasDestroy (h
);
191 if (s
!= CUBLAS_STATUS_SUCCESS
)
193 fprintf (stderr
, "cublasDestroy failed: %d\n", s
);
197 context_check (pctx
);
199 acc_shutdown (acc_device_nvidia
);
/* Query the current context once more after acc_shutdown.  */
201 r
= cuCtxGetCurrent (&pctx
);
202 if (r
!= CUDA_SUCCESS
)
204 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
/* NOTE(review): the condition guarding this message is not visible
   here; presumably it tests pctx after the shutdown - confirm.  */
210 fprintf (stderr
, "Unexpected context\n");