// Kernel `delay`, exported with C linkage. Takes an output pointer `d_o`
// (clock_t) and a clock_t value `delay`; note that the parameter `delay`
// shadows the kernel's own name inside the body.
// NOTE(review): only fragments of this definition are visible here; the
// declarations of `start` and `ticks`, and any use of `d_o`, are not shown.
2 extern "C" __global__ void
3 delay (clock_t * d_o, clock_t delay)
// Difference between the current clock() reading and `start`.
// NOTE(review): presumably compared against `delay` in a wait loop -- confirm
// against the lines not visible in this excerpt.
12 ticks = clock () - start;
// Kernel `delay2`, exported with C linkage. Takes an `unsigned long` output
// pointer `d_o`, a clock_t value `delay`, and an `unsigned long` value `tid`.
// NOTE(review): only fragments of this definition are visible here; how `d_o`
// and `tid` are used is not shown -- verify against the full body.
15 extern "C" __global__ void
16 delay2 (unsigned long *d_o, clock_t delay, unsigned long tid)
// Difference between the current clock() reading and `start`.
25 ticks = clock () - start;
// Kernel `sum`, exported with C linkage. Takes a clock_t pointer `d_o` and an
// element count `N`.
// NOTE(review): only fragments of this definition are visible here; the
// declaration of the local `sum`, the loop bodies, any barriers, and the
// final store are not shown.
30 extern "C" __global__ void
31 sum (clock_t * d_o, int N)
// Per-block shared scratch array with 32 slots, indexed by threadIdx.x below.
// NOTE(review): this appears to assume blockDim.x <= 32 -- confirm at the
// launch site.
35 __shared__ clock_t ticks[32];
// Each thread starts at its own threadIdx.x and advances by blockDim.x until
// it reaches N.
39 for (i = threadIdx.x; i < N; i += blockDim.x)
// Store this thread's `sum` value into its shared-memory slot.
42 ticks[threadIdx.x] = sum;
// Offsets i = 16, 8, 4, 2, 1 (halved each iteration).
46 for (i = 16; i >= 1; i >>= 1)
// Add the slot `i` positions above into this thread's slot.
// NOTE(review): for threadIdx.x + i >= 32 this would read past ticks[32]
// unless a guard on the lines not visible here restricts the participating
// threads -- verify. Also confirm a barrier separates the write on line 42
// from these cross-thread reads.
49 ticks[threadIdx.x] += ticks[threadIdx.x + i];
// Kernel `mult`, exported with C linkage. Takes an element count `n` and two
// float pointers `x` and `y`.
// NOTE(review): the loop body is not visible in this excerpt.
57 extern "C" __global__ void
58 mult (int n, float *x, float *y)
// `i` is initialised with the flat global thread index...
60 int i = blockIdx.x * blockDim.x + threadIdx.x;
// ...but the loop initialiser below resets `i` to 0, so the value computed
// above is discarded unless it is used on a line not shown here.
// NOTE(review): as written, every thread that reaches this loop iterates over
// all of 0..n-1 -- confirm whether that is intentional.
62 for (i = 0; i < n; i++)