* builtins.def (BUILT_IN_SETJMP): Revert latest change.
[official-gcc.git] / libgomp / testsuite / libgomp.hsa.c / tiling-2.c
blob6e5430455598e9c38f3e97a74529612de372187c
1 /*
3 matmul.c : Matrix Multiplication with tiling for openmp4 example
5 */
7 #include <stdlib.h>
8 #include <math.h>
10 #define BLOCK_SIZE 16
12 #define BLOCK_SIZE 32
14 #define NSECPERSEC 1000000000L
16 typedef struct {
17 int width;
18 int height;
19 int stride;
20 int hpad;
21 float* elements;
22 } Matrix;
24 /* Correctly extract the number of nanoseconds from the two time structures */
25 long int get_nanosecs( struct timespec start_time, struct timespec end_time) {
26 long int nanosecs;
27 if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs =
28 ((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) +
29 ( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ;
30 else nanosecs =
31 (((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) +
32 ( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec );
33 return nanosecs;
36 void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
37 const float* B,const int LDB, const float beta,float* C, const int LDC) ;
38 void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
39 const float* B,const int LDB, const float beta,float* C, const int LDC) ;
40 void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA,
41 const float* B,const int LDB, const float beta,float* C, const int LDC) ;
43 int verify(float* v_res, float* v_ref, int len) {
44 int passed = 1;
45 int i;
46 for (i = 0; i < len; ++i) {
47 if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) {
48 __builtin_abort ();
51 return passed;
55 int main(int argc, char* argv[]){
57 Matrix A,B,Bt,C,Cref;
58 int a1,a2,a3,i,j;
59 struct timespec start_time1, end_time1;
60 struct timespec start_time2, end_time2;
61 long int nanosecs,total_ops;
62 float gflopsTiled,gflopsCPU;
64 a1 = 35;
65 a2 = 28;
66 a3 = 47;
68 A.height = a1;
69 A.width = a2;
70 A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
71 A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
72 A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float));
74 B.height = a2;
75 B.width = a3;
76 B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
77 B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
78 B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float));
80 /* Bt is same as B but stored in column-major order */
81 Bt.height = B.height;
82 Bt.width = B.width;
83 Bt.stride = B.stride;
84 Bt.hpad = B.hpad;
85 Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float));
87 C.height = a1;
88 C.width = a3;
89 C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
90 C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
91 C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float));
93 Cref.height = a1;
94 Cref.width = a3;
95 Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
96 Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
97 Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float));
99 for(i = 0; i < A.hpad ; i++)
100 for(j = 0; j < A.stride; j++) {
101 if (( j<A.width ) && (i<A.height)) {
102 A.elements[i*A.stride + j] = (i % 3);
103 } else {
104 A.elements[i*A.stride + j] = 0.0;
108 /* Initialize B and Bt */
109 for(i = 0; i < B.hpad ; i++)
110 for(j = 0; j < B.stride; j++) {
111 if (( j<B.width ) && (i<B.height)) {
112 B.elements[i*B.stride+j] = (j % 2);
113 Bt.elements[j*Bt.stride+i] = B.elements[i*B.stride+j] ;
114 } else {
115 B.elements[i*B.stride+j] = 0.0;
116 Bt.elements[j*Bt.stride+i] = 0.0;
120 /* zero C, and Cref */
121 for(i = 0; i < C.hpad; i++)
122 for(j = 0; j < C.stride; j++) {
123 C.elements[i*C.stride+j] = 0.0;
124 Cref.elements[i*Cref.stride+j] = 0.0;
127 simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride);
128 tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride);
130 verify(C.elements, Cref.elements, C.height * C.stride);
131 return 0;
134 void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
135 const float* B,const int LDB, const float beta,float* C, const int LDC) {
136 /* A,B, and C are in row-major order */
137 int c_row,c_col,inner;
138 float sum;
139 for (c_col = 0 ; c_col<N; c_col++ ) {
140 for (c_row = 0 ; c_row<M; c_row++ ) {
141 sum = 0.0 ;
142 for (inner = 0 ; inner<K; inner++ ) {
143 sum += A[c_row*LDA + inner] * B[inner*LDB + c_col] ;
145 C[c_row*LDC + c_col] = alpha*sum + beta*C[ c_row*LDC + c_col] ;
150 /***************************
152 tiled_sgemm_tt: Tiled matrix multiplication:
154 ***************************/
156 void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
157 const float*B, const int LDB, const float beta, float*C, const int LDC){
159 #pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
160 #pragma omp distribute collapse(2)
161 for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE) {
162 for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE) {
164 // We now have M/BLOCK_SIZE * N/BLOCK_SIZE teams = (M*N)/(BLOCK_SIZE*BLOCK_SIZE)
165 // The grid global dimensions are M,N,1
166 // The grid local dimensions are BLOCK_SIZE,BLOCK_SIZE,1
168 // -------------------------------------------------------------------
169 // The rest of this code forms the HSAIL kernel with the
170 // pairs of "paralell for collapse(2)" loops repalced with a barrier.
171 // The kernel initializes these values
172 // C_row_start = get_group_id(0) * BLOCK_SIZE
173 // C_col_start = get_group_id(1) * BLOCK_SIZE
174 // row=get_local_id(0)
175 // col=get_local_id(1)
176 // -------------------------------------------------------------------
178 // Each team has a local copy of these mini matrices
179 float As[BLOCK_SIZE][BLOCK_SIZE];
180 float Bs[BLOCK_SIZE][BLOCK_SIZE];
181 float Cs[BLOCK_SIZE][BLOCK_SIZE];
182 int C_row, C_col;
184 /* Zero Cs for this BLOCK */
185 // - - - - - - - - - - - - - - - - - - - -
186 // REPLACE NEXT THREE LINES WITH A BARRIER
187 #pragma omp parallel for collapse(2)
188 for (int row=0 ; row < BLOCK_SIZE ; row++) {
189 for (int col=0 ; col < BLOCK_SIZE ; col++) {
190 // END BARRIER
191 // - - - - - - - - - - - - - - - - - - - -
192 Cs[row][col] = 0.0;
196 // This kblock loop is run on the master thread of each team
197 for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE ) {
199 // Copy global memory values to local memory
200 // - - - - - - - - - - - - - - - - - - - -
201 // REPLACE NEXT THREE LINES WITH A BARRIER
202 #pragma omp parallel for collapse(2)
203 for (int row=0 ; row < BLOCK_SIZE ; row++) {
204 for (int col=0 ; col < BLOCK_SIZE ; col++) {
205 // END BARRIER
206 // - - - - - - - - - - - - - - - - - - - -
207 C_row = C_row_start + row;
208 C_col = C_col_start + col;
209 if ((C_row < M) && (kblock + col < K))
210 As[row][col] = A[(C_row*LDA)+ kblock + col];
211 else
212 As[row][col] = 0;
213 if ((kblock + row < K) && C_col < N)
214 Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
215 else
216 Bs[row][col] = 0;
220 // Calculate Cs <- Sum(As X Bs) across all kblocks
221 // - - - - - - - - - - - - - - - - - - - -
222 // REPLACE NEXT THREE LINES WITH A BARRIER
223 #pragma omp parallel for collapse(2)
224 for (int row=0 ; row < BLOCK_SIZE ; row++) {
225 for (int col=0 ; col < BLOCK_SIZE ; col++) {
226 // END BARRIER
227 // - - - - - - - - - - - - - - - - - - - -
228 for (int e = 0; e < BLOCK_SIZE; ++e)
229 Cs[row][col] += As[row][e] * Bs[e][col];
233 } /* End for kblock .. */
236 // Scale Update actual C from Cs
237 // - - - - - - - - - - - - - - - - - - - -
238 // REPLACE NEXT THREE LINES WITH A BARRIER
239 #pragma omp parallel for collapse(2)
240 for (int row=0 ; row < BLOCK_SIZE ; row++) {
241 for (int col=0 ; col < BLOCK_SIZE ; col++) {
242 // END BARRIER
243 // - - - - - - - - - - - - - - - - - - - -
244 C_row = C_row_start + row;
245 C_col = C_col_start + col;
246 if ((C_row < M) && (C_col < N)) {
247 C[(C_row*LDC)+C_col] = alpha*Cs[row][col] + beta*C[(C_row*LDC)+C_col];
252 // -------------------------------------------------------------------
253 // This is the end of the kernel