libgomp/testsuite/libgomp.hsa.c/tiling-2.c

   1 /*
   2
   3    matmul.c : Matrix Multiplication with tiling for openmp4 example
   4
   5 */
   6
   7 #include <stdlib.h>
   8 #include <math.h>
   9
  10 #define BLOCK_SIZE 16
  11 /*
  12   #define BLOCK_SIZE 32
  13 */
  14 #define NSECPERSEC 1000000000L
  15
  16 typedef struct {
  17    int width;
  18    int height;
  19    int stride;
  20    int hpad;
  21    float* elements;
  22 } Matrix;
  23
  24 /* Correctly extract the number of nanoseconds from the two time structures */
  25 long int get_nanosecs( struct timespec start_time, struct timespec end_time) {
  26    long int nanosecs;
  27    if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs =
  28       ((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) +
  29       ( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ;
  30    else nanosecs =
  31       (((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) +
  32       ( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec );
  33    return nanosecs;
  34 }
  35
  36 void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
  37      const float* B,const int LDB, const float beta,float* C, const int LDC) ;
  38 void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
  39      const float* B,const int LDB, const float beta,float* C, const int LDC) ;
  40 void  tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA,
  41      const float* B,const int LDB, const float beta,float* C, const int LDC) ;
  42
  43 int verify(float* v_res, float* v_ref, int len) {
  44     int passed = 1;
  45     int i;
  46     for (i = 0; i < len; ++i) {
  47         if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) {
  48           __builtin_abort ();
  49         }
  50     }
  51     return passed;
  52 }
  53
  54
  55 int main(int argc, char* argv[]){
  56
  57    Matrix A,B,Bt,C,Cref;
  58    int a1,a2,a3,i,j;
  59    struct timespec start_time1, end_time1;
  60    struct timespec start_time2, end_time2;
  61    long int nanosecs,total_ops;
  62    float gflopsTiled,gflopsCPU;
  63
  64    a1 = 35;
  65    a2 = 28;
  66    a3 = 47;
  67
  68    A.height = a1;
  69    A.width = a2;
  70    A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
  71    A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
  72    A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float));
  73
  74    B.height = a2;
  75    B.width = a3;
  76    B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
  77    B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
  78    B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float));
  79
  80    /* Bt is same as B but stored in column-major order */
  81    Bt.height = B.height;
  82    Bt.width = B.width;
  83    Bt.stride = B.stride;
  84    Bt.hpad = B.hpad;
  85    Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float));
  86
  87    C.height = a1;
  88    C.width = a3;
  89    C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
  90    C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
  91    C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float));
  92
  93    Cref.height = a1;
  94    Cref.width = a3;
  95    Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
  96    Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
  97    Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float));
  98
  99    for(i = 0; i < A.hpad ; i++)
 100       for(j = 0; j < A.stride; j++) {
 101          if (( j<A.width ) && (i<A.height)) {
 102             A.elements[i*A.stride + j] = (i % 3);
 103          } else {
 104             A.elements[i*A.stride + j] = 0.0;
 105          }
 106       }
 107
 108    /*  Initialize B and Bt */
 109    for(i = 0; i < B.hpad ; i++)
 110       for(j = 0; j < B.stride; j++) {
 111          if (( j<B.width ) && (i<B.height)) {
 112             B.elements[i*B.stride+j] = (j % 2);
 113             Bt.elements[j*Bt.stride+i] = B.elements[i*B.stride+j] ;
 114          } else {
 115             B.elements[i*B.stride+j] = 0.0;
 116             Bt.elements[j*Bt.stride+i] = 0.0;
 117          }
 118       }
 119
 120    /* zero C, and Cref */
 121    for(i = 0; i < C.hpad; i++)
 122       for(j = 0; j < C.stride; j++) {
 123          C.elements[i*C.stride+j] = 0.0;
 124          Cref.elements[i*Cref.stride+j] = 0.0;
 125       }
 126
 127    simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride);
 128    tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride);
 129
 130    verify(C.elements, Cref.elements, C.height * C.stride);
 131    return 0;
 132 }
 133
 134 void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
 135 const float* B,const int LDB, const float beta,float* C, const int LDC) {
 136    /*  A,B, and C  are in row-major order */
 137    int c_row,c_col,inner;
 138    float sum;
 139    for (c_col  = 0 ;  c_col<N; c_col++ ) {
 140       for (c_row = 0 ; c_row<M; c_row++ ) {
 141          sum = 0.0 ;
 142          for (inner = 0 ; inner<K; inner++ ) {
 143             sum += A[c_row*LDA + inner] * B[inner*LDB + c_col] ;
 144          }
 145          C[c_row*LDC + c_col] = alpha*sum + beta*C[ c_row*LDC + c_col] ;
 146       }
 147    }
 148 }
 149
 150 /***************************
 151
 152    tiled_sgemm_tt:  Tiled matrix multiplication:
 153
 154 ***************************/
 155
 156 void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
 157    const float*B, const int LDB, const float beta, float*C, const int LDC){
 158
 159 #pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
 160 #pragma omp distribute collapse(2)
 161    for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE) {
 162       for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE) {
 163
 164 // We now have M/BLOCK_SIZE * N/BLOCK_SIZE teams = (M*N)/(BLOCK_SIZE*BLOCK_SIZE)
 165 // The grid global dimensions are M,N,1
 166 // The grid local dimensions are BLOCK_SIZE,BLOCK_SIZE,1
 167
 168 // -------------------------------------------------------------------
 169 //      The rest of this code forms the HSAIL kernel with the
 170 //      pairs of "paralell for collapse(2)" loops repalced with a barrier.
 171 //      The kernel initializes these values
 172 //      C_row_start = get_group_id(0) * BLOCK_SIZE
 173 //      C_col_start = get_group_id(1) * BLOCK_SIZE
 174 //      row=get_local_id(0)
 175 //      col=get_local_id(1)
 176 // -------------------------------------------------------------------
 177
 178 //       Each team has a local copy of these mini matrices
 179          float As[BLOCK_SIZE][BLOCK_SIZE];
 180          float Bs[BLOCK_SIZE][BLOCK_SIZE];
 181          float Cs[BLOCK_SIZE][BLOCK_SIZE];
 182          int C_row, C_col;
 183
 184          /* Zero Cs for this BLOCK */
 185 // - - - - - - - - - - - - - - - - - - - -
 186 // REPLACE NEXT THREE LINES WITH A BARRIER
 187 #pragma omp parallel for collapse(2)
 188          for (int row=0 ; row < BLOCK_SIZE ; row++) {
 189             for (int col=0 ; col < BLOCK_SIZE ; col++) {
 190 // END BARRIER
 191 // - - - - - - - - - - - - - - - - - - - -
 192                Cs[row][col] = 0.0;
 193             }
 194          }
 195
 196          // This kblock loop is run on the master thread of each team
 197          for (int kblock = 0; kblock  < K ; kblock += BLOCK_SIZE ) {
 198
 199             // Copy global memory values to local memory
 200 // - - - - - - - - - - - - - - - - - - - -
 201 // REPLACE NEXT THREE LINES WITH A BARRIER
 202 #pragma omp parallel for collapse(2)
 203             for (int row=0 ; row < BLOCK_SIZE ; row++) {
 204                for (int col=0 ; col < BLOCK_SIZE ; col++) {
 205 // END BARRIER
 206 // - - - - - - - - - - - - - - - - - - - -
 207                   C_row = C_row_start + row;
 208                   C_col = C_col_start + col;
 209                   if ((C_row < M) && (kblock + col < K))
 210                     As[row][col] = A[(C_row*LDA)+ kblock + col];
 211                   else
 212                     As[row][col] = 0;
 213                   if ((kblock + row < K) && C_col < N)
 214                     Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
 215                   else
 216                     Bs[row][col] = 0;
 217                }
 218             }
 219
 220             // Calculate Cs <- Sum(As X Bs) across all kblocks
 221 // - - - - - - - - - - - - - - - - - - - -
 222 // REPLACE NEXT THREE LINES WITH A BARRIER
 223 #pragma omp parallel for collapse(2)
 224             for (int row=0 ; row < BLOCK_SIZE ; row++) {
 225                for (int col=0 ; col < BLOCK_SIZE ; col++) {
 226 // END BARRIER
 227 // - - - - - - - - - - - - - - - - - - - -
 228                   for (int e = 0; e < BLOCK_SIZE; ++e)
 229                      Cs[row][col] += As[row][e] * Bs[e][col];
 230                 }
 231             }
 232
 233          }  /* End for kblock .. */
 234
 235
 236          // Scale Update actual C from Cs
 237 // - - - - - - - - - - - - - - - - - - - -
 238 // REPLACE NEXT THREE LINES WITH A BARRIER
 239 #pragma omp parallel for collapse(2)
 240          for (int row=0 ; row < BLOCK_SIZE ; row++) {
 241             for (int col=0 ; col < BLOCK_SIZE ; col++) {
 242 // END BARRIER
 243 // - - - - - - - - - - - - - - - - - - - -
 244                C_row = C_row_start + row;
 245                C_col = C_col_start + col;
 246                if ((C_row < M) && (C_col < N)) {
 247                  C[(C_row*LDC)+C_col] = alpha*Cs[row][col] + beta*C[(C_row*LDC)+C_col];
 248                }
 249             }
 250          }
 251
 252 // -------------------------------------------------------------------
 253 // This is the end of the kernel
 254
 255       }
 256    }
 257
 258 }