3 matmul.c : Matrix Multiplication with tiling for openmp4 example
14 #define NSECPERSEC 1000000000L
24 /* Correctly extract the number of nanoseconds from the two time structures */
25 long int get_nanosecs( struct timespec start_time
, struct timespec end_time
) {
27 if ((end_time
.tv_nsec
-start_time
.tv_nsec
)<0) nanosecs
=
28 ((((long int) end_time
.tv_sec
- (long int) start_time
.tv_sec
)-1)*NSECPERSEC
) +
29 ( NSECPERSEC
+ (long int) end_time
.tv_nsec
- (long int) start_time
.tv_nsec
) ;
31 (((long int) end_time
.tv_sec
- (long int) start_time
.tv_sec
)*NSECPERSEC
) +
32 ( (long int) end_time
.tv_nsec
- (long int) start_time
.tv_nsec
);
36 void simple_sgemm_tt(const int M
,const int N
,const int K
,const float alpha
, const float* A
,const int LDA
,
37 const float* B
,const int LDB
, const float beta
,float* C
, const int LDC
) ;
38 void simple_sgemm_tn(const int M
,const int N
,const int K
,const float alpha
, const float* A
,const int LDA
,
39 const float* B
,const int LDB
, const float beta
,float* C
, const int LDC
) ;
40 void tiled_sgemm_tt(const int M
,const int N
,const int K
,const float alpha
, const float*A
, const int LDA
,
41 const float* B
,const int LDB
, const float beta
,float* C
, const int LDC
) ;
43 int verify(float* v_res
, float* v_ref
, int len
) {
46 for (i
= 0; i
< len
; ++i
) {
47 if (fabs(v_res
[i
] - v_ref
[i
]) > 0.001*v_ref
[i
]) {
55 int main(int argc
, char* argv
[]){
59 struct timespec start_time1
, end_time1
;
60 struct timespec start_time2
, end_time2
;
61 long int nanosecs
,total_ops
;
62 float gflopsTiled
,gflopsCPU
;
70 A
.stride
= (((A
.width
-1)/BLOCK_SIZE
)+1) * BLOCK_SIZE
;
71 A
.hpad
= (((A
.height
-1)/BLOCK_SIZE
)+1) * BLOCK_SIZE
;
72 A
.elements
= (float*)malloc(A
.stride
* A
.hpad
* sizeof(float));
76 B
.stride
= (((B
.width
-1)/BLOCK_SIZE
)+1) * BLOCK_SIZE
;
77 B
.hpad
= (((B
.height
-1)/BLOCK_SIZE
)+1) * BLOCK_SIZE
;
78 B
.elements
= (float*)malloc(B
.stride
* B
.hpad
* sizeof(float));
80 /* Bt is same as B but stored in column-major order */
85 Bt
.elements
= (float*)malloc(Bt
.stride
* Bt
.hpad
* sizeof(float));
89 C
.stride
= (((C
.width
-1)/BLOCK_SIZE
)+1) * BLOCK_SIZE
;
90 C
.hpad
= (((C
.height
-1)/BLOCK_SIZE
)+1) * BLOCK_SIZE
;
91 C
.elements
= (float*)malloc(C
.stride
* C
.hpad
* sizeof(float));
95 Cref
.stride
= (((Cref
.width
-1)/BLOCK_SIZE
)+1) * BLOCK_SIZE
;
96 Cref
.hpad
= (((Cref
.height
-1)/BLOCK_SIZE
)+1) * BLOCK_SIZE
;
97 Cref
.elements
= (float*)malloc(Cref
.stride
* Cref
.hpad
* sizeof(float));
99 for(i
= 0; i
< A
.hpad
; i
++)
100 for(j
= 0; j
< A
.stride
; j
++) {
101 if (( j
<A
.width
) && (i
<A
.height
)) {
102 A
.elements
[i
*A
.stride
+ j
] = (i
% 3);
104 A
.elements
[i
*A
.stride
+ j
] = 0.0;
108 /* Initialize B and Bt */
109 for(i
= 0; i
< B
.hpad
; i
++)
110 for(j
= 0; j
< B
.stride
; j
++) {
111 if (( j
<B
.width
) && (i
<B
.height
)) {
112 B
.elements
[i
*B
.stride
+j
] = (j
% 2);
113 Bt
.elements
[j
*Bt
.stride
+i
] = B
.elements
[i
*B
.stride
+j
] ;
115 B
.elements
[i
*B
.stride
+j
] = 0.0;
116 Bt
.elements
[j
*Bt
.stride
+i
] = 0.0;
120 /* zero C, and Cref */
121 for(i
= 0; i
< C
.hpad
; i
++)
122 for(j
= 0; j
< C
.stride
; j
++) {
123 C
.elements
[i
*C
.stride
+j
] = 0.0;
124 Cref
.elements
[i
*Cref
.stride
+j
] = 0.0;
127 simple_sgemm_tt(A
.height
,B
.width
,B
.height
,1.0,A
.elements
,A
.stride
,B
.elements
,B
.stride
,1.0,Cref
.elements
,Cref
.stride
);
128 tiled_sgemm_tt(A
.height
,B
.width
,B
.height
,1.0,A
.elements
,A
.stride
,B
.elements
,B
.stride
,1.0,C
.elements
,C
.stride
);
130 verify(C
.elements
, Cref
.elements
, C
.height
* C
.stride
);
134 void simple_sgemm_tt(const int M
,const int N
,const int K
,const float alpha
, const float* A
,const int LDA
,
135 const float* B
,const int LDB
, const float beta
,float* C
, const int LDC
) {
136 /* A,B, and C are in row-major order */
137 int c_row
,c_col
,inner
;
139 for (c_col
= 0 ; c_col
<N
; c_col
++ ) {
140 for (c_row
= 0 ; c_row
<M
; c_row
++ ) {
142 for (inner
= 0 ; inner
<K
; inner
++ ) {
143 sum
+= A
[c_row
*LDA
+ inner
] * B
[inner
*LDB
+ c_col
] ;
145 C
[c_row
*LDC
+ c_col
] = alpha
*sum
+ beta
*C
[ c_row
*LDC
+ c_col
] ;
150 /***************************
152 tiled_sgemm_tt: Tiled matrix multiplication:
154 ***************************/
156 void tiled_sgemm_tt(const int M
, const int N
, const int K
, const float alpha
, const float*A
, const int LDA
,
157 const float*B
, const int LDB
, const float beta
, float*C
, const int LDC
){
159 #pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
160 #pragma omp distribute collapse(2)
161 for (int C_row_start
=0 ; C_row_start
< M
; C_row_start
+=BLOCK_SIZE
)
162 for (int C_col_start
=0 ; C_col_start
< N
; C_col_start
+=BLOCK_SIZE
)
164 // Each team has a local copy of these mini matrices
165 float As
[BLOCK_SIZE
][BLOCK_SIZE
];
166 float Bs
[BLOCK_SIZE
][BLOCK_SIZE
];
172 for (int kblock
= 0; kblock
< K
; kblock
+= BLOCK_SIZE
)
174 #pragma omp for collapse(2)
175 for (int row
=0 ; row
< BLOCK_SIZE
; row
++)
176 for (int col
=0 ; col
< BLOCK_SIZE
; col
++)
178 C_row
= C_row_start
+ row
;
179 C_col
= C_col_start
+ col
;
180 if ((C_row
< M
) && (kblock
+ col
< K
))
181 As
[row
][col
] = A
[(C_row
*LDA
)+ kblock
+ col
];
184 if ((kblock
+ row
< K
) && C_col
< N
)
185 Bs
[row
][col
] = B
[((kblock
+row
)*LDB
)+ C_col
];
190 #pragma omp for collapse(2)
191 for (int row
=0 ; row
< BLOCK_SIZE
; row
++)
192 for (int col
=0 ; col
< BLOCK_SIZE
; col
++)
194 for (int e
= 0; e
< BLOCK_SIZE
; ++e
)
195 Cval
+= As
[row
][e
] * Bs
[e
][col
];
197 } /* End for kblock .. */
200 #pragma omp for collapse(2)
201 for (int row
=0 ; row
< BLOCK_SIZE
; row
++)
202 for (int col
=0 ; col
< BLOCK_SIZE
; col
++)
204 C_row
= C_row_start
+ row
;
205 C_col
= C_col_start
+ col
;
206 if ((C_row
< M
) && (C_col
< N
))
207 C
[(C_row
*LDC
)+C_col
] = alpha
*Cval
+ beta
*C
[(C_row
*LDC
)+C_col
];
211 } /* end target teams distribute */