6 #include "matmul_shared.h"
9 ////////////////////////////////////////////////////////////////////////////////////////////////
12 int callbackFlag
= FALSE
;
14 ELEM_TYPE A
[MATRIX_A_ROWS
* MATRIX_A_COLS
] __attribute__((aligned(128))) = { ((ELEM_TYPE
)(0.0)) };
15 ELEM_TYPE B
[MATRIX_B_ROWS
* MATRIX_B_COLS
] __attribute__((aligned(128))) = { ((ELEM_TYPE
)(0.0)) };
16 ELEM_TYPE C
[MATRIX_C_ROWS
* MATRIX_C_COLS
] __attribute__((aligned(128))) = { ((ELEM_TYPE
)(0.0)) };
17 char __wrRecord
[NUM_WRS_PER_ITER
* SIZEOF_16(WRRecord
)];
18 volatile WRRecord
* wrRecord
[NUM_WRS_PER_ITER
];
20 ////////////////////////////////////////////////////////////////////////////////////////////////
21 // Function Prototypes
// Program entry point (defined below).
int main(int argc, char *argv[]);

// Completion callback handed to InitOffloadAPI(); `ptr` is treated as a
// WRRecord* by the definition further down.
void wrCallback(void *ptr);
27 ////////////////////////////////////////////////////////////////////////////////////////////////
// main: drives the Offload API matrix-multiply benchmark.
//   1. Prints the run configuration (SPE count, matrix shapes, work-request
//      tiling, per-request data size and flop estimates).
//   2. Points every wrRecord[] entry into the __wrRecord byte buffer and stores
//      the starting row/column of the C tile that record covers.
//   3. Fills A and B with values derived from rand().
//   4. Calls InitOffloadAPI(wrCallback), then loops while iterCount (initialised
//      to REPEAT_COUNT) is > 0: one sendWorkRequest() per
//      [NUM_ROWS_PER_WR x NUM_COLS_PER_WR] tile of C, followed by a spin on
//      callbackFlag that keeps calling OffloadAPIProgress().
//   5. Optionally prints the matrices (DISPLAY_MATRICES != 0), then reports
//      total GFlops, elapsed wall-clock time and GFlops/s.
// argc / argv are not referenced anywhere in the visible code.
30 int main(int argc
, char* argv
[]) {
32 // Display a header for the user
33 printf("----- Offload API : Matrix Multiply -----\n");
34 printf(" Number of SPEs Used : %d\n", NUM_SPE_THREADS
);
35 printf(" Matrices : A[%d x %d], B[%d x %d], C[%d x %d]\n",
36 MATRIX_A_ROWS
, MATRIX_A_COLS
, MATRIX_B_ROWS
, MATRIX_B_COLS
, MATRIX_C_ROWS
, MATRIX_C_COLS
38 printf(" Work Requests / Iteration : %d\n", NUM_WRS_PER_ITER
);
39 printf(" [Rows x Cols] / Work Request : [%d x %d]\n", NUM_ROWS_PER_WR
, NUM_COLS_PER_WR
);
// Bytes attributed to one work request (informational only; just printed):
// the A rows, the B columns, plus (rows + cols) further elements.
// NOTE(review): the last term is NUM_ROWS_PER_WR + NUM_COLS_PER_WR, whereas
// CDataSize in the send loop is NUM_ROWS_PER_WR * NUM_COLS_PER_WR -- confirm
// which one this estimate is meant to use.
40 register int wrDataSize
= ((MATRIX_A_COLS
* NUM_ROWS_PER_WR
) +
41 (MATRIX_B_ROWS
* NUM_COLS_PER_WR
) +
42 (NUM_ROWS_PER_WR
+ NUM_COLS_PER_WR
)
43 ) * sizeof(ELEM_TYPE
);
// Flop estimates: (output elements) * MATRIX_A_COLS inner-product terms * 2
// (presumably one multiply and one add per term).
44 float flopsPerIteration
= (float)MATRIX_C_ROWS
* (float)MATRIX_C_COLS
* (float)MATRIX_A_COLS
* 2.0f
;
45 float flopsPerWR
= (float)NUM_ROWS_PER_WR
* (float)NUM_COLS_PER_WR
* (float)MATRIX_A_COLS
* 2.0f
;
47 printf(" Data / Work Request : %d (0x%08X)\n", wrDataSize
, wrDataSize
);
48 printf(" Flops / Work Request : %.3f KFlops\n", flopsPerWR
/ 1000.0f
);
49 printf(" Flops / Iteration : %.3f MFlops\n", flopsPerIteration
/ 1000000.0f
);
53 //printf(" NUM_WRS_PER_ITER = %d\n", NUM_WRS_PER_ITER);
54 //printf(" __wrRecord @ %p\n", __wrRecord);
55 //printf(" SIZEOF_16(WRRecord) = %d\n", SIZEOF_16(WRRecord));
56 //printf(" wrRecord @ %p\n", wrRecord);
59 // Set the random number generator's seed value
62 // Initialize the wrRecord pointers
// (r1, c1) walks the grid of C tiles: MATRIX_C_ROWS / NUM_ROWS_PER_WR tile rows
// by MATRIX_C_COLS / NUM_COLS_PER_WR tile columns.
64 for (r1
= 0; r1
< (MATRIX_C_ROWS
/ NUM_ROWS_PER_WR
); r1
++) {
65 for (c1
= 0; c1
< (MATRIX_C_COLS
/ NUM_COLS_PER_WR
); c1
++) {
// Linear tile number, row-major over the tile grid.
66 register int index
= c1
+ (r1
* (MATRIX_C_COLS
/ NUM_COLS_PER_WR
));
// Slot `index` of the raw byte buffer, viewed as a WRRecord; the stride between
// slots is SIZEOF_16(WRRecord) bytes.
67 wrRecord
[index
] = (WRRecord
*)(__wrRecord
+ (index
* SIZEOF_16(WRRecord
)));
// Remember which element of C is the top-left corner of this tile.
68 wrRecord
[index
]->startRow
= r1
* NUM_ROWS_PER_WR
;
69 wrRecord
[index
]->startCol
= c1
* NUM_COLS_PER_WR
;
72 //printf("PPE :: wrRecord @ %p = { startRow = %d, startCol = %d, ... } ...\n",
73 // wrRecord[index], wrRecord[index]->startRow, wrRecord[index]->startCol
78 // Fill in A and B with random values
// NOTE(review): both operands of '/' below are int (rand() returns int), so
// the quotient is truncated before it is stored, and the divisor
// (rand() % 100) can be 0, which is a division by zero.  The same expression
// is used for B.  Consider a non-zero divisor and floating-point division.
80 for (r0
= 0; r0
< MATRIX_A_ROWS
; r0
++)
81 for (c0
= 0; c0
< MATRIX_A_COLS
; c0
++)
82 A
[c0
+ (r0
* MATRIX_A_COLS
)] = ((rand() % 100) / (rand() % 100));
83 // NOTE: For the sake of filling in random numbers, treat B as if it is in row-major form
84 for (r0
= 0; r0
< MATRIX_B_ROWS
; r0
++)
85 for (c0
= 0; c0
< MATRIX_B_COLS
; c0
++)
86 B
[c0
+ (r0
* MATRIX_B_COLS
)] = ((rand() % 100) / (rand() % 100));
88 // Initialize the Offload API
// wrCallback is registered here as the Offload API's callback.
89 InitOffloadAPI(wrCallback
);
// Wall-clock start of the timed region (the matching end sample is taken after
// the main loop).
96 gettimeofday(&startTime
, NULL
);
98 // Enter the main loop
99 int iterCount
= REPEAT_COUNT
;
100 while (iterCount
> 0) {
102 // Send all of the work requests for this iteration
// One work request per C tile; (r, c) is the tile's position in the tile grid.
103 for (int r
= 0; r
< (MATRIX_C_ROWS
/ NUM_ROWS_PER_WR
); r
++) {
104 for (int c
= 0; c
< (MATRIX_C_COLS
/ NUM_COLS_PER_WR
); c
++) {
106 // NOTE: A stored in row-major form
107 // B stored in col-major form
// First element of the NUM_ROWS_PER_WR consecutive rows of A for tile row r,
// and the byte length of those rows.
109 register ELEM_TYPE
* ARowPtr
= A
+ ((r
* NUM_ROWS_PER_WR
) * MATRIX_A_COLS
);
110 register int ADataSize
= NUM_ROWS_PER_WR
* MATRIX_A_COLS
* sizeof(ELEM_TYPE
);
// First element of the NUM_COLS_PER_WR consecutive columns of B for tile
// column c (columns are contiguous because B is column-major), and their size.
111 register ELEM_TYPE
* BColPtr
= B
+ ((c
* NUM_COLS_PER_WR
) * MATRIX_B_ROWS
);
112 register int BDataSize
= NUM_COLS_PER_WR
* MATRIX_B_ROWS
* sizeof(ELEM_TYPE
);
// Record for this tile, using the same row-major tile numbering as the
// initialisation loop above.
113 register volatile WRRecord
* recordPtr
= wrRecord
[c
+ (r
* (MATRIX_C_COLS
/ NUM_COLS_PER_WR
))];
// Size in bytes of one output tile (rows * cols elements).
114 register int CDataSize
= NUM_ROWS_PER_WR
* NUM_COLS_PER_WR
* sizeof(ELEM_TYPE
);
117 //printf("PPE :: B = %p, BColPtr = %p...\n", B, BColPtr);
119 //printf("PPE :: B data = { ");
120 //for (j = 0; j < MATRIX_B_COLS; j++)
121 // for (i = 0; i < MATRIX_B_ROWS; i++)
122 // printf("%f ", *(BColPtr + i + (j * MATRIX_B_ROWS)));
// recordPtr is passed twice: as the write-only buffer and as the user data
// (presumably the pointer later handed to wrCallback -- confirm against the
// Offload API).
// NOTE(review): the flag name suggests the read/write buffer (the A rows) is
// to be treated as read-only -- confirm against the Offload API.
125 sendWorkRequest(FUNC_CALC
, // Function Index
126 ARowPtr
, ADataSize
, // Read/Write Data
127 BColPtr
, BDataSize
, // Read-Only Data
128 (void*)recordPtr
, CDataSize
, // Write-Only Data
129 (void*)recordPtr
, // User Data
130 WORK_REQUEST_FLAGS_RW_IS_RO
// Flags
135 // Wait for them to complete (barrier for all work requests)
// Busy-wait: keep calling OffloadAPIProgress() until callbackFlag is no longer
// FALSE.
136 while (callbackFlag
== FALSE
) OffloadAPIProgress();
137 callbackFlag
= FALSE
; // reset the flag
139 // Decrement the iteration count
143 //displayLastWRTimes();
// Wall-clock end of the timed region.
148 gettimeofday(&endTime
, NULL
);
150 // Close the Offload API
153 // Display the matrices
154 #if DISPLAY_MATRICES != 0
156 printf("matrix A [%d x %d] = {\n", MATRIX_A_ROWS
, MATRIX_A_COLS
);
157 for (r2
= 0; r2
< MATRIX_A_ROWS
; r2
++) {
159 for (c2
= 0; c2
< MATRIX_A_COLS
; c2
++) {
// Row-major index into A.
160 register int index
= c2
+ (r2
* MATRIX_A_COLS
);
162 printf("%05.3f ", A
[index
]);
164 printf("%05.3lf ", A
[index
]);
169 printf("matrix B [%d x %d] = {\n", MATRIX_B_ROWS
, MATRIX_B_COLS
);
170 for (r2
= 0; r2
< MATRIX_B_ROWS
; r2
++) {
172 for (c2
= 0; c2
< MATRIX_B_COLS
; c2
++) {
// Column-major index into B.
// NOTE(review): the column stride used is MATRIX_A_ROWS, not MATRIX_B_ROWS;
// this only addresses B correctly if the two are equal -- confirm.
173 register int index
= r2
+ (c2
* MATRIX_A_ROWS
);
175 printf("%05.3f ", B
[index
]);
177 printf("%05.3lf ", B
[index
]);
182 printf("matrix C [%d x %d] = {\n", MATRIX_C_ROWS
, MATRIX_C_COLS
);
183 for (r2
= 0; r2
< MATRIX_C_ROWS
; r2
++) {
185 for (c2
= 0; c2
< MATRIX_C_COLS
; c2
++) {
// Row-major index into C.
// NOTE(review): the row stride used is MATRIX_A_COLS, while the loop bound is
// MATRIX_C_COLS and wrCallback stores into C with stride MATRIX_C_COLS; this
// only addresses C correctly if the two are equal -- confirm.
186 register int index
= c2
+ (r2
* MATRIX_A_COLS
);
188 printf("%05.3f ", C
[index
]);
190 printf("%05.3lf ", C
[index
]);
195 #endif // DISPLAY_MATRICES != 0
197 // Calculate the total flops
// Uses REPEAT_COUNT (not iterCount), i.e. assumes every iteration ran.
198 float totalGFlops
= (float)REPEAT_COUNT
* flopsPerIteration
/ 1000000000.0f
;
199 printf("Total Flops : %.3lf GFlops\n", totalGFlops
);
201 // Calculate the time taken
// Convert each timeval sample to seconds as a double (tv_usec is microseconds).
202 double startTimeD
= (double)startTime
.tv_sec
+ ((double)startTime
.tv_usec
/ 1000000.0);
203 double endTimeD
= (double)endTime
.tv_sec
+ ((double)endTime
.tv_usec
/ 1000000.0);
204 double timeDiff
= endTimeD
- startTimeD
;
205 printf("Time Taken : %.6lf secs\n", timeDiff
);
206 printf("Average GFlops/s : %.6lf\n", (double)totalGFlops
/ timeDiff
);
214 void wrCallback(void* ptr
) {
216 static int completeCounter
= 0;
217 register WRRecord
* wrRecord
= (WRRecord
*)ptr
;
220 //printf("PPE :: wrRecord @ %p = { startRow = %d, startCol = %d, ... } ...\n",
221 // wrRecord, wrRecord->startRow, wrRecord->startCol
224 // Display text from time to time for the user
225 #if DISPLAY_WR_FINISH_FREQ != 0
226 if (completeCounter
% DISPLAY_WR_FINISH_FREQ
== 0)
227 printf("PPE :: [INFO] :: completeCounter = %d...\n", completeCounter
);
230 // Merge the results into the C matrix
231 for (int r
= 0; r
< NUM_ROWS_PER_WR
; r
++)
232 for (int c
= 0; c
< NUM_COLS_PER_WR
; c
++) {
233 register int gIndex
= (wrRecord
->startCol
+ c
) + ((wrRecord
->startRow
+ r
) * MATRIX_C_COLS
);
234 register int lIndex
= c
+ (r
* NUM_COLS_PER_WR
);
237 //printf("PPE :: [%d x %d] = %f : gIndex = %d, lIndex = %d\n", r, c, wrRecord->C[lIndex], gIndex, lIndex);
239 C
[gIndex
] = wrRecord
->C
[lIndex
];
242 // Count this completion
245 // Check to see if all work requests for the iteration have completed
246 if (completeCounter
>= NUM_WRS_PER_ITER
) {