Bug #1081: Report an error on command-line flags that expect a wholly-missing integer...
[charm.git] / examples / charm++ / cell / OffloadAPI / matmul / matmul.cpp
blob83ab331bd60582e527239e924ce7a4ab903549aa
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <spert_ppu.h>
5 #include <sys/time.h>
6 #include "matmul_shared.h"
9 ////////////////////////////////////////////////////////////////////////////////////////////////
10 // Global Data
12 int callbackFlag = FALSE;
14 ELEM_TYPE A[MATRIX_A_ROWS * MATRIX_A_COLS] __attribute__((aligned(128))) = { ((ELEM_TYPE)(0.0)) };
15 ELEM_TYPE B[MATRIX_B_ROWS * MATRIX_B_COLS] __attribute__((aligned(128))) = { ((ELEM_TYPE)(0.0)) };
16 ELEM_TYPE C[MATRIX_C_ROWS * MATRIX_C_COLS] __attribute__((aligned(128))) = { ((ELEM_TYPE)(0.0)) };
17 char __wrRecord[NUM_WRS_PER_ITER * SIZEOF_16(WRRecord)];
18 volatile WRRecord* wrRecord[NUM_WRS_PER_ITER];
20 ////////////////////////////////////////////////////////////////////////////////////////////////
21 // Function Prototypes
// Program entry point: runs REPEAT_COUNT iterations of the offloaded matrix multiply
int main(int argc, char** argv);

// Work request completion callback registered with InitOffloadAPI(); 'ptr' is the user data
//   pointer that was passed to sendWorkRequest()
void wrCallback(void* ptr);
27 ////////////////////////////////////////////////////////////////////////////////////////////////
28 // Function Bodies
30 int main(int argc, char* argv[]) {
32 // Display a header for the user
33 printf("----- Offload API : Matrix Multiply -----\n");
34 printf(" Number of SPEs Used : %d\n", NUM_SPE_THREADS);
35 printf(" Matrices : A[%d x %d], B[%d x %d], C[%d x %d]\n",
36 MATRIX_A_ROWS, MATRIX_A_COLS, MATRIX_B_ROWS, MATRIX_B_COLS, MATRIX_C_ROWS, MATRIX_C_COLS
38 printf(" Work Requests / Iteration : %d\n", NUM_WRS_PER_ITER);
39 printf(" [Rows x Cols] / Work Request : [%d x %d]\n", NUM_ROWS_PER_WR, NUM_COLS_PER_WR);
40 register int wrDataSize = ((MATRIX_A_COLS * NUM_ROWS_PER_WR) +
41 (MATRIX_B_ROWS * NUM_COLS_PER_WR) +
42 (NUM_ROWS_PER_WR + NUM_COLS_PER_WR)
43 ) * sizeof(ELEM_TYPE);
44 float flopsPerIteration = (float)MATRIX_C_ROWS * (float)MATRIX_C_COLS * (float)MATRIX_A_COLS * 2.0f;
45 float flopsPerWR = (float)NUM_ROWS_PER_WR * (float)NUM_COLS_PER_WR * (float)MATRIX_A_COLS * 2.0f;
47 printf(" Data / Work Request : %d (0x%08X)\n", wrDataSize, wrDataSize);
48 printf(" Flops / Work Request : %.3f KFlops\n", flopsPerWR / 1000.0f);
49 printf(" Flops / Iteration : %.3f MFlops\n", flopsPerIteration / 1000000.0f);
52 // DEBUG
53 //printf(" NUM_WRS_PER_ITER = %d\n", NUM_WRS_PER_ITER);
54 //printf(" __wrRecord @ %p\n", __wrRecord);
55 //printf(" SIZEOF_16(WRRecord) = %d\n", SIZEOF_16(WRRecord));
56 //printf(" wrRecord @ %p\n", wrRecord);
59 // Set the random number generator's seed value
60 srand(0);
62 // Initialize the wrRecord pointers
63 register int r1,c1;
64 for (r1 = 0; r1 < (MATRIX_C_ROWS / NUM_ROWS_PER_WR); r1++) {
65 for (c1 = 0; c1 < (MATRIX_C_COLS / NUM_COLS_PER_WR); c1++) {
66 register int index = c1 + (r1 * (MATRIX_C_COLS / NUM_COLS_PER_WR));
67 wrRecord[index] = (WRRecord*)(__wrRecord + (index * SIZEOF_16(WRRecord)));
68 wrRecord[index]->startRow = r1 * NUM_ROWS_PER_WR;
69 wrRecord[index]->startCol = c1 * NUM_COLS_PER_WR;
71 //// DEBUG
72 //printf("PPE :: wrRecord @ %p = { startRow = %d, startCol = %d, ... } ...\n",
73 // wrRecord[index], wrRecord[index]->startRow, wrRecord[index]->startCol
74 // );
78 // Fill in A and B with random values
79 register int r0,c0;
80 for (r0 = 0; r0 < MATRIX_A_ROWS; r0++)
81 for (c0 = 0; c0 < MATRIX_A_COLS; c0++)
82 A[c0 + (r0 * MATRIX_A_COLS)] = ((rand() % 100) / (rand() % 100));
83 // NOTE: For the sake of filling in random numbers, treat B as if it is in row-major form
84 for (r0 = 0; r0 < MATRIX_B_ROWS; r0++)
85 for (c0 = 0; c0 < MATRIX_B_COLS; c0++)
86 B[c0 + (r0 * MATRIX_B_COLS)] = ((rand() % 100) / (rand() % 100));
88 // Initialize the Offload API
89 InitOffloadAPI(wrCallback);
91 // DEBUG
92 //enableTrace();
94 // Start timing
95 timeval startTime;
96 gettimeofday(&startTime, NULL);
98 // Entry the main loop
99 int iterCount = REPEAT_COUNT;
100 while (iterCount > 0) {
102 // Send all of the work requests for this iteration
103 for (int r = 0; r < (MATRIX_C_ROWS / NUM_ROWS_PER_WR); r++) {
104 for (int c = 0; c < (MATRIX_C_COLS / NUM_COLS_PER_WR); c++) {
106 // NOTE: A stored in row-major form
107 // B stored in col-major form
109 register ELEM_TYPE* ARowPtr = A + ((r * NUM_ROWS_PER_WR) * MATRIX_A_COLS);
110 register int ADataSize = NUM_ROWS_PER_WR * MATRIX_A_COLS * sizeof(ELEM_TYPE);
111 register ELEM_TYPE* BColPtr = B + ((c * NUM_COLS_PER_WR) * MATRIX_B_ROWS);
112 register int BDataSize = NUM_COLS_PER_WR * MATRIX_B_ROWS * sizeof(ELEM_TYPE);
113 register volatile WRRecord* recordPtr = wrRecord[c + (r * (MATRIX_C_COLS / NUM_COLS_PER_WR))];
114 register int CDataSize = NUM_ROWS_PER_WR * NUM_COLS_PER_WR * sizeof(ELEM_TYPE);
116 //// DEBUG
117 //printf("PPE :: B = %p, BColPtr = %p...\n", B, BColPtr);
118 //register int i, j;
119 //printf("PPE :: B data = { ");
120 //for (j = 0; j < MATRIX_B_COLS; j++)
121 // for (i = 0; i < MATRIX_B_ROWS; i++)
122 // printf("%f ", *(BColPtr + i + (j * MATRIX_B_ROWS)));
123 //printf("}...\n");
125 sendWorkRequest(FUNC_CALC, // Funcion Index
126 ARowPtr, ADataSize, // Read/Write Data
127 BColPtr, BDataSize, // Read-Only Data
128 (void*)recordPtr, CDataSize, // Write-Only Data
129 (void*)recordPtr, // User Data
130 WORK_REQUEST_FLAGS_RW_IS_RO // Flags
135 // Wait for them to complete (barrier for all work requests)
136 while (callbackFlag == FALSE) OffloadAPIProgress();
137 callbackFlag = FALSE; // reset the flag
139 // Decrement the iteration count
140 iterCount--;
142 // DEBUG
143 //displayLastWRTimes();
146 // Stop Timing
147 timeval endTime;
148 gettimeofday(&endTime, NULL);
150 // Close the Offload API
151 CloseOffloadAPI();
153 // Display the matrices
154 #if DISPLAY_MATRICES != 0
155 register int r2, c2;
156 printf("matrix A [%d x %d] = {\n", MATRIX_A_ROWS, MATRIX_A_COLS);
157 for (r2 = 0; r2 < MATRIX_A_ROWS; r2++) {
158 printf(" ");
159 for (c2 = 0; c2 < MATRIX_A_COLS; c2++) {
160 register int index = c2 + (r2 * MATRIX_A_COLS);
161 #if USE_DOUBLE == 0
162 printf("%05.3f ", A[index]);
163 #else
164 printf("%05.3lf ", A[index]);
165 #endif
167 printf("\n");
169 printf("matrix B [%d x %d] = {\n", MATRIX_B_ROWS, MATRIX_B_COLS);
170 for (r2 = 0; r2 < MATRIX_B_ROWS; r2++) {
171 printf(" ");
172 for (c2 = 0; c2 < MATRIX_B_COLS; c2++) {
173 register int index = r2 + (c2 * MATRIX_A_ROWS);
174 #if USE_DOUBLE == 0
175 printf("%05.3f ", B[index]);
176 #else
177 printf("%05.3lf ", B[index]);
178 #endif
180 printf("\n");
182 printf("matrix C [%d x %d] = {\n", MATRIX_C_ROWS, MATRIX_C_COLS);
183 for (r2 = 0; r2 < MATRIX_C_ROWS; r2++) {
184 printf(" ");
185 for (c2 = 0; c2 < MATRIX_C_COLS; c2++) {
186 register int index = c2 + (r2 * MATRIX_A_COLS);
187 #if USE_DOUBLE == 0
188 printf("%05.3f ", C[index]);
189 #else
190 printf("%05.3lf ", C[index]);
191 #endif
193 printf("\n");
195 #endif // DISPLAY_MATRICES != 0
197 // Calculate the total flops
198 float totalGFlops = (float)REPEAT_COUNT * flopsPerIteration / 1000000000.0f;
199 printf("Total Flops : %.3lf GFlops\n", totalGFlops);
201 // Calculate the time taken
202 double startTimeD = (double)startTime.tv_sec + ((double)startTime.tv_usec / 1000000.0);
203 double endTimeD = (double)endTime.tv_sec + ((double)endTime.tv_usec / 1000000.0);
204 double timeDiff = endTimeD - startTimeD;
205 printf("Time Taken : %.6lf secs\n", timeDiff);
206 printf("Average GFlops/s : %.6lf\n", (double)totalGFlops / timeDiff);
209 // All Good
210 return EXIT_SUCCESS;
214 void wrCallback(void* ptr) {
216 static int completeCounter = 0;
217 register WRRecord* wrRecord = (WRRecord*)ptr;
219 // DEBUG
220 //printf("PPE :: wrRecord @ %p = { startRow = %d, startCol = %d, ... } ...\n",
221 // wrRecord, wrRecord->startRow, wrRecord->startCol
222 // );
224 // Display text from time to time for the user
225 #if DISPLAY_WR_FINISH_FREQ != 0
226 if (completeCounter % DISPLAY_WR_FINISH_FREQ == 0)
227 printf("PPE :: [INFO] :: completeCounter = %d...\n", completeCounter);
228 #endif
230 // Merge the results into the C matrix
231 for (int r = 0; r < NUM_ROWS_PER_WR; r++)
232 for (int c = 0; c < NUM_COLS_PER_WR; c++) {
233 register int gIndex = (wrRecord->startCol + c) + ((wrRecord->startRow + r) * MATRIX_C_COLS);
234 register int lIndex = c + (r * NUM_COLS_PER_WR);
236 // DEBUG
237 //printf("PPE :: [%d x %d] = %f : gIndex = %d, lIndex = %d\n", r, c, wrRecord->C[lIndex], gIndex, lIndex);
239 C[gIndex] = wrRecord->C[lIndex];
242 // Count this completion
243 completeCounter++;
245 // Check to see if all work requests for the iteration have completed
246 if (completeCounter >= NUM_WRS_PER_ITER) {
247 completeCounter = 0;
248 callbackFlag = TRUE;