* es.po: Update.
[official-gcc.git] / libgfortran / generated / matmul_r4.c
blob2a85d6b3bccc65370876f351046eb752d37f51ef
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2016 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
5 This file is part of the GNU Fortran runtime library (libgfortran).
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 #include "libgfortran.h"
27 #include <stdlib.h>
28 #include <string.h>
29 #include <assert.h>
32 #if defined (HAVE_GFC_REAL_4)
34 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
35 passed to us by the front-end, in which case we call it for large
36 matrices. */
38 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
39 const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
40 const int *, const GFC_REAL_4 *, const int *,
41 const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
42 int, int);
44 /* The order of loops is different in the case of plain matrix
45 multiplication C=MATMUL(A,B), and in the frequent special case where
46 the argument A is the temporary result of a TRANSPOSE intrinsic:
47 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
48 looking at their strides.
50 The equivalent Fortran pseudo-code is:
52 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
53 IF (.NOT.IS_TRANSPOSED(A)) THEN
54 C = 0
55 DO J=1,N
56 DO K=1,COUNT
57 DO I=1,M
58 C(I,J) = C(I,J)+A(I,K)*B(K,J)
59 ELSE
60 DO J=1,N
61 DO I=1,M
62 S = 0
63 DO K=1,COUNT
64 S = S+A(I,K)*B(K,J)
65 C(I,J) = S
66 ENDIF
69 /* If try_blas is set to a nonzero value, then the matmul function will
70 see if there is a way to perform the matrix multiplication by a call
71 to the BLAS gemm function. */
73 extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
74 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
75 int blas_limit, blas_call gemm);
76 export_proto(matmul_r4);
78 #if defined(HAVE_AVX) && defined(HAVE_AVX2)
79 /* REAL types generate identical code for AVX and AVX2. Only generate
80 an AVX2 function if we are dealing with integer. */
81 #undef HAVE_AVX2
82 #endif
85 /* Put exhaustive list of possible architectures here here, ORed together. */
87 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
89 #ifdef HAVE_AVX
90 static void
91 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
92 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
93 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
94 static void
95 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
96 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
97 int blas_limit, blas_call gemm)
99 const GFC_REAL_4 * restrict abase;
100 const GFC_REAL_4 * restrict bbase;
101 GFC_REAL_4 * restrict dest;
103 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
104 index_type x, y, n, count, xcount, ycount;
106 assert (GFC_DESCRIPTOR_RANK (a) == 2
107 || GFC_DESCRIPTOR_RANK (b) == 2);
109 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
111 Either A or B (but not both) can be rank 1:
113 o One-dimensional argument A is implicitly treated as a row matrix
114 dimensioned [1,count], so xcount=1.
116 o One-dimensional argument B is implicitly treated as a column matrix
117 dimensioned [count, 1], so ycount=1.
120 if (retarray->base_addr == NULL)
122 if (GFC_DESCRIPTOR_RANK (a) == 1)
124 GFC_DIMENSION_SET(retarray->dim[0], 0,
125 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
127 else if (GFC_DESCRIPTOR_RANK (b) == 1)
129 GFC_DIMENSION_SET(retarray->dim[0], 0,
130 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
132 else
134 GFC_DIMENSION_SET(retarray->dim[0], 0,
135 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
137 GFC_DIMENSION_SET(retarray->dim[1], 0,
138 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
139 GFC_DESCRIPTOR_EXTENT(retarray,0));
142 retarray->base_addr
143 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
144 retarray->offset = 0;
146 else if (unlikely (compile_options.bounds_check))
148 index_type ret_extent, arg_extent;
150 if (GFC_DESCRIPTOR_RANK (a) == 1)
152 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
153 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
154 if (arg_extent != ret_extent)
155 runtime_error ("Incorrect extent in return array in"
156 " MATMUL intrinsic: is %ld, should be %ld",
157 (long int) ret_extent, (long int) arg_extent);
159 else if (GFC_DESCRIPTOR_RANK (b) == 1)
161 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
162 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
163 if (arg_extent != ret_extent)
164 runtime_error ("Incorrect extent in return array in"
165 " MATMUL intrinsic: is %ld, should be %ld",
166 (long int) ret_extent, (long int) arg_extent);
168 else
170 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
171 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
172 if (arg_extent != ret_extent)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 1:"
175 " is %ld, should be %ld",
176 (long int) ret_extent, (long int) arg_extent);
178 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
179 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
180 if (arg_extent != ret_extent)
181 runtime_error ("Incorrect extent in return array in"
182 " MATMUL intrinsic for dimension 2:"
183 " is %ld, should be %ld",
184 (long int) ret_extent, (long int) arg_extent);
189 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
191 /* One-dimensional result may be addressed in the code below
192 either as a row or a column matrix. We want both cases to
193 work. */
194 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
196 else
198 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
199 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
203 if (GFC_DESCRIPTOR_RANK (a) == 1)
205 /* Treat it as a a row matrix A[1,count]. */
206 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
207 aystride = 1;
209 xcount = 1;
210 count = GFC_DESCRIPTOR_EXTENT(a,0);
212 else
214 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
215 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
217 count = GFC_DESCRIPTOR_EXTENT(a,1);
218 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
221 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
223 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
224 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
227 if (GFC_DESCRIPTOR_RANK (b) == 1)
229 /* Treat it as a column matrix B[count,1] */
230 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
232 /* bystride should never be used for 1-dimensional b.
233 in case it is we want it to cause a segfault, rather than
234 an incorrect result. */
235 bystride = 0xDEADBEEF;
236 ycount = 1;
238 else
240 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
241 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
242 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
245 abase = a->base_addr;
246 bbase = b->base_addr;
247 dest = retarray->base_addr;
249 /* Now that everything is set up, we perform the multiplication
250 itself. */
252 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
253 #define min(a,b) ((a) <= (b) ? (a) : (b))
254 #define max(a,b) ((a) >= (b) ? (a) : (b))
256 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
257 && (bxstride == 1 || bystride == 1)
258 && (((float) xcount) * ((float) ycount) * ((float) count)
259 > POW3(blas_limit)))
261 const int m = xcount, n = ycount, k = count, ldc = rystride;
262 const GFC_REAL_4 one = 1, zero = 0;
263 const int lda = (axstride == 1) ? aystride : axstride,
264 ldb = (bxstride == 1) ? bystride : bxstride;
266 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
268 assert (gemm != NULL);
269 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
270 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
271 &ldc, 1, 1);
272 return;
276 if (rxstride == 1 && axstride == 1 && bxstride == 1)
278 /* This block of code implements a tuned matmul, derived from
279 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
281 Bo Kagstrom and Per Ling
282 Department of Computing Science
283 Umea University
284 S-901 87 Umea, Sweden
286 from netlib.org, translated to C, and modified for matmul.m4. */
288 const GFC_REAL_4 *a, *b;
289 GFC_REAL_4 *c;
290 const index_type m = xcount, n = ycount, k = count;
292 /* System generated locals */
293 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
294 i1, i2, i3, i4, i5, i6;
296 /* Local variables */
297 GFC_REAL_4 t1[65536], /* was [256][256] */
298 f11, f12, f21, f22, f31, f32, f41, f42,
299 f13, f14, f23, f24, f33, f34, f43, f44;
300 index_type i, j, l, ii, jj, ll;
301 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
303 a = abase;
304 b = bbase;
305 c = retarray->base_addr;
307 /* Parameter adjustments */
308 c_dim1 = rystride;
309 c_offset = 1 + c_dim1;
310 c -= c_offset;
311 a_dim1 = aystride;
312 a_offset = 1 + a_dim1;
313 a -= a_offset;
314 b_dim1 = bystride;
315 b_offset = 1 + b_dim1;
316 b -= b_offset;
318 /* Early exit if possible */
319 if (m == 0 || n == 0 || k == 0)
320 return;
322 /* Empty c first. */
323 for (j=1; j<=n; j++)
324 for (i=1; i<=m; i++)
325 c[i + j * c_dim1] = (GFC_REAL_4)0;
327 /* Start turning the crank. */
328 i1 = n;
329 for (jj = 1; jj <= i1; jj += 512)
331 /* Computing MIN */
332 i2 = 512;
333 i3 = n - jj + 1;
334 jsec = min(i2,i3);
335 ujsec = jsec - jsec % 4;
336 i2 = k;
337 for (ll = 1; ll <= i2; ll += 256)
339 /* Computing MIN */
340 i3 = 256;
341 i4 = k - ll + 1;
342 lsec = min(i3,i4);
343 ulsec = lsec - lsec % 2;
345 i3 = m;
346 for (ii = 1; ii <= i3; ii += 256)
348 /* Computing MIN */
349 i4 = 256;
350 i5 = m - ii + 1;
351 isec = min(i4,i5);
352 uisec = isec - isec % 2;
353 i4 = ll + ulsec - 1;
354 for (l = ll; l <= i4; l += 2)
356 i5 = ii + uisec - 1;
357 for (i = ii; i <= i5; i += 2)
359 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
360 a[i + l * a_dim1];
361 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
362 a[i + (l + 1) * a_dim1];
363 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
364 a[i + 1 + l * a_dim1];
365 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
366 a[i + 1 + (l + 1) * a_dim1];
368 if (uisec < isec)
370 t1[l - ll + 1 + (isec << 8) - 257] =
371 a[ii + isec - 1 + l * a_dim1];
372 t1[l - ll + 2 + (isec << 8) - 257] =
373 a[ii + isec - 1 + (l + 1) * a_dim1];
376 if (ulsec < lsec)
378 i4 = ii + isec - 1;
379 for (i = ii; i<= i4; ++i)
381 t1[lsec + ((i - ii + 1) << 8) - 257] =
382 a[i + (ll + lsec - 1) * a_dim1];
386 uisec = isec - isec % 4;
387 i4 = jj + ujsec - 1;
388 for (j = jj; j <= i4; j += 4)
390 i5 = ii + uisec - 1;
391 for (i = ii; i <= i5; i += 4)
393 f11 = c[i + j * c_dim1];
394 f21 = c[i + 1 + j * c_dim1];
395 f12 = c[i + (j + 1) * c_dim1];
396 f22 = c[i + 1 + (j + 1) * c_dim1];
397 f13 = c[i + (j + 2) * c_dim1];
398 f23 = c[i + 1 + (j + 2) * c_dim1];
399 f14 = c[i + (j + 3) * c_dim1];
400 f24 = c[i + 1 + (j + 3) * c_dim1];
401 f31 = c[i + 2 + j * c_dim1];
402 f41 = c[i + 3 + j * c_dim1];
403 f32 = c[i + 2 + (j + 1) * c_dim1];
404 f42 = c[i + 3 + (j + 1) * c_dim1];
405 f33 = c[i + 2 + (j + 2) * c_dim1];
406 f43 = c[i + 3 + (j + 2) * c_dim1];
407 f34 = c[i + 2 + (j + 3) * c_dim1];
408 f44 = c[i + 3 + (j + 3) * c_dim1];
409 i6 = ll + lsec - 1;
410 for (l = ll; l <= i6; ++l)
412 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
413 * b[l + j * b_dim1];
414 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
415 * b[l + j * b_dim1];
416 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
417 * b[l + (j + 1) * b_dim1];
418 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
419 * b[l + (j + 1) * b_dim1];
420 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
421 * b[l + (j + 2) * b_dim1];
422 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
423 * b[l + (j + 2) * b_dim1];
424 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
425 * b[l + (j + 3) * b_dim1];
426 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
427 * b[l + (j + 3) * b_dim1];
428 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
429 * b[l + j * b_dim1];
430 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
431 * b[l + j * b_dim1];
432 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
433 * b[l + (j + 1) * b_dim1];
434 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
435 * b[l + (j + 1) * b_dim1];
436 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
437 * b[l + (j + 2) * b_dim1];
438 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
439 * b[l + (j + 2) * b_dim1];
440 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
441 * b[l + (j + 3) * b_dim1];
442 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
443 * b[l + (j + 3) * b_dim1];
445 c[i + j * c_dim1] = f11;
446 c[i + 1 + j * c_dim1] = f21;
447 c[i + (j + 1) * c_dim1] = f12;
448 c[i + 1 + (j + 1) * c_dim1] = f22;
449 c[i + (j + 2) * c_dim1] = f13;
450 c[i + 1 + (j + 2) * c_dim1] = f23;
451 c[i + (j + 3) * c_dim1] = f14;
452 c[i + 1 + (j + 3) * c_dim1] = f24;
453 c[i + 2 + j * c_dim1] = f31;
454 c[i + 3 + j * c_dim1] = f41;
455 c[i + 2 + (j + 1) * c_dim1] = f32;
456 c[i + 3 + (j + 1) * c_dim1] = f42;
457 c[i + 2 + (j + 2) * c_dim1] = f33;
458 c[i + 3 + (j + 2) * c_dim1] = f43;
459 c[i + 2 + (j + 3) * c_dim1] = f34;
460 c[i + 3 + (j + 3) * c_dim1] = f44;
462 if (uisec < isec)
464 i5 = ii + isec - 1;
465 for (i = ii + uisec; i <= i5; ++i)
467 f11 = c[i + j * c_dim1];
468 f12 = c[i + (j + 1) * c_dim1];
469 f13 = c[i + (j + 2) * c_dim1];
470 f14 = c[i + (j + 3) * c_dim1];
471 i6 = ll + lsec - 1;
472 for (l = ll; l <= i6; ++l)
474 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
475 257] * b[l + j * b_dim1];
476 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
477 257] * b[l + (j + 1) * b_dim1];
478 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
479 257] * b[l + (j + 2) * b_dim1];
480 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
481 257] * b[l + (j + 3) * b_dim1];
483 c[i + j * c_dim1] = f11;
484 c[i + (j + 1) * c_dim1] = f12;
485 c[i + (j + 2) * c_dim1] = f13;
486 c[i + (j + 3) * c_dim1] = f14;
490 if (ujsec < jsec)
492 i4 = jj + jsec - 1;
493 for (j = jj + ujsec; j <= i4; ++j)
495 i5 = ii + uisec - 1;
496 for (i = ii; i <= i5; i += 4)
498 f11 = c[i + j * c_dim1];
499 f21 = c[i + 1 + j * c_dim1];
500 f31 = c[i + 2 + j * c_dim1];
501 f41 = c[i + 3 + j * c_dim1];
502 i6 = ll + lsec - 1;
503 for (l = ll; l <= i6; ++l)
505 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
506 257] * b[l + j * b_dim1];
507 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
508 257] * b[l + j * b_dim1];
509 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
510 257] * b[l + j * b_dim1];
511 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
512 257] * b[l + j * b_dim1];
514 c[i + j * c_dim1] = f11;
515 c[i + 1 + j * c_dim1] = f21;
516 c[i + 2 + j * c_dim1] = f31;
517 c[i + 3 + j * c_dim1] = f41;
519 i5 = ii + isec - 1;
520 for (i = ii + uisec; i <= i5; ++i)
522 f11 = c[i + j * c_dim1];
523 i6 = ll + lsec - 1;
524 for (l = ll; l <= i6; ++l)
526 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
527 257] * b[l + j * b_dim1];
529 c[i + j * c_dim1] = f11;
536 return;
538 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
540 if (GFC_DESCRIPTOR_RANK (a) != 1)
542 const GFC_REAL_4 *restrict abase_x;
543 const GFC_REAL_4 *restrict bbase_y;
544 GFC_REAL_4 *restrict dest_y;
545 GFC_REAL_4 s;
547 for (y = 0; y < ycount; y++)
549 bbase_y = &bbase[y*bystride];
550 dest_y = &dest[y*rystride];
551 for (x = 0; x < xcount; x++)
553 abase_x = &abase[x*axstride];
554 s = (GFC_REAL_4) 0;
555 for (n = 0; n < count; n++)
556 s += abase_x[n] * bbase_y[n];
557 dest_y[x] = s;
561 else
563 const GFC_REAL_4 *restrict bbase_y;
564 GFC_REAL_4 s;
566 for (y = 0; y < ycount; y++)
568 bbase_y = &bbase[y*bystride];
569 s = (GFC_REAL_4) 0;
570 for (n = 0; n < count; n++)
571 s += abase[n*axstride] * bbase_y[n];
572 dest[y*rystride] = s;
576 else if (axstride < aystride)
578 for (y = 0; y < ycount; y++)
579 for (x = 0; x < xcount; x++)
580 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
582 for (y = 0; y < ycount; y++)
583 for (n = 0; n < count; n++)
584 for (x = 0; x < xcount; x++)
585 /* dest[x,y] += a[x,n] * b[n,y] */
586 dest[x*rxstride + y*rystride] +=
587 abase[x*axstride + n*aystride] *
588 bbase[n*bxstride + y*bystride];
590 else if (GFC_DESCRIPTOR_RANK (a) == 1)
592 const GFC_REAL_4 *restrict bbase_y;
593 GFC_REAL_4 s;
595 for (y = 0; y < ycount; y++)
597 bbase_y = &bbase[y*bystride];
598 s = (GFC_REAL_4) 0;
599 for (n = 0; n < count; n++)
600 s += abase[n*axstride] * bbase_y[n*bxstride];
601 dest[y*rxstride] = s;
604 else
606 const GFC_REAL_4 *restrict abase_x;
607 const GFC_REAL_4 *restrict bbase_y;
608 GFC_REAL_4 *restrict dest_y;
609 GFC_REAL_4 s;
611 for (y = 0; y < ycount; y++)
613 bbase_y = &bbase[y*bystride];
614 dest_y = &dest[y*rystride];
615 for (x = 0; x < xcount; x++)
617 abase_x = &abase[x*axstride];
618 s = (GFC_REAL_4) 0;
619 for (n = 0; n < count; n++)
620 s += abase_x[n*aystride] * bbase_y[n*bxstride];
621 dest_y[x*rxstride] = s;
626 #undef POW3
627 #undef min
628 #undef max
630 #endif /* HAVE_AVX */
632 #ifdef HAVE_AVX2
633 static void
634 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
635 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
636 int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
637 static void
638 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
639 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
640 int blas_limit, blas_call gemm)
642 const GFC_REAL_4 * restrict abase;
643 const GFC_REAL_4 * restrict bbase;
644 GFC_REAL_4 * restrict dest;
646 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
647 index_type x, y, n, count, xcount, ycount;
649 assert (GFC_DESCRIPTOR_RANK (a) == 2
650 || GFC_DESCRIPTOR_RANK (b) == 2);
652 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
654 Either A or B (but not both) can be rank 1:
656 o One-dimensional argument A is implicitly treated as a row matrix
657 dimensioned [1,count], so xcount=1.
659 o One-dimensional argument B is implicitly treated as a column matrix
660 dimensioned [count, 1], so ycount=1.
663 if (retarray->base_addr == NULL)
665 if (GFC_DESCRIPTOR_RANK (a) == 1)
667 GFC_DIMENSION_SET(retarray->dim[0], 0,
668 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
670 else if (GFC_DESCRIPTOR_RANK (b) == 1)
672 GFC_DIMENSION_SET(retarray->dim[0], 0,
673 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
675 else
677 GFC_DIMENSION_SET(retarray->dim[0], 0,
678 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
680 GFC_DIMENSION_SET(retarray->dim[1], 0,
681 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
682 GFC_DESCRIPTOR_EXTENT(retarray,0));
685 retarray->base_addr
686 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
687 retarray->offset = 0;
689 else if (unlikely (compile_options.bounds_check))
691 index_type ret_extent, arg_extent;
693 if (GFC_DESCRIPTOR_RANK (a) == 1)
695 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
696 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
697 if (arg_extent != ret_extent)
698 runtime_error ("Incorrect extent in return array in"
699 " MATMUL intrinsic: is %ld, should be %ld",
700 (long int) ret_extent, (long int) arg_extent);
702 else if (GFC_DESCRIPTOR_RANK (b) == 1)
704 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
705 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
706 if (arg_extent != ret_extent)
707 runtime_error ("Incorrect extent in return array in"
708 " MATMUL intrinsic: is %ld, should be %ld",
709 (long int) ret_extent, (long int) arg_extent);
711 else
713 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
714 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
715 if (arg_extent != ret_extent)
716 runtime_error ("Incorrect extent in return array in"
717 " MATMUL intrinsic for dimension 1:"
718 " is %ld, should be %ld",
719 (long int) ret_extent, (long int) arg_extent);
721 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
722 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
723 if (arg_extent != ret_extent)
724 runtime_error ("Incorrect extent in return array in"
725 " MATMUL intrinsic for dimension 2:"
726 " is %ld, should be %ld",
727 (long int) ret_extent, (long int) arg_extent);
732 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
734 /* One-dimensional result may be addressed in the code below
735 either as a row or a column matrix. We want both cases to
736 work. */
737 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
739 else
741 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
742 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
746 if (GFC_DESCRIPTOR_RANK (a) == 1)
748 /* Treat it as a a row matrix A[1,count]. */
749 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
750 aystride = 1;
752 xcount = 1;
753 count = GFC_DESCRIPTOR_EXTENT(a,0);
755 else
757 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
758 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
760 count = GFC_DESCRIPTOR_EXTENT(a,1);
761 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
764 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
766 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
767 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
770 if (GFC_DESCRIPTOR_RANK (b) == 1)
772 /* Treat it as a column matrix B[count,1] */
773 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
775 /* bystride should never be used for 1-dimensional b.
776 in case it is we want it to cause a segfault, rather than
777 an incorrect result. */
778 bystride = 0xDEADBEEF;
779 ycount = 1;
781 else
783 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
784 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
785 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
788 abase = a->base_addr;
789 bbase = b->base_addr;
790 dest = retarray->base_addr;
792 /* Now that everything is set up, we perform the multiplication
793 itself. */
795 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
796 #define min(a,b) ((a) <= (b) ? (a) : (b))
797 #define max(a,b) ((a) >= (b) ? (a) : (b))
799 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
800 && (bxstride == 1 || bystride == 1)
801 && (((float) xcount) * ((float) ycount) * ((float) count)
802 > POW3(blas_limit)))
804 const int m = xcount, n = ycount, k = count, ldc = rystride;
805 const GFC_REAL_4 one = 1, zero = 0;
806 const int lda = (axstride == 1) ? aystride : axstride,
807 ldb = (bxstride == 1) ? bystride : bxstride;
809 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
811 assert (gemm != NULL);
812 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
813 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
814 &ldc, 1, 1);
815 return;
819 if (rxstride == 1 && axstride == 1 && bxstride == 1)
821 /* This block of code implements a tuned matmul, derived from
822 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
824 Bo Kagstrom and Per Ling
825 Department of Computing Science
826 Umea University
827 S-901 87 Umea, Sweden
829 from netlib.org, translated to C, and modified for matmul.m4. */
831 const GFC_REAL_4 *a, *b;
832 GFC_REAL_4 *c;
833 const index_type m = xcount, n = ycount, k = count;
835 /* System generated locals */
836 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
837 i1, i2, i3, i4, i5, i6;
839 /* Local variables */
840 GFC_REAL_4 t1[65536], /* was [256][256] */
841 f11, f12, f21, f22, f31, f32, f41, f42,
842 f13, f14, f23, f24, f33, f34, f43, f44;
843 index_type i, j, l, ii, jj, ll;
844 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
846 a = abase;
847 b = bbase;
848 c = retarray->base_addr;
850 /* Parameter adjustments */
851 c_dim1 = rystride;
852 c_offset = 1 + c_dim1;
853 c -= c_offset;
854 a_dim1 = aystride;
855 a_offset = 1 + a_dim1;
856 a -= a_offset;
857 b_dim1 = bystride;
858 b_offset = 1 + b_dim1;
859 b -= b_offset;
861 /* Early exit if possible */
862 if (m == 0 || n == 0 || k == 0)
863 return;
865 /* Empty c first. */
866 for (j=1; j<=n; j++)
867 for (i=1; i<=m; i++)
868 c[i + j * c_dim1] = (GFC_REAL_4)0;
870 /* Start turning the crank. */
871 i1 = n;
872 for (jj = 1; jj <= i1; jj += 512)
874 /* Computing MIN */
875 i2 = 512;
876 i3 = n - jj + 1;
877 jsec = min(i2,i3);
878 ujsec = jsec - jsec % 4;
879 i2 = k;
880 for (ll = 1; ll <= i2; ll += 256)
882 /* Computing MIN */
883 i3 = 256;
884 i4 = k - ll + 1;
885 lsec = min(i3,i4);
886 ulsec = lsec - lsec % 2;
888 i3 = m;
889 for (ii = 1; ii <= i3; ii += 256)
891 /* Computing MIN */
892 i4 = 256;
893 i5 = m - ii + 1;
894 isec = min(i4,i5);
895 uisec = isec - isec % 2;
896 i4 = ll + ulsec - 1;
897 for (l = ll; l <= i4; l += 2)
899 i5 = ii + uisec - 1;
900 for (i = ii; i <= i5; i += 2)
902 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
903 a[i + l * a_dim1];
904 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
905 a[i + (l + 1) * a_dim1];
906 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
907 a[i + 1 + l * a_dim1];
908 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
909 a[i + 1 + (l + 1) * a_dim1];
911 if (uisec < isec)
913 t1[l - ll + 1 + (isec << 8) - 257] =
914 a[ii + isec - 1 + l * a_dim1];
915 t1[l - ll + 2 + (isec << 8) - 257] =
916 a[ii + isec - 1 + (l + 1) * a_dim1];
919 if (ulsec < lsec)
921 i4 = ii + isec - 1;
922 for (i = ii; i<= i4; ++i)
924 t1[lsec + ((i - ii + 1) << 8) - 257] =
925 a[i + (ll + lsec - 1) * a_dim1];
929 uisec = isec - isec % 4;
930 i4 = jj + ujsec - 1;
931 for (j = jj; j <= i4; j += 4)
933 i5 = ii + uisec - 1;
934 for (i = ii; i <= i5; i += 4)
936 f11 = c[i + j * c_dim1];
937 f21 = c[i + 1 + j * c_dim1];
938 f12 = c[i + (j + 1) * c_dim1];
939 f22 = c[i + 1 + (j + 1) * c_dim1];
940 f13 = c[i + (j + 2) * c_dim1];
941 f23 = c[i + 1 + (j + 2) * c_dim1];
942 f14 = c[i + (j + 3) * c_dim1];
943 f24 = c[i + 1 + (j + 3) * c_dim1];
944 f31 = c[i + 2 + j * c_dim1];
945 f41 = c[i + 3 + j * c_dim1];
946 f32 = c[i + 2 + (j + 1) * c_dim1];
947 f42 = c[i + 3 + (j + 1) * c_dim1];
948 f33 = c[i + 2 + (j + 2) * c_dim1];
949 f43 = c[i + 3 + (j + 2) * c_dim1];
950 f34 = c[i + 2 + (j + 3) * c_dim1];
951 f44 = c[i + 3 + (j + 3) * c_dim1];
952 i6 = ll + lsec - 1;
953 for (l = ll; l <= i6; ++l)
955 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
956 * b[l + j * b_dim1];
957 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
958 * b[l + j * b_dim1];
959 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
960 * b[l + (j + 1) * b_dim1];
961 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
962 * b[l + (j + 1) * b_dim1];
963 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
964 * b[l + (j + 2) * b_dim1];
965 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
966 * b[l + (j + 2) * b_dim1];
967 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
968 * b[l + (j + 3) * b_dim1];
969 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
970 * b[l + (j + 3) * b_dim1];
971 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
972 * b[l + j * b_dim1];
973 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
974 * b[l + j * b_dim1];
975 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
976 * b[l + (j + 1) * b_dim1];
977 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
978 * b[l + (j + 1) * b_dim1];
979 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
980 * b[l + (j + 2) * b_dim1];
981 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
982 * b[l + (j + 2) * b_dim1];
983 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
984 * b[l + (j + 3) * b_dim1];
985 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
986 * b[l + (j + 3) * b_dim1];
988 c[i + j * c_dim1] = f11;
989 c[i + 1 + j * c_dim1] = f21;
990 c[i + (j + 1) * c_dim1] = f12;
991 c[i + 1 + (j + 1) * c_dim1] = f22;
992 c[i + (j + 2) * c_dim1] = f13;
993 c[i + 1 + (j + 2) * c_dim1] = f23;
994 c[i + (j + 3) * c_dim1] = f14;
995 c[i + 1 + (j + 3) * c_dim1] = f24;
996 c[i + 2 + j * c_dim1] = f31;
997 c[i + 3 + j * c_dim1] = f41;
998 c[i + 2 + (j + 1) * c_dim1] = f32;
999 c[i + 3 + (j + 1) * c_dim1] = f42;
1000 c[i + 2 + (j + 2) * c_dim1] = f33;
1001 c[i + 3 + (j + 2) * c_dim1] = f43;
1002 c[i + 2 + (j + 3) * c_dim1] = f34;
1003 c[i + 3 + (j + 3) * c_dim1] = f44;
1005 if (uisec < isec)
1007 i5 = ii + isec - 1;
1008 for (i = ii + uisec; i <= i5; ++i)
1010 f11 = c[i + j * c_dim1];
1011 f12 = c[i + (j + 1) * c_dim1];
1012 f13 = c[i + (j + 2) * c_dim1];
1013 f14 = c[i + (j + 3) * c_dim1];
1014 i6 = ll + lsec - 1;
1015 for (l = ll; l <= i6; ++l)
1017 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1018 257] * b[l + j * b_dim1];
1019 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1020 257] * b[l + (j + 1) * b_dim1];
1021 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1022 257] * b[l + (j + 2) * b_dim1];
1023 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1024 257] * b[l + (j + 3) * b_dim1];
1026 c[i + j * c_dim1] = f11;
1027 c[i + (j + 1) * c_dim1] = f12;
1028 c[i + (j + 2) * c_dim1] = f13;
1029 c[i + (j + 3) * c_dim1] = f14;
1033 if (ujsec < jsec)
1035 i4 = jj + jsec - 1;
1036 for (j = jj + ujsec; j <= i4; ++j)
1038 i5 = ii + uisec - 1;
1039 for (i = ii; i <= i5; i += 4)
1041 f11 = c[i + j * c_dim1];
1042 f21 = c[i + 1 + j * c_dim1];
1043 f31 = c[i + 2 + j * c_dim1];
1044 f41 = c[i + 3 + j * c_dim1];
1045 i6 = ll + lsec - 1;
1046 for (l = ll; l <= i6; ++l)
1048 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1049 257] * b[l + j * b_dim1];
1050 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1051 257] * b[l + j * b_dim1];
1052 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1053 257] * b[l + j * b_dim1];
1054 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1055 257] * b[l + j * b_dim1];
1057 c[i + j * c_dim1] = f11;
1058 c[i + 1 + j * c_dim1] = f21;
1059 c[i + 2 + j * c_dim1] = f31;
1060 c[i + 3 + j * c_dim1] = f41;
1062 i5 = ii + isec - 1;
1063 for (i = ii + uisec; i <= i5; ++i)
1065 f11 = c[i + j * c_dim1];
1066 i6 = ll + lsec - 1;
1067 for (l = ll; l <= i6; ++l)
1069 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1070 257] * b[l + j * b_dim1];
1072 c[i + j * c_dim1] = f11;
1079 return;
1081 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1083 if (GFC_DESCRIPTOR_RANK (a) != 1)
1085 const GFC_REAL_4 *restrict abase_x;
1086 const GFC_REAL_4 *restrict bbase_y;
1087 GFC_REAL_4 *restrict dest_y;
1088 GFC_REAL_4 s;
1090 for (y = 0; y < ycount; y++)
1092 bbase_y = &bbase[y*bystride];
1093 dest_y = &dest[y*rystride];
1094 for (x = 0; x < xcount; x++)
1096 abase_x = &abase[x*axstride];
1097 s = (GFC_REAL_4) 0;
1098 for (n = 0; n < count; n++)
1099 s += abase_x[n] * bbase_y[n];
1100 dest_y[x] = s;
1104 else
1106 const GFC_REAL_4 *restrict bbase_y;
1107 GFC_REAL_4 s;
1109 for (y = 0; y < ycount; y++)
1111 bbase_y = &bbase[y*bystride];
1112 s = (GFC_REAL_4) 0;
1113 for (n = 0; n < count; n++)
1114 s += abase[n*axstride] * bbase_y[n];
1115 dest[y*rystride] = s;
1119 else if (axstride < aystride)
1121 for (y = 0; y < ycount; y++)
1122 for (x = 0; x < xcount; x++)
1123 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1125 for (y = 0; y < ycount; y++)
1126 for (n = 0; n < count; n++)
1127 for (x = 0; x < xcount; x++)
1128 /* dest[x,y] += a[x,n] * b[n,y] */
1129 dest[x*rxstride + y*rystride] +=
1130 abase[x*axstride + n*aystride] *
1131 bbase[n*bxstride + y*bystride];
1133 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1135 const GFC_REAL_4 *restrict bbase_y;
1136 GFC_REAL_4 s;
1138 for (y = 0; y < ycount; y++)
1140 bbase_y = &bbase[y*bystride];
1141 s = (GFC_REAL_4) 0;
1142 for (n = 0; n < count; n++)
1143 s += abase[n*axstride] * bbase_y[n*bxstride];
1144 dest[y*rxstride] = s;
1147 else
1149 const GFC_REAL_4 *restrict abase_x;
1150 const GFC_REAL_4 *restrict bbase_y;
1151 GFC_REAL_4 *restrict dest_y;
1152 GFC_REAL_4 s;
1154 for (y = 0; y < ycount; y++)
1156 bbase_y = &bbase[y*bystride];
1157 dest_y = &dest[y*rystride];
1158 for (x = 0; x < xcount; x++)
1160 abase_x = &abase[x*axstride];
1161 s = (GFC_REAL_4) 0;
1162 for (n = 0; n < count; n++)
1163 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1164 dest_y[x*rxstride] = s;
1169 #undef POW3
1170 #undef min
1171 #undef max
1173 #endif /* HAVE_AVX2 */
1175 #ifdef HAVE_AVX512F
1176 static void
1177 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1178 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1179 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1180 static void
1181 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1182 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1183 int blas_limit, blas_call gemm)
1185 const GFC_REAL_4 * restrict abase;
1186 const GFC_REAL_4 * restrict bbase;
1187 GFC_REAL_4 * restrict dest;
1189 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1190 index_type x, y, n, count, xcount, ycount;
1192 assert (GFC_DESCRIPTOR_RANK (a) == 2
1193 || GFC_DESCRIPTOR_RANK (b) == 2);
1195 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1197 Either A or B (but not both) can be rank 1:
1199 o One-dimensional argument A is implicitly treated as a row matrix
1200 dimensioned [1,count], so xcount=1.
1202 o One-dimensional argument B is implicitly treated as a column matrix
1203 dimensioned [count, 1], so ycount=1.
1206 if (retarray->base_addr == NULL)
1208 if (GFC_DESCRIPTOR_RANK (a) == 1)
1210 GFC_DIMENSION_SET(retarray->dim[0], 0,
1211 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1213 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1215 GFC_DIMENSION_SET(retarray->dim[0], 0,
1216 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1218 else
1220 GFC_DIMENSION_SET(retarray->dim[0], 0,
1221 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1223 GFC_DIMENSION_SET(retarray->dim[1], 0,
1224 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1225 GFC_DESCRIPTOR_EXTENT(retarray,0));
1228 retarray->base_addr
1229 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1230 retarray->offset = 0;
1232 else if (unlikely (compile_options.bounds_check))
1234 index_type ret_extent, arg_extent;
1236 if (GFC_DESCRIPTOR_RANK (a) == 1)
1238 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1239 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1240 if (arg_extent != ret_extent)
1241 runtime_error ("Incorrect extent in return array in"
1242 " MATMUL intrinsic: is %ld, should be %ld",
1243 (long int) ret_extent, (long int) arg_extent);
1245 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1247 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1248 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1249 if (arg_extent != ret_extent)
1250 runtime_error ("Incorrect extent in return array in"
1251 " MATMUL intrinsic: is %ld, should be %ld",
1252 (long int) ret_extent, (long int) arg_extent);
1254 else
1256 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1257 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1258 if (arg_extent != ret_extent)
1259 runtime_error ("Incorrect extent in return array in"
1260 " MATMUL intrinsic for dimension 1:"
1261 " is %ld, should be %ld",
1262 (long int) ret_extent, (long int) arg_extent);
1264 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1265 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1266 if (arg_extent != ret_extent)
1267 runtime_error ("Incorrect extent in return array in"
1268 " MATMUL intrinsic for dimension 2:"
1269 " is %ld, should be %ld",
1270 (long int) ret_extent, (long int) arg_extent);
1275 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1277 /* One-dimensional result may be addressed in the code below
1278 either as a row or a column matrix. We want both cases to
1279 work. */
1280 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1282 else
1284 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1285 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1289 if (GFC_DESCRIPTOR_RANK (a) == 1)
1291 /* Treat it as a a row matrix A[1,count]. */
1292 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1293 aystride = 1;
1295 xcount = 1;
1296 count = GFC_DESCRIPTOR_EXTENT(a,0);
1298 else
1300 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1301 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1303 count = GFC_DESCRIPTOR_EXTENT(a,1);
1304 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1307 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1309 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1310 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1313 if (GFC_DESCRIPTOR_RANK (b) == 1)
1315 /* Treat it as a column matrix B[count,1] */
1316 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1318 /* bystride should never be used for 1-dimensional b.
1319 in case it is we want it to cause a segfault, rather than
1320 an incorrect result. */
1321 bystride = 0xDEADBEEF;
1322 ycount = 1;
1324 else
1326 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1327 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1328 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1331 abase = a->base_addr;
1332 bbase = b->base_addr;
1333 dest = retarray->base_addr;
1335 /* Now that everything is set up, we perform the multiplication
1336 itself. */
1338 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1339 #define min(a,b) ((a) <= (b) ? (a) : (b))
1340 #define max(a,b) ((a) >= (b) ? (a) : (b))
1342 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1343 && (bxstride == 1 || bystride == 1)
1344 && (((float) xcount) * ((float) ycount) * ((float) count)
1345 > POW3(blas_limit)))
1347 const int m = xcount, n = ycount, k = count, ldc = rystride;
1348 const GFC_REAL_4 one = 1, zero = 0;
1349 const int lda = (axstride == 1) ? aystride : axstride,
1350 ldb = (bxstride == 1) ? bystride : bxstride;
1352 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1354 assert (gemm != NULL);
1355 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1356 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1357 &ldc, 1, 1);
1358 return;
1362 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1364 /* This block of code implements a tuned matmul, derived from
1365 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1367 Bo Kagstrom and Per Ling
1368 Department of Computing Science
1369 Umea University
1370 S-901 87 Umea, Sweden
1372 from netlib.org, translated to C, and modified for matmul.m4. */
1374 const GFC_REAL_4 *a, *b;
1375 GFC_REAL_4 *c;
1376 const index_type m = xcount, n = ycount, k = count;
1378 /* System generated locals */
1379 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1380 i1, i2, i3, i4, i5, i6;
1382 /* Local variables */
1383 GFC_REAL_4 t1[65536], /* was [256][256] */
1384 f11, f12, f21, f22, f31, f32, f41, f42,
1385 f13, f14, f23, f24, f33, f34, f43, f44;
1386 index_type i, j, l, ii, jj, ll;
1387 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1389 a = abase;
1390 b = bbase;
1391 c = retarray->base_addr;
1393 /* Parameter adjustments */
1394 c_dim1 = rystride;
1395 c_offset = 1 + c_dim1;
1396 c -= c_offset;
1397 a_dim1 = aystride;
1398 a_offset = 1 + a_dim1;
1399 a -= a_offset;
1400 b_dim1 = bystride;
1401 b_offset = 1 + b_dim1;
1402 b -= b_offset;
1404 /* Early exit if possible */
1405 if (m == 0 || n == 0 || k == 0)
1406 return;
1408 /* Empty c first. */
1409 for (j=1; j<=n; j++)
1410 for (i=1; i<=m; i++)
1411 c[i + j * c_dim1] = (GFC_REAL_4)0;
1413 /* Start turning the crank. */
1414 i1 = n;
1415 for (jj = 1; jj <= i1; jj += 512)
1417 /* Computing MIN */
1418 i2 = 512;
1419 i3 = n - jj + 1;
1420 jsec = min(i2,i3);
1421 ujsec = jsec - jsec % 4;
1422 i2 = k;
1423 for (ll = 1; ll <= i2; ll += 256)
1425 /* Computing MIN */
1426 i3 = 256;
1427 i4 = k - ll + 1;
1428 lsec = min(i3,i4);
1429 ulsec = lsec - lsec % 2;
1431 i3 = m;
1432 for (ii = 1; ii <= i3; ii += 256)
1434 /* Computing MIN */
1435 i4 = 256;
1436 i5 = m - ii + 1;
1437 isec = min(i4,i5);
1438 uisec = isec - isec % 2;
1439 i4 = ll + ulsec - 1;
1440 for (l = ll; l <= i4; l += 2)
1442 i5 = ii + uisec - 1;
1443 for (i = ii; i <= i5; i += 2)
1445 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1446 a[i + l * a_dim1];
1447 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1448 a[i + (l + 1) * a_dim1];
1449 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1450 a[i + 1 + l * a_dim1];
1451 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1452 a[i + 1 + (l + 1) * a_dim1];
1454 if (uisec < isec)
1456 t1[l - ll + 1 + (isec << 8) - 257] =
1457 a[ii + isec - 1 + l * a_dim1];
1458 t1[l - ll + 2 + (isec << 8) - 257] =
1459 a[ii + isec - 1 + (l + 1) * a_dim1];
1462 if (ulsec < lsec)
1464 i4 = ii + isec - 1;
1465 for (i = ii; i<= i4; ++i)
1467 t1[lsec + ((i - ii + 1) << 8) - 257] =
1468 a[i + (ll + lsec - 1) * a_dim1];
1472 uisec = isec - isec % 4;
1473 i4 = jj + ujsec - 1;
1474 for (j = jj; j <= i4; j += 4)
1476 i5 = ii + uisec - 1;
1477 for (i = ii; i <= i5; i += 4)
1479 f11 = c[i + j * c_dim1];
1480 f21 = c[i + 1 + j * c_dim1];
1481 f12 = c[i + (j + 1) * c_dim1];
1482 f22 = c[i + 1 + (j + 1) * c_dim1];
1483 f13 = c[i + (j + 2) * c_dim1];
1484 f23 = c[i + 1 + (j + 2) * c_dim1];
1485 f14 = c[i + (j + 3) * c_dim1];
1486 f24 = c[i + 1 + (j + 3) * c_dim1];
1487 f31 = c[i + 2 + j * c_dim1];
1488 f41 = c[i + 3 + j * c_dim1];
1489 f32 = c[i + 2 + (j + 1) * c_dim1];
1490 f42 = c[i + 3 + (j + 1) * c_dim1];
1491 f33 = c[i + 2 + (j + 2) * c_dim1];
1492 f43 = c[i + 3 + (j + 2) * c_dim1];
1493 f34 = c[i + 2 + (j + 3) * c_dim1];
1494 f44 = c[i + 3 + (j + 3) * c_dim1];
1495 i6 = ll + lsec - 1;
1496 for (l = ll; l <= i6; ++l)
1498 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1499 * b[l + j * b_dim1];
1500 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1501 * b[l + j * b_dim1];
1502 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1503 * b[l + (j + 1) * b_dim1];
1504 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1505 * b[l + (j + 1) * b_dim1];
1506 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1507 * b[l + (j + 2) * b_dim1];
1508 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1509 * b[l + (j + 2) * b_dim1];
1510 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1511 * b[l + (j + 3) * b_dim1];
1512 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1513 * b[l + (j + 3) * b_dim1];
1514 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1515 * b[l + j * b_dim1];
1516 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1517 * b[l + j * b_dim1];
1518 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1519 * b[l + (j + 1) * b_dim1];
1520 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1521 * b[l + (j + 1) * b_dim1];
1522 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1523 * b[l + (j + 2) * b_dim1];
1524 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1525 * b[l + (j + 2) * b_dim1];
1526 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1527 * b[l + (j + 3) * b_dim1];
1528 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1529 * b[l + (j + 3) * b_dim1];
1531 c[i + j * c_dim1] = f11;
1532 c[i + 1 + j * c_dim1] = f21;
1533 c[i + (j + 1) * c_dim1] = f12;
1534 c[i + 1 + (j + 1) * c_dim1] = f22;
1535 c[i + (j + 2) * c_dim1] = f13;
1536 c[i + 1 + (j + 2) * c_dim1] = f23;
1537 c[i + (j + 3) * c_dim1] = f14;
1538 c[i + 1 + (j + 3) * c_dim1] = f24;
1539 c[i + 2 + j * c_dim1] = f31;
1540 c[i + 3 + j * c_dim1] = f41;
1541 c[i + 2 + (j + 1) * c_dim1] = f32;
1542 c[i + 3 + (j + 1) * c_dim1] = f42;
1543 c[i + 2 + (j + 2) * c_dim1] = f33;
1544 c[i + 3 + (j + 2) * c_dim1] = f43;
1545 c[i + 2 + (j + 3) * c_dim1] = f34;
1546 c[i + 3 + (j + 3) * c_dim1] = f44;
1548 if (uisec < isec)
1550 i5 = ii + isec - 1;
1551 for (i = ii + uisec; i <= i5; ++i)
1553 f11 = c[i + j * c_dim1];
1554 f12 = c[i + (j + 1) * c_dim1];
1555 f13 = c[i + (j + 2) * c_dim1];
1556 f14 = c[i + (j + 3) * c_dim1];
1557 i6 = ll + lsec - 1;
1558 for (l = ll; l <= i6; ++l)
1560 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1561 257] * b[l + j * b_dim1];
1562 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1563 257] * b[l + (j + 1) * b_dim1];
1564 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1565 257] * b[l + (j + 2) * b_dim1];
1566 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1567 257] * b[l + (j + 3) * b_dim1];
1569 c[i + j * c_dim1] = f11;
1570 c[i + (j + 1) * c_dim1] = f12;
1571 c[i + (j + 2) * c_dim1] = f13;
1572 c[i + (j + 3) * c_dim1] = f14;
1576 if (ujsec < jsec)
1578 i4 = jj + jsec - 1;
1579 for (j = jj + ujsec; j <= i4; ++j)
1581 i5 = ii + uisec - 1;
1582 for (i = ii; i <= i5; i += 4)
1584 f11 = c[i + j * c_dim1];
1585 f21 = c[i + 1 + j * c_dim1];
1586 f31 = c[i + 2 + j * c_dim1];
1587 f41 = c[i + 3 + j * c_dim1];
1588 i6 = ll + lsec - 1;
1589 for (l = ll; l <= i6; ++l)
1591 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1592 257] * b[l + j * b_dim1];
1593 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1594 257] * b[l + j * b_dim1];
1595 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1596 257] * b[l + j * b_dim1];
1597 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1598 257] * b[l + j * b_dim1];
1600 c[i + j * c_dim1] = f11;
1601 c[i + 1 + j * c_dim1] = f21;
1602 c[i + 2 + j * c_dim1] = f31;
1603 c[i + 3 + j * c_dim1] = f41;
1605 i5 = ii + isec - 1;
1606 for (i = ii + uisec; i <= i5; ++i)
1608 f11 = c[i + j * c_dim1];
1609 i6 = ll + lsec - 1;
1610 for (l = ll; l <= i6; ++l)
1612 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1613 257] * b[l + j * b_dim1];
1615 c[i + j * c_dim1] = f11;
1622 return;
1624 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1626 if (GFC_DESCRIPTOR_RANK (a) != 1)
1628 const GFC_REAL_4 *restrict abase_x;
1629 const GFC_REAL_4 *restrict bbase_y;
1630 GFC_REAL_4 *restrict dest_y;
1631 GFC_REAL_4 s;
1633 for (y = 0; y < ycount; y++)
1635 bbase_y = &bbase[y*bystride];
1636 dest_y = &dest[y*rystride];
1637 for (x = 0; x < xcount; x++)
1639 abase_x = &abase[x*axstride];
1640 s = (GFC_REAL_4) 0;
1641 for (n = 0; n < count; n++)
1642 s += abase_x[n] * bbase_y[n];
1643 dest_y[x] = s;
1647 else
1649 const GFC_REAL_4 *restrict bbase_y;
1650 GFC_REAL_4 s;
1652 for (y = 0; y < ycount; y++)
1654 bbase_y = &bbase[y*bystride];
1655 s = (GFC_REAL_4) 0;
1656 for (n = 0; n < count; n++)
1657 s += abase[n*axstride] * bbase_y[n];
1658 dest[y*rystride] = s;
1662 else if (axstride < aystride)
1664 for (y = 0; y < ycount; y++)
1665 for (x = 0; x < xcount; x++)
1666 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1668 for (y = 0; y < ycount; y++)
1669 for (n = 0; n < count; n++)
1670 for (x = 0; x < xcount; x++)
1671 /* dest[x,y] += a[x,n] * b[n,y] */
1672 dest[x*rxstride + y*rystride] +=
1673 abase[x*axstride + n*aystride] *
1674 bbase[n*bxstride + y*bystride];
1676 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1678 const GFC_REAL_4 *restrict bbase_y;
1679 GFC_REAL_4 s;
1681 for (y = 0; y < ycount; y++)
1683 bbase_y = &bbase[y*bystride];
1684 s = (GFC_REAL_4) 0;
1685 for (n = 0; n < count; n++)
1686 s += abase[n*axstride] * bbase_y[n*bxstride];
1687 dest[y*rxstride] = s;
1690 else
1692 const GFC_REAL_4 *restrict abase_x;
1693 const GFC_REAL_4 *restrict bbase_y;
1694 GFC_REAL_4 *restrict dest_y;
1695 GFC_REAL_4 s;
1697 for (y = 0; y < ycount; y++)
1699 bbase_y = &bbase[y*bystride];
1700 dest_y = &dest[y*rystride];
1701 for (x = 0; x < xcount; x++)
1703 abase_x = &abase[x*axstride];
1704 s = (GFC_REAL_4) 0;
1705 for (n = 0; n < count; n++)
1706 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1707 dest_y[x*rxstride] = s;
1712 #undef POW3
1713 #undef min
1714 #undef max
1716 #endif /* HAVE_AVX512F */
1718 /* Function to fall back to if there is no special processor-specific version. */
1719 static void
1720 matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
1721 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1722 int blas_limit, blas_call gemm)
1724 const GFC_REAL_4 * restrict abase;
1725 const GFC_REAL_4 * restrict bbase;
1726 GFC_REAL_4 * restrict dest;
1728 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1729 index_type x, y, n, count, xcount, ycount;
1731 assert (GFC_DESCRIPTOR_RANK (a) == 2
1732 || GFC_DESCRIPTOR_RANK (b) == 2);
1734 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1736 Either A or B (but not both) can be rank 1:
1738 o One-dimensional argument A is implicitly treated as a row matrix
1739 dimensioned [1,count], so xcount=1.
1741 o One-dimensional argument B is implicitly treated as a column matrix
1742 dimensioned [count, 1], so ycount=1.
1745 if (retarray->base_addr == NULL)
1747 if (GFC_DESCRIPTOR_RANK (a) == 1)
1749 GFC_DIMENSION_SET(retarray->dim[0], 0,
1750 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1752 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1754 GFC_DIMENSION_SET(retarray->dim[0], 0,
1755 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1757 else
1759 GFC_DIMENSION_SET(retarray->dim[0], 0,
1760 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1762 GFC_DIMENSION_SET(retarray->dim[1], 0,
1763 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1764 GFC_DESCRIPTOR_EXTENT(retarray,0));
1767 retarray->base_addr
1768 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1769 retarray->offset = 0;
1771 else if (unlikely (compile_options.bounds_check))
1773 index_type ret_extent, arg_extent;
1775 if (GFC_DESCRIPTOR_RANK (a) == 1)
1777 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1778 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1779 if (arg_extent != ret_extent)
1780 runtime_error ("Incorrect extent in return array in"
1781 " MATMUL intrinsic: is %ld, should be %ld",
1782 (long int) ret_extent, (long int) arg_extent);
1784 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1786 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1787 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1788 if (arg_extent != ret_extent)
1789 runtime_error ("Incorrect extent in return array in"
1790 " MATMUL intrinsic: is %ld, should be %ld",
1791 (long int) ret_extent, (long int) arg_extent);
1793 else
1795 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1796 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1797 if (arg_extent != ret_extent)
1798 runtime_error ("Incorrect extent in return array in"
1799 " MATMUL intrinsic for dimension 1:"
1800 " is %ld, should be %ld",
1801 (long int) ret_extent, (long int) arg_extent);
1803 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1804 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1805 if (arg_extent != ret_extent)
1806 runtime_error ("Incorrect extent in return array in"
1807 " MATMUL intrinsic for dimension 2:"
1808 " is %ld, should be %ld",
1809 (long int) ret_extent, (long int) arg_extent);
1814 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1816 /* One-dimensional result may be addressed in the code below
1817 either as a row or a column matrix. We want both cases to
1818 work. */
1819 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1821 else
1823 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1824 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1828 if (GFC_DESCRIPTOR_RANK (a) == 1)
1830 /* Treat it as a a row matrix A[1,count]. */
1831 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1832 aystride = 1;
1834 xcount = 1;
1835 count = GFC_DESCRIPTOR_EXTENT(a,0);
1837 else
1839 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1840 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1842 count = GFC_DESCRIPTOR_EXTENT(a,1);
1843 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1846 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1848 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1849 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1852 if (GFC_DESCRIPTOR_RANK (b) == 1)
1854 /* Treat it as a column matrix B[count,1] */
1855 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1857 /* bystride should never be used for 1-dimensional b.
1858 in case it is we want it to cause a segfault, rather than
1859 an incorrect result. */
1860 bystride = 0xDEADBEEF;
1861 ycount = 1;
1863 else
1865 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1866 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1867 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1870 abase = a->base_addr;
1871 bbase = b->base_addr;
1872 dest = retarray->base_addr;
1874 /* Now that everything is set up, we perform the multiplication
1875 itself. */
1877 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1878 #define min(a,b) ((a) <= (b) ? (a) : (b))
1879 #define max(a,b) ((a) >= (b) ? (a) : (b))
1881 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1882 && (bxstride == 1 || bystride == 1)
1883 && (((float) xcount) * ((float) ycount) * ((float) count)
1884 > POW3(blas_limit)))
1886 const int m = xcount, n = ycount, k = count, ldc = rystride;
1887 const GFC_REAL_4 one = 1, zero = 0;
1888 const int lda = (axstride == 1) ? aystride : axstride,
1889 ldb = (bxstride == 1) ? bystride : bxstride;
1891 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1893 assert (gemm != NULL);
1894 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1895 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1896 &ldc, 1, 1);
1897 return;
1901 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1903 /* This block of code implements a tuned matmul, derived from
1904 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1906 Bo Kagstrom and Per Ling
1907 Department of Computing Science
1908 Umea University
1909 S-901 87 Umea, Sweden
1911 from netlib.org, translated to C, and modified for matmul.m4. */
1913 const GFC_REAL_4 *a, *b;
1914 GFC_REAL_4 *c;
1915 const index_type m = xcount, n = ycount, k = count;
1917 /* System generated locals */
1918 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1919 i1, i2, i3, i4, i5, i6;
1921 /* Local variables */
1922 GFC_REAL_4 t1[65536], /* was [256][256] */
1923 f11, f12, f21, f22, f31, f32, f41, f42,
1924 f13, f14, f23, f24, f33, f34, f43, f44;
1925 index_type i, j, l, ii, jj, ll;
1926 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1928 a = abase;
1929 b = bbase;
1930 c = retarray->base_addr;
1932 /* Parameter adjustments */
1933 c_dim1 = rystride;
1934 c_offset = 1 + c_dim1;
1935 c -= c_offset;
1936 a_dim1 = aystride;
1937 a_offset = 1 + a_dim1;
1938 a -= a_offset;
1939 b_dim1 = bystride;
1940 b_offset = 1 + b_dim1;
1941 b -= b_offset;
1943 /* Early exit if possible */
1944 if (m == 0 || n == 0 || k == 0)
1945 return;
1947 /* Empty c first. */
1948 for (j=1; j<=n; j++)
1949 for (i=1; i<=m; i++)
1950 c[i + j * c_dim1] = (GFC_REAL_4)0;
1952 /* Start turning the crank. */
1953 i1 = n;
1954 for (jj = 1; jj <= i1; jj += 512)
1956 /* Computing MIN */
1957 i2 = 512;
1958 i3 = n - jj + 1;
1959 jsec = min(i2,i3);
1960 ujsec = jsec - jsec % 4;
1961 i2 = k;
1962 for (ll = 1; ll <= i2; ll += 256)
1964 /* Computing MIN */
1965 i3 = 256;
1966 i4 = k - ll + 1;
1967 lsec = min(i3,i4);
1968 ulsec = lsec - lsec % 2;
1970 i3 = m;
1971 for (ii = 1; ii <= i3; ii += 256)
1973 /* Computing MIN */
1974 i4 = 256;
1975 i5 = m - ii + 1;
1976 isec = min(i4,i5);
1977 uisec = isec - isec % 2;
1978 i4 = ll + ulsec - 1;
1979 for (l = ll; l <= i4; l += 2)
1981 i5 = ii + uisec - 1;
1982 for (i = ii; i <= i5; i += 2)
1984 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1985 a[i + l * a_dim1];
1986 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1987 a[i + (l + 1) * a_dim1];
1988 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1989 a[i + 1 + l * a_dim1];
1990 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1991 a[i + 1 + (l + 1) * a_dim1];
1993 if (uisec < isec)
1995 t1[l - ll + 1 + (isec << 8) - 257] =
1996 a[ii + isec - 1 + l * a_dim1];
1997 t1[l - ll + 2 + (isec << 8) - 257] =
1998 a[ii + isec - 1 + (l + 1) * a_dim1];
2001 if (ulsec < lsec)
2003 i4 = ii + isec - 1;
2004 for (i = ii; i<= i4; ++i)
2006 t1[lsec + ((i - ii + 1) << 8) - 257] =
2007 a[i + (ll + lsec - 1) * a_dim1];
2011 uisec = isec - isec % 4;
2012 i4 = jj + ujsec - 1;
2013 for (j = jj; j <= i4; j += 4)
2015 i5 = ii + uisec - 1;
2016 for (i = ii; i <= i5; i += 4)
2018 f11 = c[i + j * c_dim1];
2019 f21 = c[i + 1 + j * c_dim1];
2020 f12 = c[i + (j + 1) * c_dim1];
2021 f22 = c[i + 1 + (j + 1) * c_dim1];
2022 f13 = c[i + (j + 2) * c_dim1];
2023 f23 = c[i + 1 + (j + 2) * c_dim1];
2024 f14 = c[i + (j + 3) * c_dim1];
2025 f24 = c[i + 1 + (j + 3) * c_dim1];
2026 f31 = c[i + 2 + j * c_dim1];
2027 f41 = c[i + 3 + j * c_dim1];
2028 f32 = c[i + 2 + (j + 1) * c_dim1];
2029 f42 = c[i + 3 + (j + 1) * c_dim1];
2030 f33 = c[i + 2 + (j + 2) * c_dim1];
2031 f43 = c[i + 3 + (j + 2) * c_dim1];
2032 f34 = c[i + 2 + (j + 3) * c_dim1];
2033 f44 = c[i + 3 + (j + 3) * c_dim1];
2034 i6 = ll + lsec - 1;
2035 for (l = ll; l <= i6; ++l)
2037 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2038 * b[l + j * b_dim1];
2039 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2040 * b[l + j * b_dim1];
2041 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2042 * b[l + (j + 1) * b_dim1];
2043 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2044 * b[l + (j + 1) * b_dim1];
2045 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2046 * b[l + (j + 2) * b_dim1];
2047 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2048 * b[l + (j + 2) * b_dim1];
2049 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2050 * b[l + (j + 3) * b_dim1];
2051 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2052 * b[l + (j + 3) * b_dim1];
2053 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2054 * b[l + j * b_dim1];
2055 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2056 * b[l + j * b_dim1];
2057 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2058 * b[l + (j + 1) * b_dim1];
2059 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2060 * b[l + (j + 1) * b_dim1];
2061 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2062 * b[l + (j + 2) * b_dim1];
2063 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2064 * b[l + (j + 2) * b_dim1];
2065 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2066 * b[l + (j + 3) * b_dim1];
2067 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2068 * b[l + (j + 3) * b_dim1];
2070 c[i + j * c_dim1] = f11;
2071 c[i + 1 + j * c_dim1] = f21;
2072 c[i + (j + 1) * c_dim1] = f12;
2073 c[i + 1 + (j + 1) * c_dim1] = f22;
2074 c[i + (j + 2) * c_dim1] = f13;
2075 c[i + 1 + (j + 2) * c_dim1] = f23;
2076 c[i + (j + 3) * c_dim1] = f14;
2077 c[i + 1 + (j + 3) * c_dim1] = f24;
2078 c[i + 2 + j * c_dim1] = f31;
2079 c[i + 3 + j * c_dim1] = f41;
2080 c[i + 2 + (j + 1) * c_dim1] = f32;
2081 c[i + 3 + (j + 1) * c_dim1] = f42;
2082 c[i + 2 + (j + 2) * c_dim1] = f33;
2083 c[i + 3 + (j + 2) * c_dim1] = f43;
2084 c[i + 2 + (j + 3) * c_dim1] = f34;
2085 c[i + 3 + (j + 3) * c_dim1] = f44;
2087 if (uisec < isec)
2089 i5 = ii + isec - 1;
2090 for (i = ii + uisec; i <= i5; ++i)
2092 f11 = c[i + j * c_dim1];
2093 f12 = c[i + (j + 1) * c_dim1];
2094 f13 = c[i + (j + 2) * c_dim1];
2095 f14 = c[i + (j + 3) * c_dim1];
2096 i6 = ll + lsec - 1;
2097 for (l = ll; l <= i6; ++l)
2099 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2100 257] * b[l + j * b_dim1];
2101 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2102 257] * b[l + (j + 1) * b_dim1];
2103 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2104 257] * b[l + (j + 2) * b_dim1];
2105 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2106 257] * b[l + (j + 3) * b_dim1];
2108 c[i + j * c_dim1] = f11;
2109 c[i + (j + 1) * c_dim1] = f12;
2110 c[i + (j + 2) * c_dim1] = f13;
2111 c[i + (j + 3) * c_dim1] = f14;
2115 if (ujsec < jsec)
2117 i4 = jj + jsec - 1;
2118 for (j = jj + ujsec; j <= i4; ++j)
2120 i5 = ii + uisec - 1;
2121 for (i = ii; i <= i5; i += 4)
2123 f11 = c[i + j * c_dim1];
2124 f21 = c[i + 1 + j * c_dim1];
2125 f31 = c[i + 2 + j * c_dim1];
2126 f41 = c[i + 3 + j * c_dim1];
2127 i6 = ll + lsec - 1;
2128 for (l = ll; l <= i6; ++l)
2130 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2131 257] * b[l + j * b_dim1];
2132 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2133 257] * b[l + j * b_dim1];
2134 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2135 257] * b[l + j * b_dim1];
2136 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2137 257] * b[l + j * b_dim1];
2139 c[i + j * c_dim1] = f11;
2140 c[i + 1 + j * c_dim1] = f21;
2141 c[i + 2 + j * c_dim1] = f31;
2142 c[i + 3 + j * c_dim1] = f41;
2144 i5 = ii + isec - 1;
2145 for (i = ii + uisec; i <= i5; ++i)
2147 f11 = c[i + j * c_dim1];
2148 i6 = ll + lsec - 1;
2149 for (l = ll; l <= i6; ++l)
2151 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2152 257] * b[l + j * b_dim1];
2154 c[i + j * c_dim1] = f11;
2161 return;
2163 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2165 if (GFC_DESCRIPTOR_RANK (a) != 1)
2167 const GFC_REAL_4 *restrict abase_x;
2168 const GFC_REAL_4 *restrict bbase_y;
2169 GFC_REAL_4 *restrict dest_y;
2170 GFC_REAL_4 s;
2172 for (y = 0; y < ycount; y++)
2174 bbase_y = &bbase[y*bystride];
2175 dest_y = &dest[y*rystride];
2176 for (x = 0; x < xcount; x++)
2178 abase_x = &abase[x*axstride];
2179 s = (GFC_REAL_4) 0;
2180 for (n = 0; n < count; n++)
2181 s += abase_x[n] * bbase_y[n];
2182 dest_y[x] = s;
2186 else
2188 const GFC_REAL_4 *restrict bbase_y;
2189 GFC_REAL_4 s;
2191 for (y = 0; y < ycount; y++)
2193 bbase_y = &bbase[y*bystride];
2194 s = (GFC_REAL_4) 0;
2195 for (n = 0; n < count; n++)
2196 s += abase[n*axstride] * bbase_y[n];
2197 dest[y*rystride] = s;
2201 else if (axstride < aystride)
2203 for (y = 0; y < ycount; y++)
2204 for (x = 0; x < xcount; x++)
2205 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2207 for (y = 0; y < ycount; y++)
2208 for (n = 0; n < count; n++)
2209 for (x = 0; x < xcount; x++)
2210 /* dest[x,y] += a[x,n] * b[n,y] */
2211 dest[x*rxstride + y*rystride] +=
2212 abase[x*axstride + n*aystride] *
2213 bbase[n*bxstride + y*bystride];
2215 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2217 const GFC_REAL_4 *restrict bbase_y;
2218 GFC_REAL_4 s;
2220 for (y = 0; y < ycount; y++)
2222 bbase_y = &bbase[y*bystride];
2223 s = (GFC_REAL_4) 0;
2224 for (n = 0; n < count; n++)
2225 s += abase[n*axstride] * bbase_y[n*bxstride];
2226 dest[y*rxstride] = s;
2229 else
2231 const GFC_REAL_4 *restrict abase_x;
2232 const GFC_REAL_4 *restrict bbase_y;
2233 GFC_REAL_4 *restrict dest_y;
2234 GFC_REAL_4 s;
2236 for (y = 0; y < ycount; y++)
2238 bbase_y = &bbase[y*bystride];
2239 dest_y = &dest[y*rystride];
2240 for (x = 0; x < xcount; x++)
2242 abase_x = &abase[x*axstride];
2243 s = (GFC_REAL_4) 0;
2244 for (n = 0; n < count; n++)
2245 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2246 dest_y[x*rxstride] = s;
2251 #undef POW3
2252 #undef min
2253 #undef max
2256 /* Compiling main function, with selection code for the processor. */
2258 /* Currently, this is i386 only. Adjust for other architectures. */
2260 #include <config/i386/cpuinfo.h>
2261 void matmul_r4 (gfc_array_r4 * const restrict retarray,
2262 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2263 int blas_limit, blas_call gemm)
2265 static void (*matmul_p) (gfc_array_r4 * const restrict retarray,
2266 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2267 int blas_limit, blas_call gemm) = NULL;
2269 if (matmul_p == NULL)
2271 matmul_p = matmul_r4_vanilla;
2272 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2274 /* Run down the available processors in order of preference. */
2275 #ifdef HAVE_AVX512F
2276 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2278 matmul_p = matmul_r4_avx512f;
2279 goto tailcall;
2282 #endif /* HAVE_AVX512F */
2284 #ifdef HAVE_AVX2
2285 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2287 matmul_p = matmul_r4_avx2;
2288 goto tailcall;
2291 #endif
2293 #ifdef HAVE_AVX
2294 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2296 matmul_p = matmul_r4_avx;
2297 goto tailcall;
2299 #endif /* HAVE_AVX */
2303 tailcall:
2304 (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
2307 #else /* Just the vanilla function. */
2309 void
2310 matmul_r4 (gfc_array_r4 * const restrict retarray,
2311 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2312 int blas_limit, blas_call gemm)
2314 const GFC_REAL_4 * restrict abase;
2315 const GFC_REAL_4 * restrict bbase;
2316 GFC_REAL_4 * restrict dest;
2318 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2319 index_type x, y, n, count, xcount, ycount;
2321 assert (GFC_DESCRIPTOR_RANK (a) == 2
2322 || GFC_DESCRIPTOR_RANK (b) == 2);
2324 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2326 Either A or B (but not both) can be rank 1:
2328 o One-dimensional argument A is implicitly treated as a row matrix
2329 dimensioned [1,count], so xcount=1.
2331 o One-dimensional argument B is implicitly treated as a column matrix
2332 dimensioned [count, 1], so ycount=1.
2335 if (retarray->base_addr == NULL)
2337 if (GFC_DESCRIPTOR_RANK (a) == 1)
2339 GFC_DIMENSION_SET(retarray->dim[0], 0,
2340 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2342 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2344 GFC_DIMENSION_SET(retarray->dim[0], 0,
2345 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2347 else
2349 GFC_DIMENSION_SET(retarray->dim[0], 0,
2350 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2352 GFC_DIMENSION_SET(retarray->dim[1], 0,
2353 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2354 GFC_DESCRIPTOR_EXTENT(retarray,0));
2357 retarray->base_addr
2358 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
2359 retarray->offset = 0;
2361 else if (unlikely (compile_options.bounds_check))
2363 index_type ret_extent, arg_extent;
2365 if (GFC_DESCRIPTOR_RANK (a) == 1)
2367 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2368 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2369 if (arg_extent != ret_extent)
2370 runtime_error ("Incorrect extent in return array in"
2371 " MATMUL intrinsic: is %ld, should be %ld",
2372 (long int) ret_extent, (long int) arg_extent);
2374 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2376 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2377 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2378 if (arg_extent != ret_extent)
2379 runtime_error ("Incorrect extent in return array in"
2380 " MATMUL intrinsic: is %ld, should be %ld",
2381 (long int) ret_extent, (long int) arg_extent);
2383 else
2385 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2386 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2387 if (arg_extent != ret_extent)
2388 runtime_error ("Incorrect extent in return array in"
2389 " MATMUL intrinsic for dimension 1:"
2390 " is %ld, should be %ld",
2391 (long int) ret_extent, (long int) arg_extent);
2393 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2394 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2395 if (arg_extent != ret_extent)
2396 runtime_error ("Incorrect extent in return array in"
2397 " MATMUL intrinsic for dimension 2:"
2398 " is %ld, should be %ld",
2399 (long int) ret_extent, (long int) arg_extent);
2404 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2406 /* One-dimensional result may be addressed in the code below
2407 either as a row or a column matrix. We want both cases to
2408 work. */
2409 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2411 else
2413 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2414 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2418 if (GFC_DESCRIPTOR_RANK (a) == 1)
2420 /* Treat it as a a row matrix A[1,count]. */
2421 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2422 aystride = 1;
2424 xcount = 1;
2425 count = GFC_DESCRIPTOR_EXTENT(a,0);
2427 else
2429 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2430 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2432 count = GFC_DESCRIPTOR_EXTENT(a,1);
2433 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2436 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2438 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2439 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2442 if (GFC_DESCRIPTOR_RANK (b) == 1)
2444 /* Treat it as a column matrix B[count,1] */
2445 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2447 /* bystride should never be used for 1-dimensional b.
2448 in case it is we want it to cause a segfault, rather than
2449 an incorrect result. */
2450 bystride = 0xDEADBEEF;
2451 ycount = 1;
2453 else
2455 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2456 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2457 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2460 abase = a->base_addr;
2461 bbase = b->base_addr;
2462 dest = retarray->base_addr;
2464 /* Now that everything is set up, we perform the multiplication
2465 itself. */
2467 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2468 #define min(a,b) ((a) <= (b) ? (a) : (b))
2469 #define max(a,b) ((a) >= (b) ? (a) : (b))
2471 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2472 && (bxstride == 1 || bystride == 1)
2473 && (((float) xcount) * ((float) ycount) * ((float) count)
2474 > POW3(blas_limit)))
2476 const int m = xcount, n = ycount, k = count, ldc = rystride;
2477 const GFC_REAL_4 one = 1, zero = 0;
2478 const int lda = (axstride == 1) ? aystride : axstride,
2479 ldb = (bxstride == 1) ? bystride : bxstride;
2481 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2483 assert (gemm != NULL);
2484 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2485 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2486 &ldc, 1, 1);
2487 return;
2491 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2493 /* This block of code implements a tuned matmul, derived from
2494 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2496 Bo Kagstrom and Per Ling
2497 Department of Computing Science
2498 Umea University
2499 S-901 87 Umea, Sweden
2501 from netlib.org, translated to C, and modified for matmul.m4. */
2503 const GFC_REAL_4 *a, *b;
2504 GFC_REAL_4 *c;
2505 const index_type m = xcount, n = ycount, k = count;
2507 /* System generated locals */
2508 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2509 i1, i2, i3, i4, i5, i6;
2511 /* Local variables */
2512 GFC_REAL_4 t1[65536], /* was [256][256] */
2513 f11, f12, f21, f22, f31, f32, f41, f42,
2514 f13, f14, f23, f24, f33, f34, f43, f44;
2515 index_type i, j, l, ii, jj, ll;
2516 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2518 a = abase;
2519 b = bbase;
2520 c = retarray->base_addr;
2522 /* Parameter adjustments */
2523 c_dim1 = rystride;
2524 c_offset = 1 + c_dim1;
2525 c -= c_offset;
2526 a_dim1 = aystride;
2527 a_offset = 1 + a_dim1;
2528 a -= a_offset;
2529 b_dim1 = bystride;
2530 b_offset = 1 + b_dim1;
2531 b -= b_offset;
2533 /* Early exit if possible */
2534 if (m == 0 || n == 0 || k == 0)
2535 return;
2537 /* Empty c first. */
2538 for (j=1; j<=n; j++)
2539 for (i=1; i<=m; i++)
2540 c[i + j * c_dim1] = (GFC_REAL_4)0;
2542 /* Start turning the crank. */
2543 i1 = n;
2544 for (jj = 1; jj <= i1; jj += 512)
2546 /* Computing MIN */
2547 i2 = 512;
2548 i3 = n - jj + 1;
2549 jsec = min(i2,i3);
2550 ujsec = jsec - jsec % 4;
2551 i2 = k;
2552 for (ll = 1; ll <= i2; ll += 256)
2554 /* Computing MIN */
2555 i3 = 256;
2556 i4 = k - ll + 1;
2557 lsec = min(i3,i4);
2558 ulsec = lsec - lsec % 2;
2560 i3 = m;
2561 for (ii = 1; ii <= i3; ii += 256)
2563 /* Computing MIN */
2564 i4 = 256;
2565 i5 = m - ii + 1;
2566 isec = min(i4,i5);
2567 uisec = isec - isec % 2;
2568 i4 = ll + ulsec - 1;
2569 for (l = ll; l <= i4; l += 2)
2571 i5 = ii + uisec - 1;
2572 for (i = ii; i <= i5; i += 2)
2574 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2575 a[i + l * a_dim1];
2576 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2577 a[i + (l + 1) * a_dim1];
2578 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2579 a[i + 1 + l * a_dim1];
2580 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2581 a[i + 1 + (l + 1) * a_dim1];
2583 if (uisec < isec)
2585 t1[l - ll + 1 + (isec << 8) - 257] =
2586 a[ii + isec - 1 + l * a_dim1];
2587 t1[l - ll + 2 + (isec << 8) - 257] =
2588 a[ii + isec - 1 + (l + 1) * a_dim1];
2591 if (ulsec < lsec)
2593 i4 = ii + isec - 1;
2594 for (i = ii; i<= i4; ++i)
2596 t1[lsec + ((i - ii + 1) << 8) - 257] =
2597 a[i + (ll + lsec - 1) * a_dim1];
2601 uisec = isec - isec % 4;
2602 i4 = jj + ujsec - 1;
2603 for (j = jj; j <= i4; j += 4)
2605 i5 = ii + uisec - 1;
2606 for (i = ii; i <= i5; i += 4)
2608 f11 = c[i + j * c_dim1];
2609 f21 = c[i + 1 + j * c_dim1];
2610 f12 = c[i + (j + 1) * c_dim1];
2611 f22 = c[i + 1 + (j + 1) * c_dim1];
2612 f13 = c[i + (j + 2) * c_dim1];
2613 f23 = c[i + 1 + (j + 2) * c_dim1];
2614 f14 = c[i + (j + 3) * c_dim1];
2615 f24 = c[i + 1 + (j + 3) * c_dim1];
2616 f31 = c[i + 2 + j * c_dim1];
2617 f41 = c[i + 3 + j * c_dim1];
2618 f32 = c[i + 2 + (j + 1) * c_dim1];
2619 f42 = c[i + 3 + (j + 1) * c_dim1];
2620 f33 = c[i + 2 + (j + 2) * c_dim1];
2621 f43 = c[i + 3 + (j + 2) * c_dim1];
2622 f34 = c[i + 2 + (j + 3) * c_dim1];
2623 f44 = c[i + 3 + (j + 3) * c_dim1];
2624 i6 = ll + lsec - 1;
2625 for (l = ll; l <= i6; ++l)
2627 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2628 * b[l + j * b_dim1];
2629 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2630 * b[l + j * b_dim1];
2631 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2632 * b[l + (j + 1) * b_dim1];
2633 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2634 * b[l + (j + 1) * b_dim1];
2635 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2636 * b[l + (j + 2) * b_dim1];
2637 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2638 * b[l + (j + 2) * b_dim1];
2639 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2640 * b[l + (j + 3) * b_dim1];
2641 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2642 * b[l + (j + 3) * b_dim1];
2643 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2644 * b[l + j * b_dim1];
2645 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2646 * b[l + j * b_dim1];
2647 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2648 * b[l + (j + 1) * b_dim1];
2649 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2650 * b[l + (j + 1) * b_dim1];
2651 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2652 * b[l + (j + 2) * b_dim1];
2653 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2654 * b[l + (j + 2) * b_dim1];
2655 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2656 * b[l + (j + 3) * b_dim1];
2657 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2658 * b[l + (j + 3) * b_dim1];
2660 c[i + j * c_dim1] = f11;
2661 c[i + 1 + j * c_dim1] = f21;
2662 c[i + (j + 1) * c_dim1] = f12;
2663 c[i + 1 + (j + 1) * c_dim1] = f22;
2664 c[i + (j + 2) * c_dim1] = f13;
2665 c[i + 1 + (j + 2) * c_dim1] = f23;
2666 c[i + (j + 3) * c_dim1] = f14;
2667 c[i + 1 + (j + 3) * c_dim1] = f24;
2668 c[i + 2 + j * c_dim1] = f31;
2669 c[i + 3 + j * c_dim1] = f41;
2670 c[i + 2 + (j + 1) * c_dim1] = f32;
2671 c[i + 3 + (j + 1) * c_dim1] = f42;
2672 c[i + 2 + (j + 2) * c_dim1] = f33;
2673 c[i + 3 + (j + 2) * c_dim1] = f43;
2674 c[i + 2 + (j + 3) * c_dim1] = f34;
2675 c[i + 3 + (j + 3) * c_dim1] = f44;
2677 if (uisec < isec)
2679 i5 = ii + isec - 1;
2680 for (i = ii + uisec; i <= i5; ++i)
2682 f11 = c[i + j * c_dim1];
2683 f12 = c[i + (j + 1) * c_dim1];
2684 f13 = c[i + (j + 2) * c_dim1];
2685 f14 = c[i + (j + 3) * c_dim1];
2686 i6 = ll + lsec - 1;
2687 for (l = ll; l <= i6; ++l)
2689 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2690 257] * b[l + j * b_dim1];
2691 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2692 257] * b[l + (j + 1) * b_dim1];
2693 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2694 257] * b[l + (j + 2) * b_dim1];
2695 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2696 257] * b[l + (j + 3) * b_dim1];
2698 c[i + j * c_dim1] = f11;
2699 c[i + (j + 1) * c_dim1] = f12;
2700 c[i + (j + 2) * c_dim1] = f13;
2701 c[i + (j + 3) * c_dim1] = f14;
2705 if (ujsec < jsec)
2707 i4 = jj + jsec - 1;
2708 for (j = jj + ujsec; j <= i4; ++j)
2710 i5 = ii + uisec - 1;
2711 for (i = ii; i <= i5; i += 4)
2713 f11 = c[i + j * c_dim1];
2714 f21 = c[i + 1 + j * c_dim1];
2715 f31 = c[i + 2 + j * c_dim1];
2716 f41 = c[i + 3 + j * c_dim1];
2717 i6 = ll + lsec - 1;
2718 for (l = ll; l <= i6; ++l)
2720 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2721 257] * b[l + j * b_dim1];
2722 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2723 257] * b[l + j * b_dim1];
2724 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2725 257] * b[l + j * b_dim1];
2726 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2727 257] * b[l + j * b_dim1];
2729 c[i + j * c_dim1] = f11;
2730 c[i + 1 + j * c_dim1] = f21;
2731 c[i + 2 + j * c_dim1] = f31;
2732 c[i + 3 + j * c_dim1] = f41;
2734 i5 = ii + isec - 1;
2735 for (i = ii + uisec; i <= i5; ++i)
2737 f11 = c[i + j * c_dim1];
2738 i6 = ll + lsec - 1;
2739 for (l = ll; l <= i6; ++l)
2741 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2742 257] * b[l + j * b_dim1];
2744 c[i + j * c_dim1] = f11;
2751 return;
2753 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2755 if (GFC_DESCRIPTOR_RANK (a) != 1)
2757 const GFC_REAL_4 *restrict abase_x;
2758 const GFC_REAL_4 *restrict bbase_y;
2759 GFC_REAL_4 *restrict dest_y;
2760 GFC_REAL_4 s;
2762 for (y = 0; y < ycount; y++)
2764 bbase_y = &bbase[y*bystride];
2765 dest_y = &dest[y*rystride];
2766 for (x = 0; x < xcount; x++)
2768 abase_x = &abase[x*axstride];
2769 s = (GFC_REAL_4) 0;
2770 for (n = 0; n < count; n++)
2771 s += abase_x[n] * bbase_y[n];
2772 dest_y[x] = s;
2776 else
2778 const GFC_REAL_4 *restrict bbase_y;
2779 GFC_REAL_4 s;
2781 for (y = 0; y < ycount; y++)
2783 bbase_y = &bbase[y*bystride];
2784 s = (GFC_REAL_4) 0;
2785 for (n = 0; n < count; n++)
2786 s += abase[n*axstride] * bbase_y[n];
2787 dest[y*rystride] = s;
2791 else if (axstride < aystride)
2793 for (y = 0; y < ycount; y++)
2794 for (x = 0; x < xcount; x++)
2795 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2797 for (y = 0; y < ycount; y++)
2798 for (n = 0; n < count; n++)
2799 for (x = 0; x < xcount; x++)
2800 /* dest[x,y] += a[x,n] * b[n,y] */
2801 dest[x*rxstride + y*rystride] +=
2802 abase[x*axstride + n*aystride] *
2803 bbase[n*bxstride + y*bystride];
2805 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2807 const GFC_REAL_4 *restrict bbase_y;
2808 GFC_REAL_4 s;
2810 for (y = 0; y < ycount; y++)
2812 bbase_y = &bbase[y*bystride];
2813 s = (GFC_REAL_4) 0;
2814 for (n = 0; n < count; n++)
2815 s += abase[n*axstride] * bbase_y[n*bxstride];
2816 dest[y*rxstride] = s;
2819 else
2821 const GFC_REAL_4 *restrict abase_x;
2822 const GFC_REAL_4 *restrict bbase_y;
2823 GFC_REAL_4 *restrict dest_y;
2824 GFC_REAL_4 s;
2826 for (y = 0; y < ycount; y++)
2828 bbase_y = &bbase[y*bystride];
2829 dest_y = &dest[y*rystride];
2830 for (x = 0; x < xcount; x++)
2832 abase_x = &abase[x*axstride];
2833 s = (GFC_REAL_4) 0;
2834 for (n = 0; n < count; n++)
2835 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2836 dest_y[x*rxstride] = s;
2841 #undef POW3
2842 #undef min
2843 #undef max
2845 #endif
2846 #endif