Update C++17 library status table in manual
[official-gcc.git] / libgfortran / generated / matmul_r4.c
blobdbb31b05c3bc9d1139cbfc6f4e93b641e376878f
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
5 This file is part of the GNU Fortran runtime library (libgfortran).
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
31 #if defined (HAVE_GFC_REAL_4)
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
35 matrices. */
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
39 const int *, const GFC_REAL_4 *, const int *,
40 const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
41 int, int);
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
49 The equivalent Fortran pseudo-code is:
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
65 ENDIF
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
72 extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
73 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75 export_proto(matmul_r4);
77 /* Put exhaustive list of possible architectures here here, ORed together. */
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
81 #ifdef HAVE_AVX
82 static void
83 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
84 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86 static void
87 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
88 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
89 int blas_limit, blas_call gemm)
91 const GFC_REAL_4 * restrict abase;
92 const GFC_REAL_4 * restrict bbase;
93 GFC_REAL_4 * restrict dest;
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96 index_type x, y, n, count, xcount, ycount;
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
99 || GFC_DESCRIPTOR_RANK (b) == 2);
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
103 Either A or B (but not both) can be rank 1:
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
112 if (retarray->base_addr == NULL)
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
124 else
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
134 retarray->base_addr
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
136 retarray->offset = 0;
138 else if (unlikely (compile_options.bounds_check))
140 index_type ret_extent, arg_extent;
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 if (arg_extent != ret_extent)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent, (long int) arg_extent);
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 if (arg_extent != ret_extent)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent, (long int) arg_extent);
160 else
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 if (arg_extent != ret_extent)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent, (long int) arg_extent);
170 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
171 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
172 if (arg_extent != ret_extent)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent, (long int) arg_extent);
181 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
185 work. */
186 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
188 else
190 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
191 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
195 if (GFC_DESCRIPTOR_RANK (a) == 1)
197 /* Treat it as a a row matrix A[1,count]. */
198 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
199 aystride = 1;
201 xcount = 1;
202 count = GFC_DESCRIPTOR_EXTENT(a,0);
204 else
206 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
207 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
209 count = GFC_DESCRIPTOR_EXTENT(a,1);
210 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
213 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
215 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
221 /* Treat it as a column matrix B[count,1] */
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
224 /* bystride should never be used for 1-dimensional b.
225 in case it is we want it to cause a segfault, rather than
226 an incorrect result. */
227 bystride = 0xDEADBEEF;
228 ycount = 1;
230 else
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
237 abase = a->base_addr;
238 bbase = b->base_addr;
239 dest = retarray->base_addr;
241 /* Now that everything is set up, we perform the multiplication
242 itself. */
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249 && (bxstride == 1 || bystride == 1)
250 && (((float) xcount) * ((float) ycount) * ((float) count)
251 > POW3(blas_limit)))
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
254 const GFC_REAL_4 one = 1, zero = 0;
255 const int lda = (axstride == 1) ? aystride : axstride,
256 ldb = (bxstride == 1) ? bystride : bxstride;
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
260 assert (gemm != NULL);
261 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
262 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
263 &ldc, 1, 1);
264 return;
268 if (rxstride == 1 && axstride == 1 && bxstride == 1)
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
275 Umea University
276 S-901 87 Umea, Sweden
278 from netlib.org, translated to C, and modified for matmul.m4. */
280 const GFC_REAL_4 *a, *b;
281 GFC_REAL_4 *c;
282 const index_type m = xcount, n = ycount, k = count;
284 /* System generated locals */
285 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
286 i1, i2, i3, i4, i5, i6;
288 /* Local variables */
289 GFC_REAL_4 t1[65536], /* was [256][256] */
290 f11, f12, f21, f22, f31, f32, f41, f42,
291 f13, f14, f23, f24, f33, f34, f43, f44;
292 index_type i, j, l, ii, jj, ll;
293 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
295 a = abase;
296 b = bbase;
297 c = retarray->base_addr;
299 /* Parameter adjustments */
300 c_dim1 = rystride;
301 c_offset = 1 + c_dim1;
302 c -= c_offset;
303 a_dim1 = aystride;
304 a_offset = 1 + a_dim1;
305 a -= a_offset;
306 b_dim1 = bystride;
307 b_offset = 1 + b_dim1;
308 b -= b_offset;
310 /* Early exit if possible */
311 if (m == 0 || n == 0 || k == 0)
312 return;
314 /* Empty c first. */
315 for (j=1; j<=n; j++)
316 for (i=1; i<=m; i++)
317 c[i + j * c_dim1] = (GFC_REAL_4)0;
319 /* Start turning the crank. */
320 i1 = n;
321 for (jj = 1; jj <= i1; jj += 512)
323 /* Computing MIN */
324 i2 = 512;
325 i3 = n - jj + 1;
326 jsec = min(i2,i3);
327 ujsec = jsec - jsec % 4;
328 i2 = k;
329 for (ll = 1; ll <= i2; ll += 256)
331 /* Computing MIN */
332 i3 = 256;
333 i4 = k - ll + 1;
334 lsec = min(i3,i4);
335 ulsec = lsec - lsec % 2;
337 i3 = m;
338 for (ii = 1; ii <= i3; ii += 256)
340 /* Computing MIN */
341 i4 = 256;
342 i5 = m - ii + 1;
343 isec = min(i4,i5);
344 uisec = isec - isec % 2;
345 i4 = ll + ulsec - 1;
346 for (l = ll; l <= i4; l += 2)
348 i5 = ii + uisec - 1;
349 for (i = ii; i <= i5; i += 2)
351 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
352 a[i + l * a_dim1];
353 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
354 a[i + (l + 1) * a_dim1];
355 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
356 a[i + 1 + l * a_dim1];
357 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
358 a[i + 1 + (l + 1) * a_dim1];
360 if (uisec < isec)
362 t1[l - ll + 1 + (isec << 8) - 257] =
363 a[ii + isec - 1 + l * a_dim1];
364 t1[l - ll + 2 + (isec << 8) - 257] =
365 a[ii + isec - 1 + (l + 1) * a_dim1];
368 if (ulsec < lsec)
370 i4 = ii + isec - 1;
371 for (i = ii; i<= i4; ++i)
373 t1[lsec + ((i - ii + 1) << 8) - 257] =
374 a[i + (ll + lsec - 1) * a_dim1];
378 uisec = isec - isec % 4;
379 i4 = jj + ujsec - 1;
380 for (j = jj; j <= i4; j += 4)
382 i5 = ii + uisec - 1;
383 for (i = ii; i <= i5; i += 4)
385 f11 = c[i + j * c_dim1];
386 f21 = c[i + 1 + j * c_dim1];
387 f12 = c[i + (j + 1) * c_dim1];
388 f22 = c[i + 1 + (j + 1) * c_dim1];
389 f13 = c[i + (j + 2) * c_dim1];
390 f23 = c[i + 1 + (j + 2) * c_dim1];
391 f14 = c[i + (j + 3) * c_dim1];
392 f24 = c[i + 1 + (j + 3) * c_dim1];
393 f31 = c[i + 2 + j * c_dim1];
394 f41 = c[i + 3 + j * c_dim1];
395 f32 = c[i + 2 + (j + 1) * c_dim1];
396 f42 = c[i + 3 + (j + 1) * c_dim1];
397 f33 = c[i + 2 + (j + 2) * c_dim1];
398 f43 = c[i + 3 + (j + 2) * c_dim1];
399 f34 = c[i + 2 + (j + 3) * c_dim1];
400 f44 = c[i + 3 + (j + 3) * c_dim1];
401 i6 = ll + lsec - 1;
402 for (l = ll; l <= i6; ++l)
404 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
405 * b[l + j * b_dim1];
406 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
407 * b[l + j * b_dim1];
408 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
409 * b[l + (j + 1) * b_dim1];
410 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
411 * b[l + (j + 1) * b_dim1];
412 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
413 * b[l + (j + 2) * b_dim1];
414 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
415 * b[l + (j + 2) * b_dim1];
416 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
417 * b[l + (j + 3) * b_dim1];
418 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
419 * b[l + (j + 3) * b_dim1];
420 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
421 * b[l + j * b_dim1];
422 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
423 * b[l + j * b_dim1];
424 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
425 * b[l + (j + 1) * b_dim1];
426 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
427 * b[l + (j + 1) * b_dim1];
428 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
429 * b[l + (j + 2) * b_dim1];
430 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
431 * b[l + (j + 2) * b_dim1];
432 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
433 * b[l + (j + 3) * b_dim1];
434 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
435 * b[l + (j + 3) * b_dim1];
437 c[i + j * c_dim1] = f11;
438 c[i + 1 + j * c_dim1] = f21;
439 c[i + (j + 1) * c_dim1] = f12;
440 c[i + 1 + (j + 1) * c_dim1] = f22;
441 c[i + (j + 2) * c_dim1] = f13;
442 c[i + 1 + (j + 2) * c_dim1] = f23;
443 c[i + (j + 3) * c_dim1] = f14;
444 c[i + 1 + (j + 3) * c_dim1] = f24;
445 c[i + 2 + j * c_dim1] = f31;
446 c[i + 3 + j * c_dim1] = f41;
447 c[i + 2 + (j + 1) * c_dim1] = f32;
448 c[i + 3 + (j + 1) * c_dim1] = f42;
449 c[i + 2 + (j + 2) * c_dim1] = f33;
450 c[i + 3 + (j + 2) * c_dim1] = f43;
451 c[i + 2 + (j + 3) * c_dim1] = f34;
452 c[i + 3 + (j + 3) * c_dim1] = f44;
454 if (uisec < isec)
456 i5 = ii + isec - 1;
457 for (i = ii + uisec; i <= i5; ++i)
459 f11 = c[i + j * c_dim1];
460 f12 = c[i + (j + 1) * c_dim1];
461 f13 = c[i + (j + 2) * c_dim1];
462 f14 = c[i + (j + 3) * c_dim1];
463 i6 = ll + lsec - 1;
464 for (l = ll; l <= i6; ++l)
466 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
467 257] * b[l + j * b_dim1];
468 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
469 257] * b[l + (j + 1) * b_dim1];
470 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
471 257] * b[l + (j + 2) * b_dim1];
472 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
473 257] * b[l + (j + 3) * b_dim1];
475 c[i + j * c_dim1] = f11;
476 c[i + (j + 1) * c_dim1] = f12;
477 c[i + (j + 2) * c_dim1] = f13;
478 c[i + (j + 3) * c_dim1] = f14;
482 if (ujsec < jsec)
484 i4 = jj + jsec - 1;
485 for (j = jj + ujsec; j <= i4; ++j)
487 i5 = ii + uisec - 1;
488 for (i = ii; i <= i5; i += 4)
490 f11 = c[i + j * c_dim1];
491 f21 = c[i + 1 + j * c_dim1];
492 f31 = c[i + 2 + j * c_dim1];
493 f41 = c[i + 3 + j * c_dim1];
494 i6 = ll + lsec - 1;
495 for (l = ll; l <= i6; ++l)
497 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
498 257] * b[l + j * b_dim1];
499 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
500 257] * b[l + j * b_dim1];
501 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
502 257] * b[l + j * b_dim1];
503 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
504 257] * b[l + j * b_dim1];
506 c[i + j * c_dim1] = f11;
507 c[i + 1 + j * c_dim1] = f21;
508 c[i + 2 + j * c_dim1] = f31;
509 c[i + 3 + j * c_dim1] = f41;
511 i5 = ii + isec - 1;
512 for (i = ii + uisec; i <= i5; ++i)
514 f11 = c[i + j * c_dim1];
515 i6 = ll + lsec - 1;
516 for (l = ll; l <= i6; ++l)
518 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
519 257] * b[l + j * b_dim1];
521 c[i + j * c_dim1] = f11;
528 return;
530 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
532 if (GFC_DESCRIPTOR_RANK (a) != 1)
534 const GFC_REAL_4 *restrict abase_x;
535 const GFC_REAL_4 *restrict bbase_y;
536 GFC_REAL_4 *restrict dest_y;
537 GFC_REAL_4 s;
539 for (y = 0; y < ycount; y++)
541 bbase_y = &bbase[y*bystride];
542 dest_y = &dest[y*rystride];
543 for (x = 0; x < xcount; x++)
545 abase_x = &abase[x*axstride];
546 s = (GFC_REAL_4) 0;
547 for (n = 0; n < count; n++)
548 s += abase_x[n] * bbase_y[n];
549 dest_y[x] = s;
553 else
555 const GFC_REAL_4 *restrict bbase_y;
556 GFC_REAL_4 s;
558 for (y = 0; y < ycount; y++)
560 bbase_y = &bbase[y*bystride];
561 s = (GFC_REAL_4) 0;
562 for (n = 0; n < count; n++)
563 s += abase[n*axstride] * bbase_y[n];
564 dest[y*rystride] = s;
568 else if (axstride < aystride)
570 for (y = 0; y < ycount; y++)
571 for (x = 0; x < xcount; x++)
572 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
574 for (y = 0; y < ycount; y++)
575 for (n = 0; n < count; n++)
576 for (x = 0; x < xcount; x++)
577 /* dest[x,y] += a[x,n] * b[n,y] */
578 dest[x*rxstride + y*rystride] +=
579 abase[x*axstride + n*aystride] *
580 bbase[n*bxstride + y*bystride];
582 else if (GFC_DESCRIPTOR_RANK (a) == 1)
584 const GFC_REAL_4 *restrict bbase_y;
585 GFC_REAL_4 s;
587 for (y = 0; y < ycount; y++)
589 bbase_y = &bbase[y*bystride];
590 s = (GFC_REAL_4) 0;
591 for (n = 0; n < count; n++)
592 s += abase[n*axstride] * bbase_y[n*bxstride];
593 dest[y*rxstride] = s;
596 else
598 const GFC_REAL_4 *restrict abase_x;
599 const GFC_REAL_4 *restrict bbase_y;
600 GFC_REAL_4 *restrict dest_y;
601 GFC_REAL_4 s;
603 for (y = 0; y < ycount; y++)
605 bbase_y = &bbase[y*bystride];
606 dest_y = &dest[y*rystride];
607 for (x = 0; x < xcount; x++)
609 abase_x = &abase[x*axstride];
610 s = (GFC_REAL_4) 0;
611 for (n = 0; n < count; n++)
612 s += abase_x[n*aystride] * bbase_y[n*bxstride];
613 dest_y[x*rxstride] = s;
618 #undef POW3
619 #undef min
620 #undef max
622 #endif /* HAVE_AVX */
624 #ifdef HAVE_AVX2
625 static void
626 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
627 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
628 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
629 static void
630 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
631 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
632 int blas_limit, blas_call gemm)
634 const GFC_REAL_4 * restrict abase;
635 const GFC_REAL_4 * restrict bbase;
636 GFC_REAL_4 * restrict dest;
638 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
639 index_type x, y, n, count, xcount, ycount;
641 assert (GFC_DESCRIPTOR_RANK (a) == 2
642 || GFC_DESCRIPTOR_RANK (b) == 2);
644 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
646 Either A or B (but not both) can be rank 1:
648 o One-dimensional argument A is implicitly treated as a row matrix
649 dimensioned [1,count], so xcount=1.
651 o One-dimensional argument B is implicitly treated as a column matrix
652 dimensioned [count, 1], so ycount=1.
655 if (retarray->base_addr == NULL)
657 if (GFC_DESCRIPTOR_RANK (a) == 1)
659 GFC_DIMENSION_SET(retarray->dim[0], 0,
660 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
662 else if (GFC_DESCRIPTOR_RANK (b) == 1)
664 GFC_DIMENSION_SET(retarray->dim[0], 0,
665 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
667 else
669 GFC_DIMENSION_SET(retarray->dim[0], 0,
670 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
672 GFC_DIMENSION_SET(retarray->dim[1], 0,
673 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
674 GFC_DESCRIPTOR_EXTENT(retarray,0));
677 retarray->base_addr
678 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
679 retarray->offset = 0;
681 else if (unlikely (compile_options.bounds_check))
683 index_type ret_extent, arg_extent;
685 if (GFC_DESCRIPTOR_RANK (a) == 1)
687 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
688 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
689 if (arg_extent != ret_extent)
690 runtime_error ("Incorrect extent in return array in"
691 " MATMUL intrinsic: is %ld, should be %ld",
692 (long int) ret_extent, (long int) arg_extent);
694 else if (GFC_DESCRIPTOR_RANK (b) == 1)
696 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
697 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
698 if (arg_extent != ret_extent)
699 runtime_error ("Incorrect extent in return array in"
700 " MATMUL intrinsic: is %ld, should be %ld",
701 (long int) ret_extent, (long int) arg_extent);
703 else
705 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
706 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
707 if (arg_extent != ret_extent)
708 runtime_error ("Incorrect extent in return array in"
709 " MATMUL intrinsic for dimension 1:"
710 " is %ld, should be %ld",
711 (long int) ret_extent, (long int) arg_extent);
713 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
714 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
715 if (arg_extent != ret_extent)
716 runtime_error ("Incorrect extent in return array in"
717 " MATMUL intrinsic for dimension 2:"
718 " is %ld, should be %ld",
719 (long int) ret_extent, (long int) arg_extent);
724 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
726 /* One-dimensional result may be addressed in the code below
727 either as a row or a column matrix. We want both cases to
728 work. */
729 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
731 else
733 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
734 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
738 if (GFC_DESCRIPTOR_RANK (a) == 1)
740 /* Treat it as a a row matrix A[1,count]. */
741 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
742 aystride = 1;
744 xcount = 1;
745 count = GFC_DESCRIPTOR_EXTENT(a,0);
747 else
749 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
750 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
752 count = GFC_DESCRIPTOR_EXTENT(a,1);
753 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
756 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
758 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
759 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
762 if (GFC_DESCRIPTOR_RANK (b) == 1)
764 /* Treat it as a column matrix B[count,1] */
765 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
767 /* bystride should never be used for 1-dimensional b.
768 in case it is we want it to cause a segfault, rather than
769 an incorrect result. */
770 bystride = 0xDEADBEEF;
771 ycount = 1;
773 else
775 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
776 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
777 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
780 abase = a->base_addr;
781 bbase = b->base_addr;
782 dest = retarray->base_addr;
784 /* Now that everything is set up, we perform the multiplication
785 itself. */
787 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
788 #define min(a,b) ((a) <= (b) ? (a) : (b))
789 #define max(a,b) ((a) >= (b) ? (a) : (b))
791 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
792 && (bxstride == 1 || bystride == 1)
793 && (((float) xcount) * ((float) ycount) * ((float) count)
794 > POW3(blas_limit)))
796 const int m = xcount, n = ycount, k = count, ldc = rystride;
797 const GFC_REAL_4 one = 1, zero = 0;
798 const int lda = (axstride == 1) ? aystride : axstride,
799 ldb = (bxstride == 1) ? bystride : bxstride;
801 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
803 assert (gemm != NULL);
804 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
805 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
806 &ldc, 1, 1);
807 return;
811 if (rxstride == 1 && axstride == 1 && bxstride == 1)
813 /* This block of code implements a tuned matmul, derived from
814 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
816 Bo Kagstrom and Per Ling
817 Department of Computing Science
818 Umea University
819 S-901 87 Umea, Sweden
821 from netlib.org, translated to C, and modified for matmul.m4. */
823 const GFC_REAL_4 *a, *b;
824 GFC_REAL_4 *c;
825 const index_type m = xcount, n = ycount, k = count;
827 /* System generated locals */
828 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
829 i1, i2, i3, i4, i5, i6;
831 /* Local variables */
832 GFC_REAL_4 t1[65536], /* was [256][256] */
833 f11, f12, f21, f22, f31, f32, f41, f42,
834 f13, f14, f23, f24, f33, f34, f43, f44;
835 index_type i, j, l, ii, jj, ll;
836 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
838 a = abase;
839 b = bbase;
840 c = retarray->base_addr;
842 /* Parameter adjustments */
843 c_dim1 = rystride;
844 c_offset = 1 + c_dim1;
845 c -= c_offset;
846 a_dim1 = aystride;
847 a_offset = 1 + a_dim1;
848 a -= a_offset;
849 b_dim1 = bystride;
850 b_offset = 1 + b_dim1;
851 b -= b_offset;
853 /* Early exit if possible */
854 if (m == 0 || n == 0 || k == 0)
855 return;
857 /* Empty c first. */
858 for (j=1; j<=n; j++)
859 for (i=1; i<=m; i++)
860 c[i + j * c_dim1] = (GFC_REAL_4)0;
862 /* Start turning the crank. */
863 i1 = n;
864 for (jj = 1; jj <= i1; jj += 512)
866 /* Computing MIN */
867 i2 = 512;
868 i3 = n - jj + 1;
869 jsec = min(i2,i3);
870 ujsec = jsec - jsec % 4;
871 i2 = k;
872 for (ll = 1; ll <= i2; ll += 256)
874 /* Computing MIN */
875 i3 = 256;
876 i4 = k - ll + 1;
877 lsec = min(i3,i4);
878 ulsec = lsec - lsec % 2;
880 i3 = m;
881 for (ii = 1; ii <= i3; ii += 256)
883 /* Computing MIN */
884 i4 = 256;
885 i5 = m - ii + 1;
886 isec = min(i4,i5);
887 uisec = isec - isec % 2;
888 i4 = ll + ulsec - 1;
889 for (l = ll; l <= i4; l += 2)
891 i5 = ii + uisec - 1;
892 for (i = ii; i <= i5; i += 2)
894 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
895 a[i + l * a_dim1];
896 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
897 a[i + (l + 1) * a_dim1];
898 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
899 a[i + 1 + l * a_dim1];
900 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
901 a[i + 1 + (l + 1) * a_dim1];
903 if (uisec < isec)
905 t1[l - ll + 1 + (isec << 8) - 257] =
906 a[ii + isec - 1 + l * a_dim1];
907 t1[l - ll + 2 + (isec << 8) - 257] =
908 a[ii + isec - 1 + (l + 1) * a_dim1];
911 if (ulsec < lsec)
913 i4 = ii + isec - 1;
914 for (i = ii; i<= i4; ++i)
916 t1[lsec + ((i - ii + 1) << 8) - 257] =
917 a[i + (ll + lsec - 1) * a_dim1];
921 uisec = isec - isec % 4;
922 i4 = jj + ujsec - 1;
923 for (j = jj; j <= i4; j += 4)
925 i5 = ii + uisec - 1;
926 for (i = ii; i <= i5; i += 4)
928 f11 = c[i + j * c_dim1];
929 f21 = c[i + 1 + j * c_dim1];
930 f12 = c[i + (j + 1) * c_dim1];
931 f22 = c[i + 1 + (j + 1) * c_dim1];
932 f13 = c[i + (j + 2) * c_dim1];
933 f23 = c[i + 1 + (j + 2) * c_dim1];
934 f14 = c[i + (j + 3) * c_dim1];
935 f24 = c[i + 1 + (j + 3) * c_dim1];
936 f31 = c[i + 2 + j * c_dim1];
937 f41 = c[i + 3 + j * c_dim1];
938 f32 = c[i + 2 + (j + 1) * c_dim1];
939 f42 = c[i + 3 + (j + 1) * c_dim1];
940 f33 = c[i + 2 + (j + 2) * c_dim1];
941 f43 = c[i + 3 + (j + 2) * c_dim1];
942 f34 = c[i + 2 + (j + 3) * c_dim1];
943 f44 = c[i + 3 + (j + 3) * c_dim1];
944 i6 = ll + lsec - 1;
945 for (l = ll; l <= i6; ++l)
947 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
948 * b[l + j * b_dim1];
949 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
950 * b[l + j * b_dim1];
951 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
952 * b[l + (j + 1) * b_dim1];
953 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
954 * b[l + (j + 1) * b_dim1];
955 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
956 * b[l + (j + 2) * b_dim1];
957 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
958 * b[l + (j + 2) * b_dim1];
959 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
960 * b[l + (j + 3) * b_dim1];
961 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
962 * b[l + (j + 3) * b_dim1];
963 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
964 * b[l + j * b_dim1];
965 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
966 * b[l + j * b_dim1];
967 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
968 * b[l + (j + 1) * b_dim1];
969 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
970 * b[l + (j + 1) * b_dim1];
971 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
972 * b[l + (j + 2) * b_dim1];
973 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
974 * b[l + (j + 2) * b_dim1];
975 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
976 * b[l + (j + 3) * b_dim1];
977 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
978 * b[l + (j + 3) * b_dim1];
980 c[i + j * c_dim1] = f11;
981 c[i + 1 + j * c_dim1] = f21;
982 c[i + (j + 1) * c_dim1] = f12;
983 c[i + 1 + (j + 1) * c_dim1] = f22;
984 c[i + (j + 2) * c_dim1] = f13;
985 c[i + 1 + (j + 2) * c_dim1] = f23;
986 c[i + (j + 3) * c_dim1] = f14;
987 c[i + 1 + (j + 3) * c_dim1] = f24;
988 c[i + 2 + j * c_dim1] = f31;
989 c[i + 3 + j * c_dim1] = f41;
990 c[i + 2 + (j + 1) * c_dim1] = f32;
991 c[i + 3 + (j + 1) * c_dim1] = f42;
992 c[i + 2 + (j + 2) * c_dim1] = f33;
993 c[i + 3 + (j + 2) * c_dim1] = f43;
994 c[i + 2 + (j + 3) * c_dim1] = f34;
995 c[i + 3 + (j + 3) * c_dim1] = f44;
997 if (uisec < isec)
999 i5 = ii + isec - 1;
1000 for (i = ii + uisec; i <= i5; ++i)
1002 f11 = c[i + j * c_dim1];
1003 f12 = c[i + (j + 1) * c_dim1];
1004 f13 = c[i + (j + 2) * c_dim1];
1005 f14 = c[i + (j + 3) * c_dim1];
1006 i6 = ll + lsec - 1;
1007 for (l = ll; l <= i6; ++l)
1009 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1010 257] * b[l + j * b_dim1];
1011 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1012 257] * b[l + (j + 1) * b_dim1];
1013 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1014 257] * b[l + (j + 2) * b_dim1];
1015 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1016 257] * b[l + (j + 3) * b_dim1];
1018 c[i + j * c_dim1] = f11;
1019 c[i + (j + 1) * c_dim1] = f12;
1020 c[i + (j + 2) * c_dim1] = f13;
1021 c[i + (j + 3) * c_dim1] = f14;
1025 if (ujsec < jsec)
1027 i4 = jj + jsec - 1;
1028 for (j = jj + ujsec; j <= i4; ++j)
1030 i5 = ii + uisec - 1;
1031 for (i = ii; i <= i5; i += 4)
1033 f11 = c[i + j * c_dim1];
1034 f21 = c[i + 1 + j * c_dim1];
1035 f31 = c[i + 2 + j * c_dim1];
1036 f41 = c[i + 3 + j * c_dim1];
1037 i6 = ll + lsec - 1;
1038 for (l = ll; l <= i6; ++l)
1040 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1041 257] * b[l + j * b_dim1];
1042 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1043 257] * b[l + j * b_dim1];
1044 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1045 257] * b[l + j * b_dim1];
1046 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1047 257] * b[l + j * b_dim1];
1049 c[i + j * c_dim1] = f11;
1050 c[i + 1 + j * c_dim1] = f21;
1051 c[i + 2 + j * c_dim1] = f31;
1052 c[i + 3 + j * c_dim1] = f41;
1054 i5 = ii + isec - 1;
1055 for (i = ii + uisec; i <= i5; ++i)
1057 f11 = c[i + j * c_dim1];
1058 i6 = ll + lsec - 1;
1059 for (l = ll; l <= i6; ++l)
1061 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1062 257] * b[l + j * b_dim1];
1064 c[i + j * c_dim1] = f11;
1071 return;
1073 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1075 if (GFC_DESCRIPTOR_RANK (a) != 1)
1077 const GFC_REAL_4 *restrict abase_x;
1078 const GFC_REAL_4 *restrict bbase_y;
1079 GFC_REAL_4 *restrict dest_y;
1080 GFC_REAL_4 s;
1082 for (y = 0; y < ycount; y++)
1084 bbase_y = &bbase[y*bystride];
1085 dest_y = &dest[y*rystride];
1086 for (x = 0; x < xcount; x++)
1088 abase_x = &abase[x*axstride];
1089 s = (GFC_REAL_4) 0;
1090 for (n = 0; n < count; n++)
1091 s += abase_x[n] * bbase_y[n];
1092 dest_y[x] = s;
1096 else
1098 const GFC_REAL_4 *restrict bbase_y;
1099 GFC_REAL_4 s;
1101 for (y = 0; y < ycount; y++)
1103 bbase_y = &bbase[y*bystride];
1104 s = (GFC_REAL_4) 0;
1105 for (n = 0; n < count; n++)
1106 s += abase[n*axstride] * bbase_y[n];
1107 dest[y*rystride] = s;
1111 else if (axstride < aystride)
1113 for (y = 0; y < ycount; y++)
1114 for (x = 0; x < xcount; x++)
1115 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1117 for (y = 0; y < ycount; y++)
1118 for (n = 0; n < count; n++)
1119 for (x = 0; x < xcount; x++)
1120 /* dest[x,y] += a[x,n] * b[n,y] */
1121 dest[x*rxstride + y*rystride] +=
1122 abase[x*axstride + n*aystride] *
1123 bbase[n*bxstride + y*bystride];
1125 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1127 const GFC_REAL_4 *restrict bbase_y;
1128 GFC_REAL_4 s;
1130 for (y = 0; y < ycount; y++)
1132 bbase_y = &bbase[y*bystride];
1133 s = (GFC_REAL_4) 0;
1134 for (n = 0; n < count; n++)
1135 s += abase[n*axstride] * bbase_y[n*bxstride];
1136 dest[y*rxstride] = s;
1139 else
1141 const GFC_REAL_4 *restrict abase_x;
1142 const GFC_REAL_4 *restrict bbase_y;
1143 GFC_REAL_4 *restrict dest_y;
1144 GFC_REAL_4 s;
1146 for (y = 0; y < ycount; y++)
1148 bbase_y = &bbase[y*bystride];
1149 dest_y = &dest[y*rystride];
1150 for (x = 0; x < xcount; x++)
1152 abase_x = &abase[x*axstride];
1153 s = (GFC_REAL_4) 0;
1154 for (n = 0; n < count; n++)
1155 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1156 dest_y[x*rxstride] = s;
1161 #undef POW3
1162 #undef min
1163 #undef max
1165 #endif /* HAVE_AVX2 */
1167 #ifdef HAVE_AVX512F
1168 static void
1169 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1170 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1171 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1172 static void
1173 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1174 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1175 int blas_limit, blas_call gemm)
1177 const GFC_REAL_4 * restrict abase;
1178 const GFC_REAL_4 * restrict bbase;
1179 GFC_REAL_4 * restrict dest;
1181 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1182 index_type x, y, n, count, xcount, ycount;
1184 assert (GFC_DESCRIPTOR_RANK (a) == 2
1185 || GFC_DESCRIPTOR_RANK (b) == 2);
1187 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1189 Either A or B (but not both) can be rank 1:
1191 o One-dimensional argument A is implicitly treated as a row matrix
1192 dimensioned [1,count], so xcount=1.
1194 o One-dimensional argument B is implicitly treated as a column matrix
1195 dimensioned [count, 1], so ycount=1.
1198 if (retarray->base_addr == NULL)
1200 if (GFC_DESCRIPTOR_RANK (a) == 1)
1202 GFC_DIMENSION_SET(retarray->dim[0], 0,
1203 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1205 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1207 GFC_DIMENSION_SET(retarray->dim[0], 0,
1208 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1210 else
1212 GFC_DIMENSION_SET(retarray->dim[0], 0,
1213 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1215 GFC_DIMENSION_SET(retarray->dim[1], 0,
1216 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1217 GFC_DESCRIPTOR_EXTENT(retarray,0));
1220 retarray->base_addr
1221 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1222 retarray->offset = 0;
1224 else if (unlikely (compile_options.bounds_check))
1226 index_type ret_extent, arg_extent;
1228 if (GFC_DESCRIPTOR_RANK (a) == 1)
1230 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1231 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1232 if (arg_extent != ret_extent)
1233 runtime_error ("Incorrect extent in return array in"
1234 " MATMUL intrinsic: is %ld, should be %ld",
1235 (long int) ret_extent, (long int) arg_extent);
1237 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1239 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1240 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1241 if (arg_extent != ret_extent)
1242 runtime_error ("Incorrect extent in return array in"
1243 " MATMUL intrinsic: is %ld, should be %ld",
1244 (long int) ret_extent, (long int) arg_extent);
1246 else
1248 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1249 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1250 if (arg_extent != ret_extent)
1251 runtime_error ("Incorrect extent in return array in"
1252 " MATMUL intrinsic for dimension 1:"
1253 " is %ld, should be %ld",
1254 (long int) ret_extent, (long int) arg_extent);
1256 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1257 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1258 if (arg_extent != ret_extent)
1259 runtime_error ("Incorrect extent in return array in"
1260 " MATMUL intrinsic for dimension 2:"
1261 " is %ld, should be %ld",
1262 (long int) ret_extent, (long int) arg_extent);
1267 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1269 /* One-dimensional result may be addressed in the code below
1270 either as a row or a column matrix. We want both cases to
1271 work. */
1272 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1274 else
1276 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1277 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1281 if (GFC_DESCRIPTOR_RANK (a) == 1)
1283 /* Treat it as a a row matrix A[1,count]. */
1284 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1285 aystride = 1;
1287 xcount = 1;
1288 count = GFC_DESCRIPTOR_EXTENT(a,0);
1290 else
1292 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1293 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1295 count = GFC_DESCRIPTOR_EXTENT(a,1);
1296 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1299 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1301 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1302 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1305 if (GFC_DESCRIPTOR_RANK (b) == 1)
1307 /* Treat it as a column matrix B[count,1] */
1308 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1310 /* bystride should never be used for 1-dimensional b.
1311 in case it is we want it to cause a segfault, rather than
1312 an incorrect result. */
1313 bystride = 0xDEADBEEF;
1314 ycount = 1;
1316 else
1318 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1319 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1320 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1323 abase = a->base_addr;
1324 bbase = b->base_addr;
1325 dest = retarray->base_addr;
1327 /* Now that everything is set up, we perform the multiplication
1328 itself. */
1330 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1331 #define min(a,b) ((a) <= (b) ? (a) : (b))
1332 #define max(a,b) ((a) >= (b) ? (a) : (b))
1334 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1335 && (bxstride == 1 || bystride == 1)
1336 && (((float) xcount) * ((float) ycount) * ((float) count)
1337 > POW3(blas_limit)))
1339 const int m = xcount, n = ycount, k = count, ldc = rystride;
1340 const GFC_REAL_4 one = 1, zero = 0;
1341 const int lda = (axstride == 1) ? aystride : axstride,
1342 ldb = (bxstride == 1) ? bystride : bxstride;
1344 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1346 assert (gemm != NULL);
1347 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1348 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1349 &ldc, 1, 1);
1350 return;
1354 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1356 /* This block of code implements a tuned matmul, derived from
1357 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1359 Bo Kagstrom and Per Ling
1360 Department of Computing Science
1361 Umea University
1362 S-901 87 Umea, Sweden
1364 from netlib.org, translated to C, and modified for matmul.m4. */
1366 const GFC_REAL_4 *a, *b;
1367 GFC_REAL_4 *c;
1368 const index_type m = xcount, n = ycount, k = count;
1370 /* System generated locals */
1371 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1372 i1, i2, i3, i4, i5, i6;
1374 /* Local variables */
1375 GFC_REAL_4 t1[65536], /* was [256][256] */
1376 f11, f12, f21, f22, f31, f32, f41, f42,
1377 f13, f14, f23, f24, f33, f34, f43, f44;
1378 index_type i, j, l, ii, jj, ll;
1379 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1381 a = abase;
1382 b = bbase;
1383 c = retarray->base_addr;
1385 /* Parameter adjustments */
1386 c_dim1 = rystride;
1387 c_offset = 1 + c_dim1;
1388 c -= c_offset;
1389 a_dim1 = aystride;
1390 a_offset = 1 + a_dim1;
1391 a -= a_offset;
1392 b_dim1 = bystride;
1393 b_offset = 1 + b_dim1;
1394 b -= b_offset;
1396 /* Early exit if possible */
1397 if (m == 0 || n == 0 || k == 0)
1398 return;
1400 /* Empty c first. */
1401 for (j=1; j<=n; j++)
1402 for (i=1; i<=m; i++)
1403 c[i + j * c_dim1] = (GFC_REAL_4)0;
1405 /* Start turning the crank. */
1406 i1 = n;
1407 for (jj = 1; jj <= i1; jj += 512)
1409 /* Computing MIN */
1410 i2 = 512;
1411 i3 = n - jj + 1;
1412 jsec = min(i2,i3);
1413 ujsec = jsec - jsec % 4;
1414 i2 = k;
1415 for (ll = 1; ll <= i2; ll += 256)
1417 /* Computing MIN */
1418 i3 = 256;
1419 i4 = k - ll + 1;
1420 lsec = min(i3,i4);
1421 ulsec = lsec - lsec % 2;
1423 i3 = m;
1424 for (ii = 1; ii <= i3; ii += 256)
1426 /* Computing MIN */
1427 i4 = 256;
1428 i5 = m - ii + 1;
1429 isec = min(i4,i5);
1430 uisec = isec - isec % 2;
1431 i4 = ll + ulsec - 1;
1432 for (l = ll; l <= i4; l += 2)
1434 i5 = ii + uisec - 1;
1435 for (i = ii; i <= i5; i += 2)
1437 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1438 a[i + l * a_dim1];
1439 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1440 a[i + (l + 1) * a_dim1];
1441 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1442 a[i + 1 + l * a_dim1];
1443 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1444 a[i + 1 + (l + 1) * a_dim1];
1446 if (uisec < isec)
1448 t1[l - ll + 1 + (isec << 8) - 257] =
1449 a[ii + isec - 1 + l * a_dim1];
1450 t1[l - ll + 2 + (isec << 8) - 257] =
1451 a[ii + isec - 1 + (l + 1) * a_dim1];
1454 if (ulsec < lsec)
1456 i4 = ii + isec - 1;
1457 for (i = ii; i<= i4; ++i)
1459 t1[lsec + ((i - ii + 1) << 8) - 257] =
1460 a[i + (ll + lsec - 1) * a_dim1];
1464 uisec = isec - isec % 4;
1465 i4 = jj + ujsec - 1;
1466 for (j = jj; j <= i4; j += 4)
1468 i5 = ii + uisec - 1;
1469 for (i = ii; i <= i5; i += 4)
1471 f11 = c[i + j * c_dim1];
1472 f21 = c[i + 1 + j * c_dim1];
1473 f12 = c[i + (j + 1) * c_dim1];
1474 f22 = c[i + 1 + (j + 1) * c_dim1];
1475 f13 = c[i + (j + 2) * c_dim1];
1476 f23 = c[i + 1 + (j + 2) * c_dim1];
1477 f14 = c[i + (j + 3) * c_dim1];
1478 f24 = c[i + 1 + (j + 3) * c_dim1];
1479 f31 = c[i + 2 + j * c_dim1];
1480 f41 = c[i + 3 + j * c_dim1];
1481 f32 = c[i + 2 + (j + 1) * c_dim1];
1482 f42 = c[i + 3 + (j + 1) * c_dim1];
1483 f33 = c[i + 2 + (j + 2) * c_dim1];
1484 f43 = c[i + 3 + (j + 2) * c_dim1];
1485 f34 = c[i + 2 + (j + 3) * c_dim1];
1486 f44 = c[i + 3 + (j + 3) * c_dim1];
1487 i6 = ll + lsec - 1;
1488 for (l = ll; l <= i6; ++l)
1490 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1491 * b[l + j * b_dim1];
1492 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1493 * b[l + j * b_dim1];
1494 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1495 * b[l + (j + 1) * b_dim1];
1496 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1497 * b[l + (j + 1) * b_dim1];
1498 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1499 * b[l + (j + 2) * b_dim1];
1500 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1501 * b[l + (j + 2) * b_dim1];
1502 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1503 * b[l + (j + 3) * b_dim1];
1504 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1505 * b[l + (j + 3) * b_dim1];
1506 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1507 * b[l + j * b_dim1];
1508 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1509 * b[l + j * b_dim1];
1510 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1511 * b[l + (j + 1) * b_dim1];
1512 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1513 * b[l + (j + 1) * b_dim1];
1514 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1515 * b[l + (j + 2) * b_dim1];
1516 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1517 * b[l + (j + 2) * b_dim1];
1518 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1519 * b[l + (j + 3) * b_dim1];
1520 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1521 * b[l + (j + 3) * b_dim1];
1523 c[i + j * c_dim1] = f11;
1524 c[i + 1 + j * c_dim1] = f21;
1525 c[i + (j + 1) * c_dim1] = f12;
1526 c[i + 1 + (j + 1) * c_dim1] = f22;
1527 c[i + (j + 2) * c_dim1] = f13;
1528 c[i + 1 + (j + 2) * c_dim1] = f23;
1529 c[i + (j + 3) * c_dim1] = f14;
1530 c[i + 1 + (j + 3) * c_dim1] = f24;
1531 c[i + 2 + j * c_dim1] = f31;
1532 c[i + 3 + j * c_dim1] = f41;
1533 c[i + 2 + (j + 1) * c_dim1] = f32;
1534 c[i + 3 + (j + 1) * c_dim1] = f42;
1535 c[i + 2 + (j + 2) * c_dim1] = f33;
1536 c[i + 3 + (j + 2) * c_dim1] = f43;
1537 c[i + 2 + (j + 3) * c_dim1] = f34;
1538 c[i + 3 + (j + 3) * c_dim1] = f44;
1540 if (uisec < isec)
1542 i5 = ii + isec - 1;
1543 for (i = ii + uisec; i <= i5; ++i)
1545 f11 = c[i + j * c_dim1];
1546 f12 = c[i + (j + 1) * c_dim1];
1547 f13 = c[i + (j + 2) * c_dim1];
1548 f14 = c[i + (j + 3) * c_dim1];
1549 i6 = ll + lsec - 1;
1550 for (l = ll; l <= i6; ++l)
1552 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1553 257] * b[l + j * b_dim1];
1554 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1555 257] * b[l + (j + 1) * b_dim1];
1556 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1557 257] * b[l + (j + 2) * b_dim1];
1558 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1559 257] * b[l + (j + 3) * b_dim1];
1561 c[i + j * c_dim1] = f11;
1562 c[i + (j + 1) * c_dim1] = f12;
1563 c[i + (j + 2) * c_dim1] = f13;
1564 c[i + (j + 3) * c_dim1] = f14;
1568 if (ujsec < jsec)
1570 i4 = jj + jsec - 1;
1571 for (j = jj + ujsec; j <= i4; ++j)
1573 i5 = ii + uisec - 1;
1574 for (i = ii; i <= i5; i += 4)
1576 f11 = c[i + j * c_dim1];
1577 f21 = c[i + 1 + j * c_dim1];
1578 f31 = c[i + 2 + j * c_dim1];
1579 f41 = c[i + 3 + j * c_dim1];
1580 i6 = ll + lsec - 1;
1581 for (l = ll; l <= i6; ++l)
1583 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1584 257] * b[l + j * b_dim1];
1585 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1586 257] * b[l + j * b_dim1];
1587 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1588 257] * b[l + j * b_dim1];
1589 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1590 257] * b[l + j * b_dim1];
1592 c[i + j * c_dim1] = f11;
1593 c[i + 1 + j * c_dim1] = f21;
1594 c[i + 2 + j * c_dim1] = f31;
1595 c[i + 3 + j * c_dim1] = f41;
1597 i5 = ii + isec - 1;
1598 for (i = ii + uisec; i <= i5; ++i)
1600 f11 = c[i + j * c_dim1];
1601 i6 = ll + lsec - 1;
1602 for (l = ll; l <= i6; ++l)
1604 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1605 257] * b[l + j * b_dim1];
1607 c[i + j * c_dim1] = f11;
1614 return;
1616 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1618 if (GFC_DESCRIPTOR_RANK (a) != 1)
1620 const GFC_REAL_4 *restrict abase_x;
1621 const GFC_REAL_4 *restrict bbase_y;
1622 GFC_REAL_4 *restrict dest_y;
1623 GFC_REAL_4 s;
1625 for (y = 0; y < ycount; y++)
1627 bbase_y = &bbase[y*bystride];
1628 dest_y = &dest[y*rystride];
1629 for (x = 0; x < xcount; x++)
1631 abase_x = &abase[x*axstride];
1632 s = (GFC_REAL_4) 0;
1633 for (n = 0; n < count; n++)
1634 s += abase_x[n] * bbase_y[n];
1635 dest_y[x] = s;
1639 else
1641 const GFC_REAL_4 *restrict bbase_y;
1642 GFC_REAL_4 s;
1644 for (y = 0; y < ycount; y++)
1646 bbase_y = &bbase[y*bystride];
1647 s = (GFC_REAL_4) 0;
1648 for (n = 0; n < count; n++)
1649 s += abase[n*axstride] * bbase_y[n];
1650 dest[y*rystride] = s;
1654 else if (axstride < aystride)
1656 for (y = 0; y < ycount; y++)
1657 for (x = 0; x < xcount; x++)
1658 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1660 for (y = 0; y < ycount; y++)
1661 for (n = 0; n < count; n++)
1662 for (x = 0; x < xcount; x++)
1663 /* dest[x,y] += a[x,n] * b[n,y] */
1664 dest[x*rxstride + y*rystride] +=
1665 abase[x*axstride + n*aystride] *
1666 bbase[n*bxstride + y*bystride];
1668 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1670 const GFC_REAL_4 *restrict bbase_y;
1671 GFC_REAL_4 s;
1673 for (y = 0; y < ycount; y++)
1675 bbase_y = &bbase[y*bystride];
1676 s = (GFC_REAL_4) 0;
1677 for (n = 0; n < count; n++)
1678 s += abase[n*axstride] * bbase_y[n*bxstride];
1679 dest[y*rxstride] = s;
1682 else
1684 const GFC_REAL_4 *restrict abase_x;
1685 const GFC_REAL_4 *restrict bbase_y;
1686 GFC_REAL_4 *restrict dest_y;
1687 GFC_REAL_4 s;
1689 for (y = 0; y < ycount; y++)
1691 bbase_y = &bbase[y*bystride];
1692 dest_y = &dest[y*rystride];
1693 for (x = 0; x < xcount; x++)
1695 abase_x = &abase[x*axstride];
1696 s = (GFC_REAL_4) 0;
1697 for (n = 0; n < count; n++)
1698 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1699 dest_y[x*rxstride] = s;
1704 #undef POW3
1705 #undef min
1706 #undef max
1708 #endif /* HAVE_AVX512F */
1710 /* Function to fall back to if there is no special processor-specific version. */
1711 static void
1712 matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
1713 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1714 int blas_limit, blas_call gemm)
1716 const GFC_REAL_4 * restrict abase;
1717 const GFC_REAL_4 * restrict bbase;
1718 GFC_REAL_4 * restrict dest;
1720 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1721 index_type x, y, n, count, xcount, ycount;
1723 assert (GFC_DESCRIPTOR_RANK (a) == 2
1724 || GFC_DESCRIPTOR_RANK (b) == 2);
1726 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1728 Either A or B (but not both) can be rank 1:
1730 o One-dimensional argument A is implicitly treated as a row matrix
1731 dimensioned [1,count], so xcount=1.
1733 o One-dimensional argument B is implicitly treated as a column matrix
1734 dimensioned [count, 1], so ycount=1.
1737 if (retarray->base_addr == NULL)
1739 if (GFC_DESCRIPTOR_RANK (a) == 1)
1741 GFC_DIMENSION_SET(retarray->dim[0], 0,
1742 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1744 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1746 GFC_DIMENSION_SET(retarray->dim[0], 0,
1747 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1749 else
1751 GFC_DIMENSION_SET(retarray->dim[0], 0,
1752 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1754 GFC_DIMENSION_SET(retarray->dim[1], 0,
1755 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1756 GFC_DESCRIPTOR_EXTENT(retarray,0));
1759 retarray->base_addr
1760 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1761 retarray->offset = 0;
1763 else if (unlikely (compile_options.bounds_check))
1765 index_type ret_extent, arg_extent;
1767 if (GFC_DESCRIPTOR_RANK (a) == 1)
1769 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1770 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1771 if (arg_extent != ret_extent)
1772 runtime_error ("Incorrect extent in return array in"
1773 " MATMUL intrinsic: is %ld, should be %ld",
1774 (long int) ret_extent, (long int) arg_extent);
1776 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1778 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1779 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1780 if (arg_extent != ret_extent)
1781 runtime_error ("Incorrect extent in return array in"
1782 " MATMUL intrinsic: is %ld, should be %ld",
1783 (long int) ret_extent, (long int) arg_extent);
1785 else
1787 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1788 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1789 if (arg_extent != ret_extent)
1790 runtime_error ("Incorrect extent in return array in"
1791 " MATMUL intrinsic for dimension 1:"
1792 " is %ld, should be %ld",
1793 (long int) ret_extent, (long int) arg_extent);
1795 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1796 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1797 if (arg_extent != ret_extent)
1798 runtime_error ("Incorrect extent in return array in"
1799 " MATMUL intrinsic for dimension 2:"
1800 " is %ld, should be %ld",
1801 (long int) ret_extent, (long int) arg_extent);
1806 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1808 /* One-dimensional result may be addressed in the code below
1809 either as a row or a column matrix. We want both cases to
1810 work. */
1811 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1813 else
1815 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1816 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1820 if (GFC_DESCRIPTOR_RANK (a) == 1)
1822 /* Treat it as a a row matrix A[1,count]. */
1823 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1824 aystride = 1;
1826 xcount = 1;
1827 count = GFC_DESCRIPTOR_EXTENT(a,0);
1829 else
1831 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1832 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1834 count = GFC_DESCRIPTOR_EXTENT(a,1);
1835 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1838 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1840 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1841 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1844 if (GFC_DESCRIPTOR_RANK (b) == 1)
1846 /* Treat it as a column matrix B[count,1] */
1847 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1849 /* bystride should never be used for 1-dimensional b.
1850 in case it is we want it to cause a segfault, rather than
1851 an incorrect result. */
1852 bystride = 0xDEADBEEF;
1853 ycount = 1;
1855 else
1857 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1858 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1859 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1862 abase = a->base_addr;
1863 bbase = b->base_addr;
1864 dest = retarray->base_addr;
1866 /* Now that everything is set up, we perform the multiplication
1867 itself. */
1869 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1870 #define min(a,b) ((a) <= (b) ? (a) : (b))
1871 #define max(a,b) ((a) >= (b) ? (a) : (b))
1873 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1874 && (bxstride == 1 || bystride == 1)
1875 && (((float) xcount) * ((float) ycount) * ((float) count)
1876 > POW3(blas_limit)))
1878 const int m = xcount, n = ycount, k = count, ldc = rystride;
1879 const GFC_REAL_4 one = 1, zero = 0;
1880 const int lda = (axstride == 1) ? aystride : axstride,
1881 ldb = (bxstride == 1) ? bystride : bxstride;
1883 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1885 assert (gemm != NULL);
1886 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1887 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1888 &ldc, 1, 1);
1889 return;
1893 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1895 /* This block of code implements a tuned matmul, derived from
1896 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1898 Bo Kagstrom and Per Ling
1899 Department of Computing Science
1900 Umea University
1901 S-901 87 Umea, Sweden
1903 from netlib.org, translated to C, and modified for matmul.m4. */
1905 const GFC_REAL_4 *a, *b;
1906 GFC_REAL_4 *c;
1907 const index_type m = xcount, n = ycount, k = count;
1909 /* System generated locals */
1910 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1911 i1, i2, i3, i4, i5, i6;
1913 /* Local variables */
1914 GFC_REAL_4 t1[65536], /* was [256][256] */
1915 f11, f12, f21, f22, f31, f32, f41, f42,
1916 f13, f14, f23, f24, f33, f34, f43, f44;
1917 index_type i, j, l, ii, jj, ll;
1918 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1920 a = abase;
1921 b = bbase;
1922 c = retarray->base_addr;
1924 /* Parameter adjustments */
1925 c_dim1 = rystride;
1926 c_offset = 1 + c_dim1;
1927 c -= c_offset;
1928 a_dim1 = aystride;
1929 a_offset = 1 + a_dim1;
1930 a -= a_offset;
1931 b_dim1 = bystride;
1932 b_offset = 1 + b_dim1;
1933 b -= b_offset;
1935 /* Early exit if possible */
1936 if (m == 0 || n == 0 || k == 0)
1937 return;
1939 /* Empty c first. */
1940 for (j=1; j<=n; j++)
1941 for (i=1; i<=m; i++)
1942 c[i + j * c_dim1] = (GFC_REAL_4)0;
1944 /* Start turning the crank. */
1945 i1 = n;
1946 for (jj = 1; jj <= i1; jj += 512)
1948 /* Computing MIN */
1949 i2 = 512;
1950 i3 = n - jj + 1;
1951 jsec = min(i2,i3);
1952 ujsec = jsec - jsec % 4;
1953 i2 = k;
1954 for (ll = 1; ll <= i2; ll += 256)
1956 /* Computing MIN */
1957 i3 = 256;
1958 i4 = k - ll + 1;
1959 lsec = min(i3,i4);
1960 ulsec = lsec - lsec % 2;
1962 i3 = m;
1963 for (ii = 1; ii <= i3; ii += 256)
1965 /* Computing MIN */
1966 i4 = 256;
1967 i5 = m - ii + 1;
1968 isec = min(i4,i5);
1969 uisec = isec - isec % 2;
1970 i4 = ll + ulsec - 1;
1971 for (l = ll; l <= i4; l += 2)
1973 i5 = ii + uisec - 1;
1974 for (i = ii; i <= i5; i += 2)
1976 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1977 a[i + l * a_dim1];
1978 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1979 a[i + (l + 1) * a_dim1];
1980 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1981 a[i + 1 + l * a_dim1];
1982 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1983 a[i + 1 + (l + 1) * a_dim1];
1985 if (uisec < isec)
1987 t1[l - ll + 1 + (isec << 8) - 257] =
1988 a[ii + isec - 1 + l * a_dim1];
1989 t1[l - ll + 2 + (isec << 8) - 257] =
1990 a[ii + isec - 1 + (l + 1) * a_dim1];
1993 if (ulsec < lsec)
1995 i4 = ii + isec - 1;
1996 for (i = ii; i<= i4; ++i)
1998 t1[lsec + ((i - ii + 1) << 8) - 257] =
1999 a[i + (ll + lsec - 1) * a_dim1];
2003 uisec = isec - isec % 4;
2004 i4 = jj + ujsec - 1;
2005 for (j = jj; j <= i4; j += 4)
2007 i5 = ii + uisec - 1;
2008 for (i = ii; i <= i5; i += 4)
2010 f11 = c[i + j * c_dim1];
2011 f21 = c[i + 1 + j * c_dim1];
2012 f12 = c[i + (j + 1) * c_dim1];
2013 f22 = c[i + 1 + (j + 1) * c_dim1];
2014 f13 = c[i + (j + 2) * c_dim1];
2015 f23 = c[i + 1 + (j + 2) * c_dim1];
2016 f14 = c[i + (j + 3) * c_dim1];
2017 f24 = c[i + 1 + (j + 3) * c_dim1];
2018 f31 = c[i + 2 + j * c_dim1];
2019 f41 = c[i + 3 + j * c_dim1];
2020 f32 = c[i + 2 + (j + 1) * c_dim1];
2021 f42 = c[i + 3 + (j + 1) * c_dim1];
2022 f33 = c[i + 2 + (j + 2) * c_dim1];
2023 f43 = c[i + 3 + (j + 2) * c_dim1];
2024 f34 = c[i + 2 + (j + 3) * c_dim1];
2025 f44 = c[i + 3 + (j + 3) * c_dim1];
2026 i6 = ll + lsec - 1;
2027 for (l = ll; l <= i6; ++l)
2029 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2030 * b[l + j * b_dim1];
2031 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2032 * b[l + j * b_dim1];
2033 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2034 * b[l + (j + 1) * b_dim1];
2035 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2036 * b[l + (j + 1) * b_dim1];
2037 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2038 * b[l + (j + 2) * b_dim1];
2039 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2040 * b[l + (j + 2) * b_dim1];
2041 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2042 * b[l + (j + 3) * b_dim1];
2043 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2044 * b[l + (j + 3) * b_dim1];
2045 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2046 * b[l + j * b_dim1];
2047 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2048 * b[l + j * b_dim1];
2049 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2050 * b[l + (j + 1) * b_dim1];
2051 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2052 * b[l + (j + 1) * b_dim1];
2053 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2054 * b[l + (j + 2) * b_dim1];
2055 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2056 * b[l + (j + 2) * b_dim1];
2057 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2058 * b[l + (j + 3) * b_dim1];
2059 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2060 * b[l + (j + 3) * b_dim1];
2062 c[i + j * c_dim1] = f11;
2063 c[i + 1 + j * c_dim1] = f21;
2064 c[i + (j + 1) * c_dim1] = f12;
2065 c[i + 1 + (j + 1) * c_dim1] = f22;
2066 c[i + (j + 2) * c_dim1] = f13;
2067 c[i + 1 + (j + 2) * c_dim1] = f23;
2068 c[i + (j + 3) * c_dim1] = f14;
2069 c[i + 1 + (j + 3) * c_dim1] = f24;
2070 c[i + 2 + j * c_dim1] = f31;
2071 c[i + 3 + j * c_dim1] = f41;
2072 c[i + 2 + (j + 1) * c_dim1] = f32;
2073 c[i + 3 + (j + 1) * c_dim1] = f42;
2074 c[i + 2 + (j + 2) * c_dim1] = f33;
2075 c[i + 3 + (j + 2) * c_dim1] = f43;
2076 c[i + 2 + (j + 3) * c_dim1] = f34;
2077 c[i + 3 + (j + 3) * c_dim1] = f44;
2079 if (uisec < isec)
2081 i5 = ii + isec - 1;
2082 for (i = ii + uisec; i <= i5; ++i)
2084 f11 = c[i + j * c_dim1];
2085 f12 = c[i + (j + 1) * c_dim1];
2086 f13 = c[i + (j + 2) * c_dim1];
2087 f14 = c[i + (j + 3) * c_dim1];
2088 i6 = ll + lsec - 1;
2089 for (l = ll; l <= i6; ++l)
2091 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2092 257] * b[l + j * b_dim1];
2093 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2094 257] * b[l + (j + 1) * b_dim1];
2095 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2096 257] * b[l + (j + 2) * b_dim1];
2097 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2098 257] * b[l + (j + 3) * b_dim1];
2100 c[i + j * c_dim1] = f11;
2101 c[i + (j + 1) * c_dim1] = f12;
2102 c[i + (j + 2) * c_dim1] = f13;
2103 c[i + (j + 3) * c_dim1] = f14;
2107 if (ujsec < jsec)
2109 i4 = jj + jsec - 1;
2110 for (j = jj + ujsec; j <= i4; ++j)
2112 i5 = ii + uisec - 1;
2113 for (i = ii; i <= i5; i += 4)
2115 f11 = c[i + j * c_dim1];
2116 f21 = c[i + 1 + j * c_dim1];
2117 f31 = c[i + 2 + j * c_dim1];
2118 f41 = c[i + 3 + j * c_dim1];
2119 i6 = ll + lsec - 1;
2120 for (l = ll; l <= i6; ++l)
2122 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2123 257] * b[l + j * b_dim1];
2124 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2125 257] * b[l + j * b_dim1];
2126 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2127 257] * b[l + j * b_dim1];
2128 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2129 257] * b[l + j * b_dim1];
2131 c[i + j * c_dim1] = f11;
2132 c[i + 1 + j * c_dim1] = f21;
2133 c[i + 2 + j * c_dim1] = f31;
2134 c[i + 3 + j * c_dim1] = f41;
2136 i5 = ii + isec - 1;
2137 for (i = ii + uisec; i <= i5; ++i)
2139 f11 = c[i + j * c_dim1];
2140 i6 = ll + lsec - 1;
2141 for (l = ll; l <= i6; ++l)
2143 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2144 257] * b[l + j * b_dim1];
2146 c[i + j * c_dim1] = f11;
2153 return;
2155 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2157 if (GFC_DESCRIPTOR_RANK (a) != 1)
2159 const GFC_REAL_4 *restrict abase_x;
2160 const GFC_REAL_4 *restrict bbase_y;
2161 GFC_REAL_4 *restrict dest_y;
2162 GFC_REAL_4 s;
2164 for (y = 0; y < ycount; y++)
2166 bbase_y = &bbase[y*bystride];
2167 dest_y = &dest[y*rystride];
2168 for (x = 0; x < xcount; x++)
2170 abase_x = &abase[x*axstride];
2171 s = (GFC_REAL_4) 0;
2172 for (n = 0; n < count; n++)
2173 s += abase_x[n] * bbase_y[n];
2174 dest_y[x] = s;
2178 else
2180 const GFC_REAL_4 *restrict bbase_y;
2181 GFC_REAL_4 s;
2183 for (y = 0; y < ycount; y++)
2185 bbase_y = &bbase[y*bystride];
2186 s = (GFC_REAL_4) 0;
2187 for (n = 0; n < count; n++)
2188 s += abase[n*axstride] * bbase_y[n];
2189 dest[y*rystride] = s;
2193 else if (axstride < aystride)
2195 for (y = 0; y < ycount; y++)
2196 for (x = 0; x < xcount; x++)
2197 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2199 for (y = 0; y < ycount; y++)
2200 for (n = 0; n < count; n++)
2201 for (x = 0; x < xcount; x++)
2202 /* dest[x,y] += a[x,n] * b[n,y] */
2203 dest[x*rxstride + y*rystride] +=
2204 abase[x*axstride + n*aystride] *
2205 bbase[n*bxstride + y*bystride];
2207 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2209 const GFC_REAL_4 *restrict bbase_y;
2210 GFC_REAL_4 s;
2212 for (y = 0; y < ycount; y++)
2214 bbase_y = &bbase[y*bystride];
2215 s = (GFC_REAL_4) 0;
2216 for (n = 0; n < count; n++)
2217 s += abase[n*axstride] * bbase_y[n*bxstride];
2218 dest[y*rxstride] = s;
2221 else
2223 const GFC_REAL_4 *restrict abase_x;
2224 const GFC_REAL_4 *restrict bbase_y;
2225 GFC_REAL_4 *restrict dest_y;
2226 GFC_REAL_4 s;
2228 for (y = 0; y < ycount; y++)
2230 bbase_y = &bbase[y*bystride];
2231 dest_y = &dest[y*rystride];
2232 for (x = 0; x < xcount; x++)
2234 abase_x = &abase[x*axstride];
2235 s = (GFC_REAL_4) 0;
2236 for (n = 0; n < count; n++)
2237 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2238 dest_y[x*rxstride] = s;
2243 #undef POW3
2244 #undef min
2245 #undef max
2248 /* Compiling main function, with selection code for the processor. */
2250 /* Currently, this is i386 only. Adjust for other architectures. */
2252 #include <config/i386/cpuinfo.h>
2253 void matmul_r4 (gfc_array_r4 * const restrict retarray,
2254 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2255 int blas_limit, blas_call gemm)
2257 static void (*matmul_p) (gfc_array_r4 * const restrict retarray,
2258 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2259 int blas_limit, blas_call gemm);
2261 void (*matmul_fn) (gfc_array_r4 * const restrict retarray,
2262 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2263 int blas_limit, blas_call gemm);
2265 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2266 if (matmul_fn == NULL)
2268 matmul_fn = matmul_r4_vanilla;
2269 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2271 /* Run down the available processors in order of preference. */
2272 #ifdef HAVE_AVX512F
2273 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2275 matmul_fn = matmul_r4_avx512f;
2276 goto store;
2279 #endif /* HAVE_AVX512F */
2281 #ifdef HAVE_AVX2
2282 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2283 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2285 matmul_fn = matmul_r4_avx2;
2286 goto store;
2289 #endif
2291 #ifdef HAVE_AVX
2292 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2294 matmul_fn = matmul_r4_avx;
2295 goto store;
2297 #endif /* HAVE_AVX */
2299 store:
2300 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2303 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2306 #else /* Just the vanilla function. */
2308 void
2309 matmul_r4 (gfc_array_r4 * const restrict retarray,
2310 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2311 int blas_limit, blas_call gemm)
2313 const GFC_REAL_4 * restrict abase;
2314 const GFC_REAL_4 * restrict bbase;
2315 GFC_REAL_4 * restrict dest;
2317 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2318 index_type x, y, n, count, xcount, ycount;
2320 assert (GFC_DESCRIPTOR_RANK (a) == 2
2321 || GFC_DESCRIPTOR_RANK (b) == 2);
2323 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2325 Either A or B (but not both) can be rank 1:
2327 o One-dimensional argument A is implicitly treated as a row matrix
2328 dimensioned [1,count], so xcount=1.
2330 o One-dimensional argument B is implicitly treated as a column matrix
2331 dimensioned [count, 1], so ycount=1.
2334 if (retarray->base_addr == NULL)
2336 if (GFC_DESCRIPTOR_RANK (a) == 1)
2338 GFC_DIMENSION_SET(retarray->dim[0], 0,
2339 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2341 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2343 GFC_DIMENSION_SET(retarray->dim[0], 0,
2344 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2346 else
2348 GFC_DIMENSION_SET(retarray->dim[0], 0,
2349 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2351 GFC_DIMENSION_SET(retarray->dim[1], 0,
2352 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2353 GFC_DESCRIPTOR_EXTENT(retarray,0));
2356 retarray->base_addr
2357 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
2358 retarray->offset = 0;
2360 else if (unlikely (compile_options.bounds_check))
2362 index_type ret_extent, arg_extent;
2364 if (GFC_DESCRIPTOR_RANK (a) == 1)
2366 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2367 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2368 if (arg_extent != ret_extent)
2369 runtime_error ("Incorrect extent in return array in"
2370 " MATMUL intrinsic: is %ld, should be %ld",
2371 (long int) ret_extent, (long int) arg_extent);
2373 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2375 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2376 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2377 if (arg_extent != ret_extent)
2378 runtime_error ("Incorrect extent in return array in"
2379 " MATMUL intrinsic: is %ld, should be %ld",
2380 (long int) ret_extent, (long int) arg_extent);
2382 else
2384 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2385 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2386 if (arg_extent != ret_extent)
2387 runtime_error ("Incorrect extent in return array in"
2388 " MATMUL intrinsic for dimension 1:"
2389 " is %ld, should be %ld",
2390 (long int) ret_extent, (long int) arg_extent);
2392 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2393 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2394 if (arg_extent != ret_extent)
2395 runtime_error ("Incorrect extent in return array in"
2396 " MATMUL intrinsic for dimension 2:"
2397 " is %ld, should be %ld",
2398 (long int) ret_extent, (long int) arg_extent);
2403 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2405 /* One-dimensional result may be addressed in the code below
2406 either as a row or a column matrix. We want both cases to
2407 work. */
2408 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2410 else
2412 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2413 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2417 if (GFC_DESCRIPTOR_RANK (a) == 1)
2419 /* Treat it as a a row matrix A[1,count]. */
2420 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2421 aystride = 1;
2423 xcount = 1;
2424 count = GFC_DESCRIPTOR_EXTENT(a,0);
2426 else
2428 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2429 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2431 count = GFC_DESCRIPTOR_EXTENT(a,1);
2432 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2435 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2437 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2438 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2441 if (GFC_DESCRIPTOR_RANK (b) == 1)
2443 /* Treat it as a column matrix B[count,1] */
2444 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2446 /* bystride should never be used for 1-dimensional b.
2447 in case it is we want it to cause a segfault, rather than
2448 an incorrect result. */
2449 bystride = 0xDEADBEEF;
2450 ycount = 1;
2452 else
2454 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2455 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2456 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2459 abase = a->base_addr;
2460 bbase = b->base_addr;
2461 dest = retarray->base_addr;
2463 /* Now that everything is set up, we perform the multiplication
2464 itself. */
2466 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2467 #define min(a,b) ((a) <= (b) ? (a) : (b))
2468 #define max(a,b) ((a) >= (b) ? (a) : (b))
2470 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2471 && (bxstride == 1 || bystride == 1)
2472 && (((float) xcount) * ((float) ycount) * ((float) count)
2473 > POW3(blas_limit)))
2475 const int m = xcount, n = ycount, k = count, ldc = rystride;
2476 const GFC_REAL_4 one = 1, zero = 0;
2477 const int lda = (axstride == 1) ? aystride : axstride,
2478 ldb = (bxstride == 1) ? bystride : bxstride;
2480 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2482 assert (gemm != NULL);
2483 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2484 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2485 &ldc, 1, 1);
2486 return;
2490 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2492 /* This block of code implements a tuned matmul, derived from
2493 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2495 Bo Kagstrom and Per Ling
2496 Department of Computing Science
2497 Umea University
2498 S-901 87 Umea, Sweden
2500 from netlib.org, translated to C, and modified for matmul.m4. */
2502 const GFC_REAL_4 *a, *b;
2503 GFC_REAL_4 *c;
2504 const index_type m = xcount, n = ycount, k = count;
2506 /* System generated locals */
2507 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2508 i1, i2, i3, i4, i5, i6;
2510 /* Local variables */
2511 GFC_REAL_4 t1[65536], /* was [256][256] */
2512 f11, f12, f21, f22, f31, f32, f41, f42,
2513 f13, f14, f23, f24, f33, f34, f43, f44;
2514 index_type i, j, l, ii, jj, ll;
2515 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2517 a = abase;
2518 b = bbase;
2519 c = retarray->base_addr;
2521 /* Parameter adjustments */
2522 c_dim1 = rystride;
2523 c_offset = 1 + c_dim1;
2524 c -= c_offset;
2525 a_dim1 = aystride;
2526 a_offset = 1 + a_dim1;
2527 a -= a_offset;
2528 b_dim1 = bystride;
2529 b_offset = 1 + b_dim1;
2530 b -= b_offset;
2532 /* Early exit if possible */
2533 if (m == 0 || n == 0 || k == 0)
2534 return;
2536 /* Empty c first. */
2537 for (j=1; j<=n; j++)
2538 for (i=1; i<=m; i++)
2539 c[i + j * c_dim1] = (GFC_REAL_4)0;
2541 /* Start turning the crank. */
2542 i1 = n;
2543 for (jj = 1; jj <= i1; jj += 512)
2545 /* Computing MIN */
2546 i2 = 512;
2547 i3 = n - jj + 1;
2548 jsec = min(i2,i3);
2549 ujsec = jsec - jsec % 4;
2550 i2 = k;
2551 for (ll = 1; ll <= i2; ll += 256)
2553 /* Computing MIN */
2554 i3 = 256;
2555 i4 = k - ll + 1;
2556 lsec = min(i3,i4);
2557 ulsec = lsec - lsec % 2;
2559 i3 = m;
2560 for (ii = 1; ii <= i3; ii += 256)
2562 /* Computing MIN */
2563 i4 = 256;
2564 i5 = m - ii + 1;
2565 isec = min(i4,i5);
2566 uisec = isec - isec % 2;
2567 i4 = ll + ulsec - 1;
2568 for (l = ll; l <= i4; l += 2)
2570 i5 = ii + uisec - 1;
2571 for (i = ii; i <= i5; i += 2)
2573 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2574 a[i + l * a_dim1];
2575 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2576 a[i + (l + 1) * a_dim1];
2577 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2578 a[i + 1 + l * a_dim1];
2579 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2580 a[i + 1 + (l + 1) * a_dim1];
2582 if (uisec < isec)
2584 t1[l - ll + 1 + (isec << 8) - 257] =
2585 a[ii + isec - 1 + l * a_dim1];
2586 t1[l - ll + 2 + (isec << 8) - 257] =
2587 a[ii + isec - 1 + (l + 1) * a_dim1];
2590 if (ulsec < lsec)
2592 i4 = ii + isec - 1;
2593 for (i = ii; i<= i4; ++i)
2595 t1[lsec + ((i - ii + 1) << 8) - 257] =
2596 a[i + (ll + lsec - 1) * a_dim1];
2600 uisec = isec - isec % 4;
2601 i4 = jj + ujsec - 1;
2602 for (j = jj; j <= i4; j += 4)
2604 i5 = ii + uisec - 1;
2605 for (i = ii; i <= i5; i += 4)
2607 f11 = c[i + j * c_dim1];
2608 f21 = c[i + 1 + j * c_dim1];
2609 f12 = c[i + (j + 1) * c_dim1];
2610 f22 = c[i + 1 + (j + 1) * c_dim1];
2611 f13 = c[i + (j + 2) * c_dim1];
2612 f23 = c[i + 1 + (j + 2) * c_dim1];
2613 f14 = c[i + (j + 3) * c_dim1];
2614 f24 = c[i + 1 + (j + 3) * c_dim1];
2615 f31 = c[i + 2 + j * c_dim1];
2616 f41 = c[i + 3 + j * c_dim1];
2617 f32 = c[i + 2 + (j + 1) * c_dim1];
2618 f42 = c[i + 3 + (j + 1) * c_dim1];
2619 f33 = c[i + 2 + (j + 2) * c_dim1];
2620 f43 = c[i + 3 + (j + 2) * c_dim1];
2621 f34 = c[i + 2 + (j + 3) * c_dim1];
2622 f44 = c[i + 3 + (j + 3) * c_dim1];
2623 i6 = ll + lsec - 1;
2624 for (l = ll; l <= i6; ++l)
2626 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2627 * b[l + j * b_dim1];
2628 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2629 * b[l + j * b_dim1];
2630 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2631 * b[l + (j + 1) * b_dim1];
2632 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2633 * b[l + (j + 1) * b_dim1];
2634 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2635 * b[l + (j + 2) * b_dim1];
2636 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2637 * b[l + (j + 2) * b_dim1];
2638 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2639 * b[l + (j + 3) * b_dim1];
2640 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2641 * b[l + (j + 3) * b_dim1];
2642 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2643 * b[l + j * b_dim1];
2644 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2645 * b[l + j * b_dim1];
2646 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2647 * b[l + (j + 1) * b_dim1];
2648 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2649 * b[l + (j + 1) * b_dim1];
2650 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2651 * b[l + (j + 2) * b_dim1];
2652 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2653 * b[l + (j + 2) * b_dim1];
2654 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2655 * b[l + (j + 3) * b_dim1];
2656 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2657 * b[l + (j + 3) * b_dim1];
2659 c[i + j * c_dim1] = f11;
2660 c[i + 1 + j * c_dim1] = f21;
2661 c[i + (j + 1) * c_dim1] = f12;
2662 c[i + 1 + (j + 1) * c_dim1] = f22;
2663 c[i + (j + 2) * c_dim1] = f13;
2664 c[i + 1 + (j + 2) * c_dim1] = f23;
2665 c[i + (j + 3) * c_dim1] = f14;
2666 c[i + 1 + (j + 3) * c_dim1] = f24;
2667 c[i + 2 + j * c_dim1] = f31;
2668 c[i + 3 + j * c_dim1] = f41;
2669 c[i + 2 + (j + 1) * c_dim1] = f32;
2670 c[i + 3 + (j + 1) * c_dim1] = f42;
2671 c[i + 2 + (j + 2) * c_dim1] = f33;
2672 c[i + 3 + (j + 2) * c_dim1] = f43;
2673 c[i + 2 + (j + 3) * c_dim1] = f34;
2674 c[i + 3 + (j + 3) * c_dim1] = f44;
2676 if (uisec < isec)
2678 i5 = ii + isec - 1;
2679 for (i = ii + uisec; i <= i5; ++i)
2681 f11 = c[i + j * c_dim1];
2682 f12 = c[i + (j + 1) * c_dim1];
2683 f13 = c[i + (j + 2) * c_dim1];
2684 f14 = c[i + (j + 3) * c_dim1];
2685 i6 = ll + lsec - 1;
2686 for (l = ll; l <= i6; ++l)
2688 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2689 257] * b[l + j * b_dim1];
2690 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2691 257] * b[l + (j + 1) * b_dim1];
2692 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2693 257] * b[l + (j + 2) * b_dim1];
2694 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2695 257] * b[l + (j + 3) * b_dim1];
2697 c[i + j * c_dim1] = f11;
2698 c[i + (j + 1) * c_dim1] = f12;
2699 c[i + (j + 2) * c_dim1] = f13;
2700 c[i + (j + 3) * c_dim1] = f14;
2704 if (ujsec < jsec)
2706 i4 = jj + jsec - 1;
2707 for (j = jj + ujsec; j <= i4; ++j)
2709 i5 = ii + uisec - 1;
2710 for (i = ii; i <= i5; i += 4)
2712 f11 = c[i + j * c_dim1];
2713 f21 = c[i + 1 + j * c_dim1];
2714 f31 = c[i + 2 + j * c_dim1];
2715 f41 = c[i + 3 + j * c_dim1];
2716 i6 = ll + lsec - 1;
2717 for (l = ll; l <= i6; ++l)
2719 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2720 257] * b[l + j * b_dim1];
2721 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2722 257] * b[l + j * b_dim1];
2723 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2724 257] * b[l + j * b_dim1];
2725 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2726 257] * b[l + j * b_dim1];
2728 c[i + j * c_dim1] = f11;
2729 c[i + 1 + j * c_dim1] = f21;
2730 c[i + 2 + j * c_dim1] = f31;
2731 c[i + 3 + j * c_dim1] = f41;
2733 i5 = ii + isec - 1;
2734 for (i = ii + uisec; i <= i5; ++i)
2736 f11 = c[i + j * c_dim1];
2737 i6 = ll + lsec - 1;
2738 for (l = ll; l <= i6; ++l)
2740 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2741 257] * b[l + j * b_dim1];
2743 c[i + j * c_dim1] = f11;
2750 return;
2752 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2754 if (GFC_DESCRIPTOR_RANK (a) != 1)
2756 const GFC_REAL_4 *restrict abase_x;
2757 const GFC_REAL_4 *restrict bbase_y;
2758 GFC_REAL_4 *restrict dest_y;
2759 GFC_REAL_4 s;
2761 for (y = 0; y < ycount; y++)
2763 bbase_y = &bbase[y*bystride];
2764 dest_y = &dest[y*rystride];
2765 for (x = 0; x < xcount; x++)
2767 abase_x = &abase[x*axstride];
2768 s = (GFC_REAL_4) 0;
2769 for (n = 0; n < count; n++)
2770 s += abase_x[n] * bbase_y[n];
2771 dest_y[x] = s;
2775 else
2777 const GFC_REAL_4 *restrict bbase_y;
2778 GFC_REAL_4 s;
2780 for (y = 0; y < ycount; y++)
2782 bbase_y = &bbase[y*bystride];
2783 s = (GFC_REAL_4) 0;
2784 for (n = 0; n < count; n++)
2785 s += abase[n*axstride] * bbase_y[n];
2786 dest[y*rystride] = s;
2790 else if (axstride < aystride)
2792 for (y = 0; y < ycount; y++)
2793 for (x = 0; x < xcount; x++)
2794 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2796 for (y = 0; y < ycount; y++)
2797 for (n = 0; n < count; n++)
2798 for (x = 0; x < xcount; x++)
2799 /* dest[x,y] += a[x,n] * b[n,y] */
2800 dest[x*rxstride + y*rystride] +=
2801 abase[x*axstride + n*aystride] *
2802 bbase[n*bxstride + y*bystride];
2804 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2806 const GFC_REAL_4 *restrict bbase_y;
2807 GFC_REAL_4 s;
2809 for (y = 0; y < ycount; y++)
2811 bbase_y = &bbase[y*bystride];
2812 s = (GFC_REAL_4) 0;
2813 for (n = 0; n < count; n++)
2814 s += abase[n*axstride] * bbase_y[n*bxstride];
2815 dest[y*rxstride] = s;
2818 else
2820 const GFC_REAL_4 *restrict abase_x;
2821 const GFC_REAL_4 *restrict bbase_y;
2822 GFC_REAL_4 *restrict dest_y;
2823 GFC_REAL_4 s;
2825 for (y = 0; y < ycount; y++)
2827 bbase_y = &bbase[y*bystride];
2828 dest_y = &dest[y*rystride];
2829 for (x = 0; x < xcount; x++)
2831 abase_x = &abase[x*axstride];
2832 s = (GFC_REAL_4) 0;
2833 for (n = 0; n < count; n++)
2834 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2835 dest_y[x*rxstride] = s;
2840 #undef POW3
2841 #undef min
2842 #undef max
2844 #endif
2845 #endif