Improve max_insns_skipped logic
[official-gcc.git] / libgfortran / generated / matmul_r4.c
blob5d13e6713d6e810d865eb59c4951a2e6a5e59b11
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
5 This file is part of the GNU Fortran runtime library (libgfortran).
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
31 #if defined (HAVE_GFC_REAL_4)
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
35 matrices. */
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
39 const int *, const GFC_REAL_4 *, const int *,
40 const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
41 int, int);
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
49 The equivalent Fortran pseudo-code is:
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
65 ENDIF
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
72 extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
73 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75 export_proto(matmul_r4);
77 /* Put exhaustive list of possible architectures here here, ORed together. */
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
81 #ifdef HAVE_AVX
82 static void
83 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
84 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86 static void
87 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
88 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
89 int blas_limit, blas_call gemm)
91 const GFC_REAL_4 * restrict abase;
92 const GFC_REAL_4 * restrict bbase;
93 GFC_REAL_4 * restrict dest;
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96 index_type x, y, n, count, xcount, ycount;
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
99 || GFC_DESCRIPTOR_RANK (b) == 2);
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
103 Either A or B (but not both) can be rank 1:
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
112 if (retarray->base_addr == NULL)
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
124 else
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
134 retarray->base_addr
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
136 retarray->offset = 0;
138 else if (unlikely (compile_options.bounds_check))
140 index_type ret_extent, arg_extent;
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 if (arg_extent != ret_extent)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent, (long int) arg_extent);
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 if (arg_extent != ret_extent)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent, (long int) arg_extent);
160 else
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 if (arg_extent != ret_extent)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent, (long int) arg_extent);
170 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
171 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
172 if (arg_extent != ret_extent)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent, (long int) arg_extent);
181 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
185 work. */
186 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
188 else
190 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
191 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
195 if (GFC_DESCRIPTOR_RANK (a) == 1)
197 /* Treat it as a a row matrix A[1,count]. */
198 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
199 aystride = 1;
201 xcount = 1;
202 count = GFC_DESCRIPTOR_EXTENT(a,0);
204 else
206 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
207 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
209 count = GFC_DESCRIPTOR_EXTENT(a,1);
210 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
213 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
215 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
221 /* Treat it as a column matrix B[count,1] */
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
224 /* bystride should never be used for 1-dimensional b.
225 The value is only used for calculation of the
226 memory by the buffer. */
227 bystride = 256;
228 ycount = 1;
230 else
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
237 abase = a->base_addr;
238 bbase = b->base_addr;
239 dest = retarray->base_addr;
241 /* Now that everything is set up, we perform the multiplication
242 itself. */
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249 && (bxstride == 1 || bystride == 1)
250 && (((float) xcount) * ((float) ycount) * ((float) count)
251 > POW3(blas_limit)))
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
254 const GFC_REAL_4 one = 1, zero = 0;
255 const int lda = (axstride == 1) ? aystride : axstride,
256 ldb = (bxstride == 1) ? bystride : bxstride;
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
260 assert (gemm != NULL);
261 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
262 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
263 &ldc, 1, 1);
264 return;
268 if (rxstride == 1 && axstride == 1 && bxstride == 1)
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
275 Umea University
276 S-901 87 Umea, Sweden
278 from netlib.org, translated to C, and modified for matmul.m4. */
280 const GFC_REAL_4 *a, *b;
281 GFC_REAL_4 *c;
282 const index_type m = xcount, n = ycount, k = count;
284 /* System generated locals */
285 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
286 i1, i2, i3, i4, i5, i6;
288 /* Local variables */
289 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
290 f13, f14, f23, f24, f33, f34, f43, f44;
291 index_type i, j, l, ii, jj, ll;
292 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
293 GFC_REAL_4 *t1;
295 a = abase;
296 b = bbase;
297 c = retarray->base_addr;
299 /* Parameter adjustments */
300 c_dim1 = rystride;
301 c_offset = 1 + c_dim1;
302 c -= c_offset;
303 a_dim1 = aystride;
304 a_offset = 1 + a_dim1;
305 a -= a_offset;
306 b_dim1 = bystride;
307 b_offset = 1 + b_dim1;
308 b -= b_offset;
310 /* Empty c first. */
311 for (j=1; j<=n; j++)
312 for (i=1; i<=m; i++)
313 c[i + j * c_dim1] = (GFC_REAL_4)0;
315 /* Early exit if possible */
316 if (m == 0 || n == 0 || k == 0)
317 return;
319 /* Adjust size of t1 to what is needed. */
320 index_type t1_dim;
321 t1_dim = (a_dim1-1) * 256 + b_dim1;
322 if (t1_dim > 65536)
323 t1_dim = 65536;
325 t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
327 /* Start turning the crank. */
328 i1 = n;
329 for (jj = 1; jj <= i1; jj += 512)
331 /* Computing MIN */
332 i2 = 512;
333 i3 = n - jj + 1;
334 jsec = min(i2,i3);
335 ujsec = jsec - jsec % 4;
336 i2 = k;
337 for (ll = 1; ll <= i2; ll += 256)
339 /* Computing MIN */
340 i3 = 256;
341 i4 = k - ll + 1;
342 lsec = min(i3,i4);
343 ulsec = lsec - lsec % 2;
345 i3 = m;
346 for (ii = 1; ii <= i3; ii += 256)
348 /* Computing MIN */
349 i4 = 256;
350 i5 = m - ii + 1;
351 isec = min(i4,i5);
352 uisec = isec - isec % 2;
353 i4 = ll + ulsec - 1;
354 for (l = ll; l <= i4; l += 2)
356 i5 = ii + uisec - 1;
357 for (i = ii; i <= i5; i += 2)
359 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
360 a[i + l * a_dim1];
361 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
362 a[i + (l + 1) * a_dim1];
363 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
364 a[i + 1 + l * a_dim1];
365 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
366 a[i + 1 + (l + 1) * a_dim1];
368 if (uisec < isec)
370 t1[l - ll + 1 + (isec << 8) - 257] =
371 a[ii + isec - 1 + l * a_dim1];
372 t1[l - ll + 2 + (isec << 8) - 257] =
373 a[ii + isec - 1 + (l + 1) * a_dim1];
376 if (ulsec < lsec)
378 i4 = ii + isec - 1;
379 for (i = ii; i<= i4; ++i)
381 t1[lsec + ((i - ii + 1) << 8) - 257] =
382 a[i + (ll + lsec - 1) * a_dim1];
386 uisec = isec - isec % 4;
387 i4 = jj + ujsec - 1;
388 for (j = jj; j <= i4; j += 4)
390 i5 = ii + uisec - 1;
391 for (i = ii; i <= i5; i += 4)
393 f11 = c[i + j * c_dim1];
394 f21 = c[i + 1 + j * c_dim1];
395 f12 = c[i + (j + 1) * c_dim1];
396 f22 = c[i + 1 + (j + 1) * c_dim1];
397 f13 = c[i + (j + 2) * c_dim1];
398 f23 = c[i + 1 + (j + 2) * c_dim1];
399 f14 = c[i + (j + 3) * c_dim1];
400 f24 = c[i + 1 + (j + 3) * c_dim1];
401 f31 = c[i + 2 + j * c_dim1];
402 f41 = c[i + 3 + j * c_dim1];
403 f32 = c[i + 2 + (j + 1) * c_dim1];
404 f42 = c[i + 3 + (j + 1) * c_dim1];
405 f33 = c[i + 2 + (j + 2) * c_dim1];
406 f43 = c[i + 3 + (j + 2) * c_dim1];
407 f34 = c[i + 2 + (j + 3) * c_dim1];
408 f44 = c[i + 3 + (j + 3) * c_dim1];
409 i6 = ll + lsec - 1;
410 for (l = ll; l <= i6; ++l)
412 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
413 * b[l + j * b_dim1];
414 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
415 * b[l + j * b_dim1];
416 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
417 * b[l + (j + 1) * b_dim1];
418 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
419 * b[l + (j + 1) * b_dim1];
420 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
421 * b[l + (j + 2) * b_dim1];
422 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
423 * b[l + (j + 2) * b_dim1];
424 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
425 * b[l + (j + 3) * b_dim1];
426 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
427 * b[l + (j + 3) * b_dim1];
428 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
429 * b[l + j * b_dim1];
430 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
431 * b[l + j * b_dim1];
432 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
433 * b[l + (j + 1) * b_dim1];
434 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
435 * b[l + (j + 1) * b_dim1];
436 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
437 * b[l + (j + 2) * b_dim1];
438 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
439 * b[l + (j + 2) * b_dim1];
440 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
441 * b[l + (j + 3) * b_dim1];
442 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
443 * b[l + (j + 3) * b_dim1];
445 c[i + j * c_dim1] = f11;
446 c[i + 1 + j * c_dim1] = f21;
447 c[i + (j + 1) * c_dim1] = f12;
448 c[i + 1 + (j + 1) * c_dim1] = f22;
449 c[i + (j + 2) * c_dim1] = f13;
450 c[i + 1 + (j + 2) * c_dim1] = f23;
451 c[i + (j + 3) * c_dim1] = f14;
452 c[i + 1 + (j + 3) * c_dim1] = f24;
453 c[i + 2 + j * c_dim1] = f31;
454 c[i + 3 + j * c_dim1] = f41;
455 c[i + 2 + (j + 1) * c_dim1] = f32;
456 c[i + 3 + (j + 1) * c_dim1] = f42;
457 c[i + 2 + (j + 2) * c_dim1] = f33;
458 c[i + 3 + (j + 2) * c_dim1] = f43;
459 c[i + 2 + (j + 3) * c_dim1] = f34;
460 c[i + 3 + (j + 3) * c_dim1] = f44;
462 if (uisec < isec)
464 i5 = ii + isec - 1;
465 for (i = ii + uisec; i <= i5; ++i)
467 f11 = c[i + j * c_dim1];
468 f12 = c[i + (j + 1) * c_dim1];
469 f13 = c[i + (j + 2) * c_dim1];
470 f14 = c[i + (j + 3) * c_dim1];
471 i6 = ll + lsec - 1;
472 for (l = ll; l <= i6; ++l)
474 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
475 257] * b[l + j * b_dim1];
476 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
477 257] * b[l + (j + 1) * b_dim1];
478 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
479 257] * b[l + (j + 2) * b_dim1];
480 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
481 257] * b[l + (j + 3) * b_dim1];
483 c[i + j * c_dim1] = f11;
484 c[i + (j + 1) * c_dim1] = f12;
485 c[i + (j + 2) * c_dim1] = f13;
486 c[i + (j + 3) * c_dim1] = f14;
490 if (ujsec < jsec)
492 i4 = jj + jsec - 1;
493 for (j = jj + ujsec; j <= i4; ++j)
495 i5 = ii + uisec - 1;
496 for (i = ii; i <= i5; i += 4)
498 f11 = c[i + j * c_dim1];
499 f21 = c[i + 1 + j * c_dim1];
500 f31 = c[i + 2 + j * c_dim1];
501 f41 = c[i + 3 + j * c_dim1];
502 i6 = ll + lsec - 1;
503 for (l = ll; l <= i6; ++l)
505 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
506 257] * b[l + j * b_dim1];
507 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
508 257] * b[l + j * b_dim1];
509 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
510 257] * b[l + j * b_dim1];
511 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
512 257] * b[l + j * b_dim1];
514 c[i + j * c_dim1] = f11;
515 c[i + 1 + j * c_dim1] = f21;
516 c[i + 2 + j * c_dim1] = f31;
517 c[i + 3 + j * c_dim1] = f41;
519 i5 = ii + isec - 1;
520 for (i = ii + uisec; i <= i5; ++i)
522 f11 = c[i + j * c_dim1];
523 i6 = ll + lsec - 1;
524 for (l = ll; l <= i6; ++l)
526 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
527 257] * b[l + j * b_dim1];
529 c[i + j * c_dim1] = f11;
536 free(t1);
537 return;
539 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
541 if (GFC_DESCRIPTOR_RANK (a) != 1)
543 const GFC_REAL_4 *restrict abase_x;
544 const GFC_REAL_4 *restrict bbase_y;
545 GFC_REAL_4 *restrict dest_y;
546 GFC_REAL_4 s;
548 for (y = 0; y < ycount; y++)
550 bbase_y = &bbase[y*bystride];
551 dest_y = &dest[y*rystride];
552 for (x = 0; x < xcount; x++)
554 abase_x = &abase[x*axstride];
555 s = (GFC_REAL_4) 0;
556 for (n = 0; n < count; n++)
557 s += abase_x[n] * bbase_y[n];
558 dest_y[x] = s;
562 else
564 const GFC_REAL_4 *restrict bbase_y;
565 GFC_REAL_4 s;
567 for (y = 0; y < ycount; y++)
569 bbase_y = &bbase[y*bystride];
570 s = (GFC_REAL_4) 0;
571 for (n = 0; n < count; n++)
572 s += abase[n*axstride] * bbase_y[n];
573 dest[y*rystride] = s;
577 else if (axstride < aystride)
579 for (y = 0; y < ycount; y++)
580 for (x = 0; x < xcount; x++)
581 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
583 for (y = 0; y < ycount; y++)
584 for (n = 0; n < count; n++)
585 for (x = 0; x < xcount; x++)
586 /* dest[x,y] += a[x,n] * b[n,y] */
587 dest[x*rxstride + y*rystride] +=
588 abase[x*axstride + n*aystride] *
589 bbase[n*bxstride + y*bystride];
591 else if (GFC_DESCRIPTOR_RANK (a) == 1)
593 const GFC_REAL_4 *restrict bbase_y;
594 GFC_REAL_4 s;
596 for (y = 0; y < ycount; y++)
598 bbase_y = &bbase[y*bystride];
599 s = (GFC_REAL_4) 0;
600 for (n = 0; n < count; n++)
601 s += abase[n*axstride] * bbase_y[n*bxstride];
602 dest[y*rxstride] = s;
605 else
607 const GFC_REAL_4 *restrict abase_x;
608 const GFC_REAL_4 *restrict bbase_y;
609 GFC_REAL_4 *restrict dest_y;
610 GFC_REAL_4 s;
612 for (y = 0; y < ycount; y++)
614 bbase_y = &bbase[y*bystride];
615 dest_y = &dest[y*rystride];
616 for (x = 0; x < xcount; x++)
618 abase_x = &abase[x*axstride];
619 s = (GFC_REAL_4) 0;
620 for (n = 0; n < count; n++)
621 s += abase_x[n*aystride] * bbase_y[n*bxstride];
622 dest_y[x*rxstride] = s;
627 #undef POW3
628 #undef min
629 #undef max
631 #endif /* HAVE_AVX */
633 #ifdef HAVE_AVX2
634 static void
635 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
636 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
637 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
638 static void
639 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
640 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
641 int blas_limit, blas_call gemm)
643 const GFC_REAL_4 * restrict abase;
644 const GFC_REAL_4 * restrict bbase;
645 GFC_REAL_4 * restrict dest;
647 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
648 index_type x, y, n, count, xcount, ycount;
650 assert (GFC_DESCRIPTOR_RANK (a) == 2
651 || GFC_DESCRIPTOR_RANK (b) == 2);
653 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
655 Either A or B (but not both) can be rank 1:
657 o One-dimensional argument A is implicitly treated as a row matrix
658 dimensioned [1,count], so xcount=1.
660 o One-dimensional argument B is implicitly treated as a column matrix
661 dimensioned [count, 1], so ycount=1.
664 if (retarray->base_addr == NULL)
666 if (GFC_DESCRIPTOR_RANK (a) == 1)
668 GFC_DIMENSION_SET(retarray->dim[0], 0,
669 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
671 else if (GFC_DESCRIPTOR_RANK (b) == 1)
673 GFC_DIMENSION_SET(retarray->dim[0], 0,
674 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
676 else
678 GFC_DIMENSION_SET(retarray->dim[0], 0,
679 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
681 GFC_DIMENSION_SET(retarray->dim[1], 0,
682 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
683 GFC_DESCRIPTOR_EXTENT(retarray,0));
686 retarray->base_addr
687 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
688 retarray->offset = 0;
690 else if (unlikely (compile_options.bounds_check))
692 index_type ret_extent, arg_extent;
694 if (GFC_DESCRIPTOR_RANK (a) == 1)
696 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
697 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
698 if (arg_extent != ret_extent)
699 runtime_error ("Incorrect extent in return array in"
700 " MATMUL intrinsic: is %ld, should be %ld",
701 (long int) ret_extent, (long int) arg_extent);
703 else if (GFC_DESCRIPTOR_RANK (b) == 1)
705 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
706 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
707 if (arg_extent != ret_extent)
708 runtime_error ("Incorrect extent in return array in"
709 " MATMUL intrinsic: is %ld, should be %ld",
710 (long int) ret_extent, (long int) arg_extent);
712 else
714 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
715 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
716 if (arg_extent != ret_extent)
717 runtime_error ("Incorrect extent in return array in"
718 " MATMUL intrinsic for dimension 1:"
719 " is %ld, should be %ld",
720 (long int) ret_extent, (long int) arg_extent);
722 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
723 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
724 if (arg_extent != ret_extent)
725 runtime_error ("Incorrect extent in return array in"
726 " MATMUL intrinsic for dimension 2:"
727 " is %ld, should be %ld",
728 (long int) ret_extent, (long int) arg_extent);
733 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
735 /* One-dimensional result may be addressed in the code below
736 either as a row or a column matrix. We want both cases to
737 work. */
738 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
740 else
742 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
743 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
747 if (GFC_DESCRIPTOR_RANK (a) == 1)
749 /* Treat it as a a row matrix A[1,count]. */
750 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
751 aystride = 1;
753 xcount = 1;
754 count = GFC_DESCRIPTOR_EXTENT(a,0);
756 else
758 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
759 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
761 count = GFC_DESCRIPTOR_EXTENT(a,1);
762 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
765 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
767 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
768 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
771 if (GFC_DESCRIPTOR_RANK (b) == 1)
773 /* Treat it as a column matrix B[count,1] */
774 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
776 /* bystride should never be used for 1-dimensional b.
777 The value is only used for calculation of the
778 memory by the buffer. */
779 bystride = 256;
780 ycount = 1;
782 else
784 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
785 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
786 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
789 abase = a->base_addr;
790 bbase = b->base_addr;
791 dest = retarray->base_addr;
793 /* Now that everything is set up, we perform the multiplication
794 itself. */
796 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
797 #define min(a,b) ((a) <= (b) ? (a) : (b))
798 #define max(a,b) ((a) >= (b) ? (a) : (b))
800 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
801 && (bxstride == 1 || bystride == 1)
802 && (((float) xcount) * ((float) ycount) * ((float) count)
803 > POW3(blas_limit)))
805 const int m = xcount, n = ycount, k = count, ldc = rystride;
806 const GFC_REAL_4 one = 1, zero = 0;
807 const int lda = (axstride == 1) ? aystride : axstride,
808 ldb = (bxstride == 1) ? bystride : bxstride;
810 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
812 assert (gemm != NULL);
813 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
814 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
815 &ldc, 1, 1);
816 return;
820 if (rxstride == 1 && axstride == 1 && bxstride == 1)
822 /* This block of code implements a tuned matmul, derived from
823 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
825 Bo Kagstrom and Per Ling
826 Department of Computing Science
827 Umea University
828 S-901 87 Umea, Sweden
830 from netlib.org, translated to C, and modified for matmul.m4. */
832 const GFC_REAL_4 *a, *b;
833 GFC_REAL_4 *c;
834 const index_type m = xcount, n = ycount, k = count;
836 /* System generated locals */
837 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
838 i1, i2, i3, i4, i5, i6;
840 /* Local variables */
841 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
842 f13, f14, f23, f24, f33, f34, f43, f44;
843 index_type i, j, l, ii, jj, ll;
844 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
845 GFC_REAL_4 *t1;
847 a = abase;
848 b = bbase;
849 c = retarray->base_addr;
851 /* Parameter adjustments */
852 c_dim1 = rystride;
853 c_offset = 1 + c_dim1;
854 c -= c_offset;
855 a_dim1 = aystride;
856 a_offset = 1 + a_dim1;
857 a -= a_offset;
858 b_dim1 = bystride;
859 b_offset = 1 + b_dim1;
860 b -= b_offset;
862 /* Empty c first. */
863 for (j=1; j<=n; j++)
864 for (i=1; i<=m; i++)
865 c[i + j * c_dim1] = (GFC_REAL_4)0;
867 /* Early exit if possible */
868 if (m == 0 || n == 0 || k == 0)
869 return;
871 /* Adjust size of t1 to what is needed. */
872 index_type t1_dim;
873 t1_dim = (a_dim1-1) * 256 + b_dim1;
874 if (t1_dim > 65536)
875 t1_dim = 65536;
877 t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
879 /* Start turning the crank. */
880 i1 = n;
881 for (jj = 1; jj <= i1; jj += 512)
883 /* Computing MIN */
884 i2 = 512;
885 i3 = n - jj + 1;
886 jsec = min(i2,i3);
887 ujsec = jsec - jsec % 4;
888 i2 = k;
889 for (ll = 1; ll <= i2; ll += 256)
891 /* Computing MIN */
892 i3 = 256;
893 i4 = k - ll + 1;
894 lsec = min(i3,i4);
895 ulsec = lsec - lsec % 2;
897 i3 = m;
898 for (ii = 1; ii <= i3; ii += 256)
900 /* Computing MIN */
901 i4 = 256;
902 i5 = m - ii + 1;
903 isec = min(i4,i5);
904 uisec = isec - isec % 2;
905 i4 = ll + ulsec - 1;
906 for (l = ll; l <= i4; l += 2)
908 i5 = ii + uisec - 1;
909 for (i = ii; i <= i5; i += 2)
911 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
912 a[i + l * a_dim1];
913 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
914 a[i + (l + 1) * a_dim1];
915 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
916 a[i + 1 + l * a_dim1];
917 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
918 a[i + 1 + (l + 1) * a_dim1];
920 if (uisec < isec)
922 t1[l - ll + 1 + (isec << 8) - 257] =
923 a[ii + isec - 1 + l * a_dim1];
924 t1[l - ll + 2 + (isec << 8) - 257] =
925 a[ii + isec - 1 + (l + 1) * a_dim1];
928 if (ulsec < lsec)
930 i4 = ii + isec - 1;
931 for (i = ii; i<= i4; ++i)
933 t1[lsec + ((i - ii + 1) << 8) - 257] =
934 a[i + (ll + lsec - 1) * a_dim1];
938 uisec = isec - isec % 4;
939 i4 = jj + ujsec - 1;
940 for (j = jj; j <= i4; j += 4)
942 i5 = ii + uisec - 1;
943 for (i = ii; i <= i5; i += 4)
945 f11 = c[i + j * c_dim1];
946 f21 = c[i + 1 + j * c_dim1];
947 f12 = c[i + (j + 1) * c_dim1];
948 f22 = c[i + 1 + (j + 1) * c_dim1];
949 f13 = c[i + (j + 2) * c_dim1];
950 f23 = c[i + 1 + (j + 2) * c_dim1];
951 f14 = c[i + (j + 3) * c_dim1];
952 f24 = c[i + 1 + (j + 3) * c_dim1];
953 f31 = c[i + 2 + j * c_dim1];
954 f41 = c[i + 3 + j * c_dim1];
955 f32 = c[i + 2 + (j + 1) * c_dim1];
956 f42 = c[i + 3 + (j + 1) * c_dim1];
957 f33 = c[i + 2 + (j + 2) * c_dim1];
958 f43 = c[i + 3 + (j + 2) * c_dim1];
959 f34 = c[i + 2 + (j + 3) * c_dim1];
960 f44 = c[i + 3 + (j + 3) * c_dim1];
961 i6 = ll + lsec - 1;
962 for (l = ll; l <= i6; ++l)
964 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
965 * b[l + j * b_dim1];
966 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
967 * b[l + j * b_dim1];
968 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
969 * b[l + (j + 1) * b_dim1];
970 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
971 * b[l + (j + 1) * b_dim1];
972 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
973 * b[l + (j + 2) * b_dim1];
974 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
975 * b[l + (j + 2) * b_dim1];
976 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
977 * b[l + (j + 3) * b_dim1];
978 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
979 * b[l + (j + 3) * b_dim1];
980 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
981 * b[l + j * b_dim1];
982 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
983 * b[l + j * b_dim1];
984 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
985 * b[l + (j + 1) * b_dim1];
986 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
987 * b[l + (j + 1) * b_dim1];
988 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
989 * b[l + (j + 2) * b_dim1];
990 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
991 * b[l + (j + 2) * b_dim1];
992 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
993 * b[l + (j + 3) * b_dim1];
994 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
995 * b[l + (j + 3) * b_dim1];
997 c[i + j * c_dim1] = f11;
998 c[i + 1 + j * c_dim1] = f21;
999 c[i + (j + 1) * c_dim1] = f12;
1000 c[i + 1 + (j + 1) * c_dim1] = f22;
1001 c[i + (j + 2) * c_dim1] = f13;
1002 c[i + 1 + (j + 2) * c_dim1] = f23;
1003 c[i + (j + 3) * c_dim1] = f14;
1004 c[i + 1 + (j + 3) * c_dim1] = f24;
1005 c[i + 2 + j * c_dim1] = f31;
1006 c[i + 3 + j * c_dim1] = f41;
1007 c[i + 2 + (j + 1) * c_dim1] = f32;
1008 c[i + 3 + (j + 1) * c_dim1] = f42;
1009 c[i + 2 + (j + 2) * c_dim1] = f33;
1010 c[i + 3 + (j + 2) * c_dim1] = f43;
1011 c[i + 2 + (j + 3) * c_dim1] = f34;
1012 c[i + 3 + (j + 3) * c_dim1] = f44;
1014 if (uisec < isec)
1016 i5 = ii + isec - 1;
1017 for (i = ii + uisec; i <= i5; ++i)
1019 f11 = c[i + j * c_dim1];
1020 f12 = c[i + (j + 1) * c_dim1];
1021 f13 = c[i + (j + 2) * c_dim1];
1022 f14 = c[i + (j + 3) * c_dim1];
1023 i6 = ll + lsec - 1;
1024 for (l = ll; l <= i6; ++l)
1026 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027 257] * b[l + j * b_dim1];
1028 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029 257] * b[l + (j + 1) * b_dim1];
1030 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031 257] * b[l + (j + 2) * b_dim1];
1032 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1033 257] * b[l + (j + 3) * b_dim1];
1035 c[i + j * c_dim1] = f11;
1036 c[i + (j + 1) * c_dim1] = f12;
1037 c[i + (j + 2) * c_dim1] = f13;
1038 c[i + (j + 3) * c_dim1] = f14;
1042 if (ujsec < jsec)
1044 i4 = jj + jsec - 1;
1045 for (j = jj + ujsec; j <= i4; ++j)
1047 i5 = ii + uisec - 1;
1048 for (i = ii; i <= i5; i += 4)
1050 f11 = c[i + j * c_dim1];
1051 f21 = c[i + 1 + j * c_dim1];
1052 f31 = c[i + 2 + j * c_dim1];
1053 f41 = c[i + 3 + j * c_dim1];
1054 i6 = ll + lsec - 1;
1055 for (l = ll; l <= i6; ++l)
1057 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1058 257] * b[l + j * b_dim1];
1059 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1060 257] * b[l + j * b_dim1];
1061 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1062 257] * b[l + j * b_dim1];
1063 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1064 257] * b[l + j * b_dim1];
1066 c[i + j * c_dim1] = f11;
1067 c[i + 1 + j * c_dim1] = f21;
1068 c[i + 2 + j * c_dim1] = f31;
1069 c[i + 3 + j * c_dim1] = f41;
1071 i5 = ii + isec - 1;
1072 for (i = ii + uisec; i <= i5; ++i)
1074 f11 = c[i + j * c_dim1];
1075 i6 = ll + lsec - 1;
1076 for (l = ll; l <= i6; ++l)
1078 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1079 257] * b[l + j * b_dim1];
1081 c[i + j * c_dim1] = f11;
1088 free(t1);
1089 return;
1091 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1093 if (GFC_DESCRIPTOR_RANK (a) != 1)
1095 const GFC_REAL_4 *restrict abase_x;
1096 const GFC_REAL_4 *restrict bbase_y;
1097 GFC_REAL_4 *restrict dest_y;
1098 GFC_REAL_4 s;
1100 for (y = 0; y < ycount; y++)
1102 bbase_y = &bbase[y*bystride];
1103 dest_y = &dest[y*rystride];
1104 for (x = 0; x < xcount; x++)
1106 abase_x = &abase[x*axstride];
1107 s = (GFC_REAL_4) 0;
1108 for (n = 0; n < count; n++)
1109 s += abase_x[n] * bbase_y[n];
1110 dest_y[x] = s;
1114 else
1116 const GFC_REAL_4 *restrict bbase_y;
1117 GFC_REAL_4 s;
1119 for (y = 0; y < ycount; y++)
1121 bbase_y = &bbase[y*bystride];
1122 s = (GFC_REAL_4) 0;
1123 for (n = 0; n < count; n++)
1124 s += abase[n*axstride] * bbase_y[n];
1125 dest[y*rystride] = s;
1129 else if (axstride < aystride)
1131 for (y = 0; y < ycount; y++)
1132 for (x = 0; x < xcount; x++)
1133 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1135 for (y = 0; y < ycount; y++)
1136 for (n = 0; n < count; n++)
1137 for (x = 0; x < xcount; x++)
1138 /* dest[x,y] += a[x,n] * b[n,y] */
1139 dest[x*rxstride + y*rystride] +=
1140 abase[x*axstride + n*aystride] *
1141 bbase[n*bxstride + y*bystride];
1143 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1145 const GFC_REAL_4 *restrict bbase_y;
1146 GFC_REAL_4 s;
1148 for (y = 0; y < ycount; y++)
1150 bbase_y = &bbase[y*bystride];
1151 s = (GFC_REAL_4) 0;
1152 for (n = 0; n < count; n++)
1153 s += abase[n*axstride] * bbase_y[n*bxstride];
1154 dest[y*rxstride] = s;
1157 else
1159 const GFC_REAL_4 *restrict abase_x;
1160 const GFC_REAL_4 *restrict bbase_y;
1161 GFC_REAL_4 *restrict dest_y;
1162 GFC_REAL_4 s;
1164 for (y = 0; y < ycount; y++)
1166 bbase_y = &bbase[y*bystride];
1167 dest_y = &dest[y*rystride];
1168 for (x = 0; x < xcount; x++)
1170 abase_x = &abase[x*axstride];
1171 s = (GFC_REAL_4) 0;
1172 for (n = 0; n < count; n++)
1173 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1174 dest_y[x*rxstride] = s;
1179 #undef POW3
1180 #undef min
1181 #undef max
1183 #endif /* HAVE_AVX2 */
1185 #ifdef HAVE_AVX512F
1186 static void
1187 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1188 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1189 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1190 static void
1191 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1192 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1193 int blas_limit, blas_call gemm)
1195 const GFC_REAL_4 * restrict abase;
1196 const GFC_REAL_4 * restrict bbase;
1197 GFC_REAL_4 * restrict dest;
1199 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1200 index_type x, y, n, count, xcount, ycount;
1202 assert (GFC_DESCRIPTOR_RANK (a) == 2
1203 || GFC_DESCRIPTOR_RANK (b) == 2);
1205 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1207 Either A or B (but not both) can be rank 1:
1209 o One-dimensional argument A is implicitly treated as a row matrix
1210 dimensioned [1,count], so xcount=1.
1212 o One-dimensional argument B is implicitly treated as a column matrix
1213 dimensioned [count, 1], so ycount=1.
1216 if (retarray->base_addr == NULL)
1218 if (GFC_DESCRIPTOR_RANK (a) == 1)
1220 GFC_DIMENSION_SET(retarray->dim[0], 0,
1221 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1223 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1225 GFC_DIMENSION_SET(retarray->dim[0], 0,
1226 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1228 else
1230 GFC_DIMENSION_SET(retarray->dim[0], 0,
1231 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1233 GFC_DIMENSION_SET(retarray->dim[1], 0,
1234 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1235 GFC_DESCRIPTOR_EXTENT(retarray,0));
1238 retarray->base_addr
1239 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1240 retarray->offset = 0;
1242 else if (unlikely (compile_options.bounds_check))
1244 index_type ret_extent, arg_extent;
1246 if (GFC_DESCRIPTOR_RANK (a) == 1)
1248 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1249 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1250 if (arg_extent != ret_extent)
1251 runtime_error ("Incorrect extent in return array in"
1252 " MATMUL intrinsic: is %ld, should be %ld",
1253 (long int) ret_extent, (long int) arg_extent);
1255 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1257 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1258 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1259 if (arg_extent != ret_extent)
1260 runtime_error ("Incorrect extent in return array in"
1261 " MATMUL intrinsic: is %ld, should be %ld",
1262 (long int) ret_extent, (long int) arg_extent);
1264 else
1266 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1267 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1268 if (arg_extent != ret_extent)
1269 runtime_error ("Incorrect extent in return array in"
1270 " MATMUL intrinsic for dimension 1:"
1271 " is %ld, should be %ld",
1272 (long int) ret_extent, (long int) arg_extent);
1274 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1275 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1276 if (arg_extent != ret_extent)
1277 runtime_error ("Incorrect extent in return array in"
1278 " MATMUL intrinsic for dimension 2:"
1279 " is %ld, should be %ld",
1280 (long int) ret_extent, (long int) arg_extent);
1285 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1287 /* One-dimensional result may be addressed in the code below
1288 either as a row or a column matrix. We want both cases to
1289 work. */
1290 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1292 else
1294 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1295 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1299 if (GFC_DESCRIPTOR_RANK (a) == 1)
1301 /* Treat it as a a row matrix A[1,count]. */
1302 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1303 aystride = 1;
1305 xcount = 1;
1306 count = GFC_DESCRIPTOR_EXTENT(a,0);
1308 else
1310 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1311 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1313 count = GFC_DESCRIPTOR_EXTENT(a,1);
1314 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1317 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1319 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1320 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1323 if (GFC_DESCRIPTOR_RANK (b) == 1)
1325 /* Treat it as a column matrix B[count,1] */
1326 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1328 /* bystride should never be used for 1-dimensional b.
1329 The value is only used for calculation of the
1330 memory by the buffer. */
1331 bystride = 256;
1332 ycount = 1;
1334 else
1336 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1337 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1338 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1341 abase = a->base_addr;
1342 bbase = b->base_addr;
1343 dest = retarray->base_addr;
1345 /* Now that everything is set up, we perform the multiplication
1346 itself. */
1348 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1349 #define min(a,b) ((a) <= (b) ? (a) : (b))
1350 #define max(a,b) ((a) >= (b) ? (a) : (b))
1352 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1353 && (bxstride == 1 || bystride == 1)
1354 && (((float) xcount) * ((float) ycount) * ((float) count)
1355 > POW3(blas_limit)))
1357 const int m = xcount, n = ycount, k = count, ldc = rystride;
1358 const GFC_REAL_4 one = 1, zero = 0;
1359 const int lda = (axstride == 1) ? aystride : axstride,
1360 ldb = (bxstride == 1) ? bystride : bxstride;
1362 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1364 assert (gemm != NULL);
1365 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1366 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1367 &ldc, 1, 1);
1368 return;
1372 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1374 /* This block of code implements a tuned matmul, derived from
1375 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1377 Bo Kagstrom and Per Ling
1378 Department of Computing Science
1379 Umea University
1380 S-901 87 Umea, Sweden
1382 from netlib.org, translated to C, and modified for matmul.m4. */
1384 const GFC_REAL_4 *a, *b;
1385 GFC_REAL_4 *c;
1386 const index_type m = xcount, n = ycount, k = count;
1388 /* System generated locals */
1389 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1390 i1, i2, i3, i4, i5, i6;
1392 /* Local variables */
1393 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
1394 f13, f14, f23, f24, f33, f34, f43, f44;
1395 index_type i, j, l, ii, jj, ll;
1396 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1397 GFC_REAL_4 *t1;
1399 a = abase;
1400 b = bbase;
1401 c = retarray->base_addr;
1403 /* Parameter adjustments */
1404 c_dim1 = rystride;
1405 c_offset = 1 + c_dim1;
1406 c -= c_offset;
1407 a_dim1 = aystride;
1408 a_offset = 1 + a_dim1;
1409 a -= a_offset;
1410 b_dim1 = bystride;
1411 b_offset = 1 + b_dim1;
1412 b -= b_offset;
1414 /* Empty c first. */
1415 for (j=1; j<=n; j++)
1416 for (i=1; i<=m; i++)
1417 c[i + j * c_dim1] = (GFC_REAL_4)0;
1419 /* Early exit if possible */
1420 if (m == 0 || n == 0 || k == 0)
1421 return;
1423 /* Adjust size of t1 to what is needed. */
1424 index_type t1_dim;
1425 t1_dim = (a_dim1-1) * 256 + b_dim1;
1426 if (t1_dim > 65536)
1427 t1_dim = 65536;
1429 t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
1431 /* Start turning the crank. */
1432 i1 = n;
1433 for (jj = 1; jj <= i1; jj += 512)
1435 /* Computing MIN */
1436 i2 = 512;
1437 i3 = n - jj + 1;
1438 jsec = min(i2,i3);
1439 ujsec = jsec - jsec % 4;
1440 i2 = k;
1441 for (ll = 1; ll <= i2; ll += 256)
1443 /* Computing MIN */
1444 i3 = 256;
1445 i4 = k - ll + 1;
1446 lsec = min(i3,i4);
1447 ulsec = lsec - lsec % 2;
1449 i3 = m;
1450 for (ii = 1; ii <= i3; ii += 256)
1452 /* Computing MIN */
1453 i4 = 256;
1454 i5 = m - ii + 1;
1455 isec = min(i4,i5);
1456 uisec = isec - isec % 2;
1457 i4 = ll + ulsec - 1;
1458 for (l = ll; l <= i4; l += 2)
1460 i5 = ii + uisec - 1;
1461 for (i = ii; i <= i5; i += 2)
1463 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1464 a[i + l * a_dim1];
1465 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1466 a[i + (l + 1) * a_dim1];
1467 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1468 a[i + 1 + l * a_dim1];
1469 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1470 a[i + 1 + (l + 1) * a_dim1];
1472 if (uisec < isec)
1474 t1[l - ll + 1 + (isec << 8) - 257] =
1475 a[ii + isec - 1 + l * a_dim1];
1476 t1[l - ll + 2 + (isec << 8) - 257] =
1477 a[ii + isec - 1 + (l + 1) * a_dim1];
1480 if (ulsec < lsec)
1482 i4 = ii + isec - 1;
1483 for (i = ii; i<= i4; ++i)
1485 t1[lsec + ((i - ii + 1) << 8) - 257] =
1486 a[i + (ll + lsec - 1) * a_dim1];
1490 uisec = isec - isec % 4;
1491 i4 = jj + ujsec - 1;
1492 for (j = jj; j <= i4; j += 4)
1494 i5 = ii + uisec - 1;
1495 for (i = ii; i <= i5; i += 4)
1497 f11 = c[i + j * c_dim1];
1498 f21 = c[i + 1 + j * c_dim1];
1499 f12 = c[i + (j + 1) * c_dim1];
1500 f22 = c[i + 1 + (j + 1) * c_dim1];
1501 f13 = c[i + (j + 2) * c_dim1];
1502 f23 = c[i + 1 + (j + 2) * c_dim1];
1503 f14 = c[i + (j + 3) * c_dim1];
1504 f24 = c[i + 1 + (j + 3) * c_dim1];
1505 f31 = c[i + 2 + j * c_dim1];
1506 f41 = c[i + 3 + j * c_dim1];
1507 f32 = c[i + 2 + (j + 1) * c_dim1];
1508 f42 = c[i + 3 + (j + 1) * c_dim1];
1509 f33 = c[i + 2 + (j + 2) * c_dim1];
1510 f43 = c[i + 3 + (j + 2) * c_dim1];
1511 f34 = c[i + 2 + (j + 3) * c_dim1];
1512 f44 = c[i + 3 + (j + 3) * c_dim1];
1513 i6 = ll + lsec - 1;
1514 for (l = ll; l <= i6; ++l)
1516 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1517 * b[l + j * b_dim1];
1518 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1519 * b[l + j * b_dim1];
1520 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1521 * b[l + (j + 1) * b_dim1];
1522 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1523 * b[l + (j + 1) * b_dim1];
1524 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1525 * b[l + (j + 2) * b_dim1];
1526 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1527 * b[l + (j + 2) * b_dim1];
1528 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1529 * b[l + (j + 3) * b_dim1];
1530 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1531 * b[l + (j + 3) * b_dim1];
1532 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1533 * b[l + j * b_dim1];
1534 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1535 * b[l + j * b_dim1];
1536 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1537 * b[l + (j + 1) * b_dim1];
1538 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1539 * b[l + (j + 1) * b_dim1];
1540 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1541 * b[l + (j + 2) * b_dim1];
1542 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1543 * b[l + (j + 2) * b_dim1];
1544 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1545 * b[l + (j + 3) * b_dim1];
1546 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1547 * b[l + (j + 3) * b_dim1];
1549 c[i + j * c_dim1] = f11;
1550 c[i + 1 + j * c_dim1] = f21;
1551 c[i + (j + 1) * c_dim1] = f12;
1552 c[i + 1 + (j + 1) * c_dim1] = f22;
1553 c[i + (j + 2) * c_dim1] = f13;
1554 c[i + 1 + (j + 2) * c_dim1] = f23;
1555 c[i + (j + 3) * c_dim1] = f14;
1556 c[i + 1 + (j + 3) * c_dim1] = f24;
1557 c[i + 2 + j * c_dim1] = f31;
1558 c[i + 3 + j * c_dim1] = f41;
1559 c[i + 2 + (j + 1) * c_dim1] = f32;
1560 c[i + 3 + (j + 1) * c_dim1] = f42;
1561 c[i + 2 + (j + 2) * c_dim1] = f33;
1562 c[i + 3 + (j + 2) * c_dim1] = f43;
1563 c[i + 2 + (j + 3) * c_dim1] = f34;
1564 c[i + 3 + (j + 3) * c_dim1] = f44;
1566 if (uisec < isec)
1568 i5 = ii + isec - 1;
1569 for (i = ii + uisec; i <= i5; ++i)
1571 f11 = c[i + j * c_dim1];
1572 f12 = c[i + (j + 1) * c_dim1];
1573 f13 = c[i + (j + 2) * c_dim1];
1574 f14 = c[i + (j + 3) * c_dim1];
1575 i6 = ll + lsec - 1;
1576 for (l = ll; l <= i6; ++l)
1578 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1579 257] * b[l + j * b_dim1];
1580 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1581 257] * b[l + (j + 1) * b_dim1];
1582 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1583 257] * b[l + (j + 2) * b_dim1];
1584 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1585 257] * b[l + (j + 3) * b_dim1];
1587 c[i + j * c_dim1] = f11;
1588 c[i + (j + 1) * c_dim1] = f12;
1589 c[i + (j + 2) * c_dim1] = f13;
1590 c[i + (j + 3) * c_dim1] = f14;
1594 if (ujsec < jsec)
1596 i4 = jj + jsec - 1;
1597 for (j = jj + ujsec; j <= i4; ++j)
1599 i5 = ii + uisec - 1;
1600 for (i = ii; i <= i5; i += 4)
1602 f11 = c[i + j * c_dim1];
1603 f21 = c[i + 1 + j * c_dim1];
1604 f31 = c[i + 2 + j * c_dim1];
1605 f41 = c[i + 3 + j * c_dim1];
1606 i6 = ll + lsec - 1;
1607 for (l = ll; l <= i6; ++l)
1609 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1610 257] * b[l + j * b_dim1];
1611 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1612 257] * b[l + j * b_dim1];
1613 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1614 257] * b[l + j * b_dim1];
1615 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1616 257] * b[l + j * b_dim1];
1618 c[i + j * c_dim1] = f11;
1619 c[i + 1 + j * c_dim1] = f21;
1620 c[i + 2 + j * c_dim1] = f31;
1621 c[i + 3 + j * c_dim1] = f41;
1623 i5 = ii + isec - 1;
1624 for (i = ii + uisec; i <= i5; ++i)
1626 f11 = c[i + j * c_dim1];
1627 i6 = ll + lsec - 1;
1628 for (l = ll; l <= i6; ++l)
1630 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1631 257] * b[l + j * b_dim1];
1633 c[i + j * c_dim1] = f11;
1640 free(t1);
1641 return;
1643 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1645 if (GFC_DESCRIPTOR_RANK (a) != 1)
1647 const GFC_REAL_4 *restrict abase_x;
1648 const GFC_REAL_4 *restrict bbase_y;
1649 GFC_REAL_4 *restrict dest_y;
1650 GFC_REAL_4 s;
1652 for (y = 0; y < ycount; y++)
1654 bbase_y = &bbase[y*bystride];
1655 dest_y = &dest[y*rystride];
1656 for (x = 0; x < xcount; x++)
1658 abase_x = &abase[x*axstride];
1659 s = (GFC_REAL_4) 0;
1660 for (n = 0; n < count; n++)
1661 s += abase_x[n] * bbase_y[n];
1662 dest_y[x] = s;
1666 else
1668 const GFC_REAL_4 *restrict bbase_y;
1669 GFC_REAL_4 s;
1671 for (y = 0; y < ycount; y++)
1673 bbase_y = &bbase[y*bystride];
1674 s = (GFC_REAL_4) 0;
1675 for (n = 0; n < count; n++)
1676 s += abase[n*axstride] * bbase_y[n];
1677 dest[y*rystride] = s;
1681 else if (axstride < aystride)
1683 for (y = 0; y < ycount; y++)
1684 for (x = 0; x < xcount; x++)
1685 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1687 for (y = 0; y < ycount; y++)
1688 for (n = 0; n < count; n++)
1689 for (x = 0; x < xcount; x++)
1690 /* dest[x,y] += a[x,n] * b[n,y] */
1691 dest[x*rxstride + y*rystride] +=
1692 abase[x*axstride + n*aystride] *
1693 bbase[n*bxstride + y*bystride];
1695 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1697 const GFC_REAL_4 *restrict bbase_y;
1698 GFC_REAL_4 s;
1700 for (y = 0; y < ycount; y++)
1702 bbase_y = &bbase[y*bystride];
1703 s = (GFC_REAL_4) 0;
1704 for (n = 0; n < count; n++)
1705 s += abase[n*axstride] * bbase_y[n*bxstride];
1706 dest[y*rxstride] = s;
1709 else
1711 const GFC_REAL_4 *restrict abase_x;
1712 const GFC_REAL_4 *restrict bbase_y;
1713 GFC_REAL_4 *restrict dest_y;
1714 GFC_REAL_4 s;
1716 for (y = 0; y < ycount; y++)
1718 bbase_y = &bbase[y*bystride];
1719 dest_y = &dest[y*rystride];
1720 for (x = 0; x < xcount; x++)
1722 abase_x = &abase[x*axstride];
1723 s = (GFC_REAL_4) 0;
1724 for (n = 0; n < count; n++)
1725 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1726 dest_y[x*rxstride] = s;
1731 #undef POW3
1732 #undef min
1733 #undef max
1735 #endif /* HAVE_AVX512F */
1737 /* AMD-specifix funtions with AVX128 and FMA3/FMA4. */
1739 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1740 void
1741 matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
1742 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1743 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1744 internal_proto(matmul_r4_avx128_fma3);
1745 #endif
1747 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1748 void
1749 matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
1750 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1751 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1752 internal_proto(matmul_r4_avx128_fma4);
1753 #endif
1755 /* Function to fall back to if there is no special processor-specific version. */
1756 static void
1757 matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
1758 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1759 int blas_limit, blas_call gemm)
1761 const GFC_REAL_4 * restrict abase;
1762 const GFC_REAL_4 * restrict bbase;
1763 GFC_REAL_4 * restrict dest;
1765 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1766 index_type x, y, n, count, xcount, ycount;
1768 assert (GFC_DESCRIPTOR_RANK (a) == 2
1769 || GFC_DESCRIPTOR_RANK (b) == 2);
1771 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1773 Either A or B (but not both) can be rank 1:
1775 o One-dimensional argument A is implicitly treated as a row matrix
1776 dimensioned [1,count], so xcount=1.
1778 o One-dimensional argument B is implicitly treated as a column matrix
1779 dimensioned [count, 1], so ycount=1.
1782 if (retarray->base_addr == NULL)
1784 if (GFC_DESCRIPTOR_RANK (a) == 1)
1786 GFC_DIMENSION_SET(retarray->dim[0], 0,
1787 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1789 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1791 GFC_DIMENSION_SET(retarray->dim[0], 0,
1792 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1794 else
1796 GFC_DIMENSION_SET(retarray->dim[0], 0,
1797 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1799 GFC_DIMENSION_SET(retarray->dim[1], 0,
1800 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1801 GFC_DESCRIPTOR_EXTENT(retarray,0));
1804 retarray->base_addr
1805 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1806 retarray->offset = 0;
1808 else if (unlikely (compile_options.bounds_check))
1810 index_type ret_extent, arg_extent;
1812 if (GFC_DESCRIPTOR_RANK (a) == 1)
1814 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1815 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1816 if (arg_extent != ret_extent)
1817 runtime_error ("Incorrect extent in return array in"
1818 " MATMUL intrinsic: is %ld, should be %ld",
1819 (long int) ret_extent, (long int) arg_extent);
1821 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1823 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1824 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1825 if (arg_extent != ret_extent)
1826 runtime_error ("Incorrect extent in return array in"
1827 " MATMUL intrinsic: is %ld, should be %ld",
1828 (long int) ret_extent, (long int) arg_extent);
1830 else
1832 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1833 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1834 if (arg_extent != ret_extent)
1835 runtime_error ("Incorrect extent in return array in"
1836 " MATMUL intrinsic for dimension 1:"
1837 " is %ld, should be %ld",
1838 (long int) ret_extent, (long int) arg_extent);
1840 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1841 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1842 if (arg_extent != ret_extent)
1843 runtime_error ("Incorrect extent in return array in"
1844 " MATMUL intrinsic for dimension 2:"
1845 " is %ld, should be %ld",
1846 (long int) ret_extent, (long int) arg_extent);
1851 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1853 /* One-dimensional result may be addressed in the code below
1854 either as a row or a column matrix. We want both cases to
1855 work. */
1856 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1858 else
1860 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1861 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1865 if (GFC_DESCRIPTOR_RANK (a) == 1)
1867 /* Treat it as a a row matrix A[1,count]. */
1868 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1869 aystride = 1;
1871 xcount = 1;
1872 count = GFC_DESCRIPTOR_EXTENT(a,0);
1874 else
1876 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1877 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1879 count = GFC_DESCRIPTOR_EXTENT(a,1);
1880 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1883 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1885 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1886 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1889 if (GFC_DESCRIPTOR_RANK (b) == 1)
1891 /* Treat it as a column matrix B[count,1] */
1892 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1894 /* bystride should never be used for 1-dimensional b.
1895 The value is only used for calculation of the
1896 memory by the buffer. */
1897 bystride = 256;
1898 ycount = 1;
1900 else
1902 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1903 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1904 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1907 abase = a->base_addr;
1908 bbase = b->base_addr;
1909 dest = retarray->base_addr;
1911 /* Now that everything is set up, we perform the multiplication
1912 itself. */
1914 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1915 #define min(a,b) ((a) <= (b) ? (a) : (b))
1916 #define max(a,b) ((a) >= (b) ? (a) : (b))
1918 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1919 && (bxstride == 1 || bystride == 1)
1920 && (((float) xcount) * ((float) ycount) * ((float) count)
1921 > POW3(blas_limit)))
1923 const int m = xcount, n = ycount, k = count, ldc = rystride;
1924 const GFC_REAL_4 one = 1, zero = 0;
1925 const int lda = (axstride == 1) ? aystride : axstride,
1926 ldb = (bxstride == 1) ? bystride : bxstride;
1928 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1930 assert (gemm != NULL);
1931 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1932 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1933 &ldc, 1, 1);
1934 return;
1938 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1940 /* This block of code implements a tuned matmul, derived from
1941 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1943 Bo Kagstrom and Per Ling
1944 Department of Computing Science
1945 Umea University
1946 S-901 87 Umea, Sweden
1948 from netlib.org, translated to C, and modified for matmul.m4. */
1950 const GFC_REAL_4 *a, *b;
1951 GFC_REAL_4 *c;
1952 const index_type m = xcount, n = ycount, k = count;
1954 /* System generated locals */
1955 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1956 i1, i2, i3, i4, i5, i6;
1958 /* Local variables */
1959 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
1960 f13, f14, f23, f24, f33, f34, f43, f44;
1961 index_type i, j, l, ii, jj, ll;
1962 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1963 GFC_REAL_4 *t1;
1965 a = abase;
1966 b = bbase;
1967 c = retarray->base_addr;
1969 /* Parameter adjustments */
1970 c_dim1 = rystride;
1971 c_offset = 1 + c_dim1;
1972 c -= c_offset;
1973 a_dim1 = aystride;
1974 a_offset = 1 + a_dim1;
1975 a -= a_offset;
1976 b_dim1 = bystride;
1977 b_offset = 1 + b_dim1;
1978 b -= b_offset;
1980 /* Empty c first. */
1981 for (j=1; j<=n; j++)
1982 for (i=1; i<=m; i++)
1983 c[i + j * c_dim1] = (GFC_REAL_4)0;
1985 /* Early exit if possible */
1986 if (m == 0 || n == 0 || k == 0)
1987 return;
1989 /* Adjust size of t1 to what is needed. */
1990 index_type t1_dim;
1991 t1_dim = (a_dim1-1) * 256 + b_dim1;
1992 if (t1_dim > 65536)
1993 t1_dim = 65536;
1995 t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
1997 /* Start turning the crank. */
1998 i1 = n;
1999 for (jj = 1; jj <= i1; jj += 512)
2001 /* Computing MIN */
2002 i2 = 512;
2003 i3 = n - jj + 1;
2004 jsec = min(i2,i3);
2005 ujsec = jsec - jsec % 4;
2006 i2 = k;
2007 for (ll = 1; ll <= i2; ll += 256)
2009 /* Computing MIN */
2010 i3 = 256;
2011 i4 = k - ll + 1;
2012 lsec = min(i3,i4);
2013 ulsec = lsec - lsec % 2;
2015 i3 = m;
2016 for (ii = 1; ii <= i3; ii += 256)
2018 /* Computing MIN */
2019 i4 = 256;
2020 i5 = m - ii + 1;
2021 isec = min(i4,i5);
2022 uisec = isec - isec % 2;
2023 i4 = ll + ulsec - 1;
2024 for (l = ll; l <= i4; l += 2)
2026 i5 = ii + uisec - 1;
2027 for (i = ii; i <= i5; i += 2)
2029 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2030 a[i + l * a_dim1];
2031 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2032 a[i + (l + 1) * a_dim1];
2033 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2034 a[i + 1 + l * a_dim1];
2035 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2036 a[i + 1 + (l + 1) * a_dim1];
2038 if (uisec < isec)
2040 t1[l - ll + 1 + (isec << 8) - 257] =
2041 a[ii + isec - 1 + l * a_dim1];
2042 t1[l - ll + 2 + (isec << 8) - 257] =
2043 a[ii + isec - 1 + (l + 1) * a_dim1];
2046 if (ulsec < lsec)
2048 i4 = ii + isec - 1;
2049 for (i = ii; i<= i4; ++i)
2051 t1[lsec + ((i - ii + 1) << 8) - 257] =
2052 a[i + (ll + lsec - 1) * a_dim1];
2056 uisec = isec - isec % 4;
2057 i4 = jj + ujsec - 1;
2058 for (j = jj; j <= i4; j += 4)
2060 i5 = ii + uisec - 1;
2061 for (i = ii; i <= i5; i += 4)
2063 f11 = c[i + j * c_dim1];
2064 f21 = c[i + 1 + j * c_dim1];
2065 f12 = c[i + (j + 1) * c_dim1];
2066 f22 = c[i + 1 + (j + 1) * c_dim1];
2067 f13 = c[i + (j + 2) * c_dim1];
2068 f23 = c[i + 1 + (j + 2) * c_dim1];
2069 f14 = c[i + (j + 3) * c_dim1];
2070 f24 = c[i + 1 + (j + 3) * c_dim1];
2071 f31 = c[i + 2 + j * c_dim1];
2072 f41 = c[i + 3 + j * c_dim1];
2073 f32 = c[i + 2 + (j + 1) * c_dim1];
2074 f42 = c[i + 3 + (j + 1) * c_dim1];
2075 f33 = c[i + 2 + (j + 2) * c_dim1];
2076 f43 = c[i + 3 + (j + 2) * c_dim1];
2077 f34 = c[i + 2 + (j + 3) * c_dim1];
2078 f44 = c[i + 3 + (j + 3) * c_dim1];
2079 i6 = ll + lsec - 1;
2080 for (l = ll; l <= i6; ++l)
2082 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2083 * b[l + j * b_dim1];
2084 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2085 * b[l + j * b_dim1];
2086 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2087 * b[l + (j + 1) * b_dim1];
2088 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2089 * b[l + (j + 1) * b_dim1];
2090 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2091 * b[l + (j + 2) * b_dim1];
2092 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2093 * b[l + (j + 2) * b_dim1];
2094 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2095 * b[l + (j + 3) * b_dim1];
2096 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2097 * b[l + (j + 3) * b_dim1];
2098 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2099 * b[l + j * b_dim1];
2100 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2101 * b[l + j * b_dim1];
2102 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2103 * b[l + (j + 1) * b_dim1];
2104 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2105 * b[l + (j + 1) * b_dim1];
2106 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2107 * b[l + (j + 2) * b_dim1];
2108 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2109 * b[l + (j + 2) * b_dim1];
2110 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2111 * b[l + (j + 3) * b_dim1];
2112 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2113 * b[l + (j + 3) * b_dim1];
2115 c[i + j * c_dim1] = f11;
2116 c[i + 1 + j * c_dim1] = f21;
2117 c[i + (j + 1) * c_dim1] = f12;
2118 c[i + 1 + (j + 1) * c_dim1] = f22;
2119 c[i + (j + 2) * c_dim1] = f13;
2120 c[i + 1 + (j + 2) * c_dim1] = f23;
2121 c[i + (j + 3) * c_dim1] = f14;
2122 c[i + 1 + (j + 3) * c_dim1] = f24;
2123 c[i + 2 + j * c_dim1] = f31;
2124 c[i + 3 + j * c_dim1] = f41;
2125 c[i + 2 + (j + 1) * c_dim1] = f32;
2126 c[i + 3 + (j + 1) * c_dim1] = f42;
2127 c[i + 2 + (j + 2) * c_dim1] = f33;
2128 c[i + 3 + (j + 2) * c_dim1] = f43;
2129 c[i + 2 + (j + 3) * c_dim1] = f34;
2130 c[i + 3 + (j + 3) * c_dim1] = f44;
2132 if (uisec < isec)
2134 i5 = ii + isec - 1;
2135 for (i = ii + uisec; i <= i5; ++i)
2137 f11 = c[i + j * c_dim1];
2138 f12 = c[i + (j + 1) * c_dim1];
2139 f13 = c[i + (j + 2) * c_dim1];
2140 f14 = c[i + (j + 3) * c_dim1];
2141 i6 = ll + lsec - 1;
2142 for (l = ll; l <= i6; ++l)
2144 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2145 257] * b[l + j * b_dim1];
2146 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2147 257] * b[l + (j + 1) * b_dim1];
2148 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2149 257] * b[l + (j + 2) * b_dim1];
2150 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2151 257] * b[l + (j + 3) * b_dim1];
2153 c[i + j * c_dim1] = f11;
2154 c[i + (j + 1) * c_dim1] = f12;
2155 c[i + (j + 2) * c_dim1] = f13;
2156 c[i + (j + 3) * c_dim1] = f14;
2160 if (ujsec < jsec)
2162 i4 = jj + jsec - 1;
2163 for (j = jj + ujsec; j <= i4; ++j)
2165 i5 = ii + uisec - 1;
2166 for (i = ii; i <= i5; i += 4)
2168 f11 = c[i + j * c_dim1];
2169 f21 = c[i + 1 + j * c_dim1];
2170 f31 = c[i + 2 + j * c_dim1];
2171 f41 = c[i + 3 + j * c_dim1];
2172 i6 = ll + lsec - 1;
2173 for (l = ll; l <= i6; ++l)
2175 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2176 257] * b[l + j * b_dim1];
2177 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2178 257] * b[l + j * b_dim1];
2179 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2180 257] * b[l + j * b_dim1];
2181 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2182 257] * b[l + j * b_dim1];
2184 c[i + j * c_dim1] = f11;
2185 c[i + 1 + j * c_dim1] = f21;
2186 c[i + 2 + j * c_dim1] = f31;
2187 c[i + 3 + j * c_dim1] = f41;
2189 i5 = ii + isec - 1;
2190 for (i = ii + uisec; i <= i5; ++i)
2192 f11 = c[i + j * c_dim1];
2193 i6 = ll + lsec - 1;
2194 for (l = ll; l <= i6; ++l)
2196 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2197 257] * b[l + j * b_dim1];
2199 c[i + j * c_dim1] = f11;
2206 free(t1);
2207 return;
2209 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2211 if (GFC_DESCRIPTOR_RANK (a) != 1)
2213 const GFC_REAL_4 *restrict abase_x;
2214 const GFC_REAL_4 *restrict bbase_y;
2215 GFC_REAL_4 *restrict dest_y;
2216 GFC_REAL_4 s;
2218 for (y = 0; y < ycount; y++)
2220 bbase_y = &bbase[y*bystride];
2221 dest_y = &dest[y*rystride];
2222 for (x = 0; x < xcount; x++)
2224 abase_x = &abase[x*axstride];
2225 s = (GFC_REAL_4) 0;
2226 for (n = 0; n < count; n++)
2227 s += abase_x[n] * bbase_y[n];
2228 dest_y[x] = s;
2232 else
2234 const GFC_REAL_4 *restrict bbase_y;
2235 GFC_REAL_4 s;
2237 for (y = 0; y < ycount; y++)
2239 bbase_y = &bbase[y*bystride];
2240 s = (GFC_REAL_4) 0;
2241 for (n = 0; n < count; n++)
2242 s += abase[n*axstride] * bbase_y[n];
2243 dest[y*rystride] = s;
2247 else if (axstride < aystride)
2249 for (y = 0; y < ycount; y++)
2250 for (x = 0; x < xcount; x++)
2251 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2253 for (y = 0; y < ycount; y++)
2254 for (n = 0; n < count; n++)
2255 for (x = 0; x < xcount; x++)
2256 /* dest[x,y] += a[x,n] * b[n,y] */
2257 dest[x*rxstride + y*rystride] +=
2258 abase[x*axstride + n*aystride] *
2259 bbase[n*bxstride + y*bystride];
2261 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2263 const GFC_REAL_4 *restrict bbase_y;
2264 GFC_REAL_4 s;
2266 for (y = 0; y < ycount; y++)
2268 bbase_y = &bbase[y*bystride];
2269 s = (GFC_REAL_4) 0;
2270 for (n = 0; n < count; n++)
2271 s += abase[n*axstride] * bbase_y[n*bxstride];
2272 dest[y*rxstride] = s;
2275 else
2277 const GFC_REAL_4 *restrict abase_x;
2278 const GFC_REAL_4 *restrict bbase_y;
2279 GFC_REAL_4 *restrict dest_y;
2280 GFC_REAL_4 s;
2282 for (y = 0; y < ycount; y++)
2284 bbase_y = &bbase[y*bystride];
2285 dest_y = &dest[y*rystride];
2286 for (x = 0; x < xcount; x++)
2288 abase_x = &abase[x*axstride];
2289 s = (GFC_REAL_4) 0;
2290 for (n = 0; n < count; n++)
2291 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2292 dest_y[x*rxstride] = s;
2297 #undef POW3
2298 #undef min
2299 #undef max
2302 /* Compiling main function, with selection code for the processor. */
2304 /* Currently, this is i386 only. Adjust for other architectures. */
2306 #include <config/i386/cpuinfo.h>
2307 void matmul_r4 (gfc_array_r4 * const restrict retarray,
2308 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2309 int blas_limit, blas_call gemm)
2311 static void (*matmul_p) (gfc_array_r4 * const restrict retarray,
2312 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2313 int blas_limit, blas_call gemm);
2315 void (*matmul_fn) (gfc_array_r4 * const restrict retarray,
2316 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2317 int blas_limit, blas_call gemm);
2319 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2320 if (matmul_fn == NULL)
2322 matmul_fn = matmul_r4_vanilla;
2323 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2325 /* Run down the available processors in order of preference. */
2326 #ifdef HAVE_AVX512F
2327 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2329 matmul_fn = matmul_r4_avx512f;
2330 goto store;
2333 #endif /* HAVE_AVX512F */
2335 #ifdef HAVE_AVX2
2336 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2337 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2339 matmul_fn = matmul_r4_avx2;
2340 goto store;
2343 #endif
2345 #ifdef HAVE_AVX
2346 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2348 matmul_fn = matmul_r4_avx;
2349 goto store;
2351 #endif /* HAVE_AVX */
2353 else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
2355 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2356 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2357 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2359 matmul_fn = matmul_r4_avx128_fma3;
2360 goto store;
2362 #endif
2363 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2364 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2365 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
2367 matmul_fn = matmul_r4_avx128_fma4;
2368 goto store;
2370 #endif
2373 store:
2374 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2377 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2380 #else /* Just the vanilla function. */
2382 void
2383 matmul_r4 (gfc_array_r4 * const restrict retarray,
2384 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2385 int blas_limit, blas_call gemm)
2387 const GFC_REAL_4 * restrict abase;
2388 const GFC_REAL_4 * restrict bbase;
2389 GFC_REAL_4 * restrict dest;
2391 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2392 index_type x, y, n, count, xcount, ycount;
2394 assert (GFC_DESCRIPTOR_RANK (a) == 2
2395 || GFC_DESCRIPTOR_RANK (b) == 2);
2397 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2399 Either A or B (but not both) can be rank 1:
2401 o One-dimensional argument A is implicitly treated as a row matrix
2402 dimensioned [1,count], so xcount=1.
2404 o One-dimensional argument B is implicitly treated as a column matrix
2405 dimensioned [count, 1], so ycount=1.
2408 if (retarray->base_addr == NULL)
2410 if (GFC_DESCRIPTOR_RANK (a) == 1)
2412 GFC_DIMENSION_SET(retarray->dim[0], 0,
2413 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2415 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2417 GFC_DIMENSION_SET(retarray->dim[0], 0,
2418 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2420 else
2422 GFC_DIMENSION_SET(retarray->dim[0], 0,
2423 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2425 GFC_DIMENSION_SET(retarray->dim[1], 0,
2426 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2427 GFC_DESCRIPTOR_EXTENT(retarray,0));
2430 retarray->base_addr
2431 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
2432 retarray->offset = 0;
2434 else if (unlikely (compile_options.bounds_check))
2436 index_type ret_extent, arg_extent;
2438 if (GFC_DESCRIPTOR_RANK (a) == 1)
2440 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2441 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2442 if (arg_extent != ret_extent)
2443 runtime_error ("Incorrect extent in return array in"
2444 " MATMUL intrinsic: is %ld, should be %ld",
2445 (long int) ret_extent, (long int) arg_extent);
2447 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2449 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2450 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2451 if (arg_extent != ret_extent)
2452 runtime_error ("Incorrect extent in return array in"
2453 " MATMUL intrinsic: is %ld, should be %ld",
2454 (long int) ret_extent, (long int) arg_extent);
2456 else
2458 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2459 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2460 if (arg_extent != ret_extent)
2461 runtime_error ("Incorrect extent in return array in"
2462 " MATMUL intrinsic for dimension 1:"
2463 " is %ld, should be %ld",
2464 (long int) ret_extent, (long int) arg_extent);
2466 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2467 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2468 if (arg_extent != ret_extent)
2469 runtime_error ("Incorrect extent in return array in"
2470 " MATMUL intrinsic for dimension 2:"
2471 " is %ld, should be %ld",
2472 (long int) ret_extent, (long int) arg_extent);
2477 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2479 /* One-dimensional result may be addressed in the code below
2480 either as a row or a column matrix. We want both cases to
2481 work. */
2482 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2484 else
2486 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2487 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2491 if (GFC_DESCRIPTOR_RANK (a) == 1)
2493 /* Treat it as a a row matrix A[1,count]. */
2494 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2495 aystride = 1;
2497 xcount = 1;
2498 count = GFC_DESCRIPTOR_EXTENT(a,0);
2500 else
2502 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2503 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2505 count = GFC_DESCRIPTOR_EXTENT(a,1);
2506 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2509 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2511 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2512 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2515 if (GFC_DESCRIPTOR_RANK (b) == 1)
2517 /* Treat it as a column matrix B[count,1] */
2518 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2520 /* bystride should never be used for 1-dimensional b.
2521 The value is only used for calculation of the
2522 memory by the buffer. */
2523 bystride = 256;
2524 ycount = 1;
2526 else
2528 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2529 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2530 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2533 abase = a->base_addr;
2534 bbase = b->base_addr;
2535 dest = retarray->base_addr;
2537 /* Now that everything is set up, we perform the multiplication
2538 itself. */
2540 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2541 #define min(a,b) ((a) <= (b) ? (a) : (b))
2542 #define max(a,b) ((a) >= (b) ? (a) : (b))
2544 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2545 && (bxstride == 1 || bystride == 1)
2546 && (((float) xcount) * ((float) ycount) * ((float) count)
2547 > POW3(blas_limit)))
2549 const int m = xcount, n = ycount, k = count, ldc = rystride;
2550 const GFC_REAL_4 one = 1, zero = 0;
2551 const int lda = (axstride == 1) ? aystride : axstride,
2552 ldb = (bxstride == 1) ? bystride : bxstride;
2554 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2556 assert (gemm != NULL);
2557 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2558 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2559 &ldc, 1, 1);
2560 return;
2564 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2566 /* This block of code implements a tuned matmul, derived from
2567 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2569 Bo Kagstrom and Per Ling
2570 Department of Computing Science
2571 Umea University
2572 S-901 87 Umea, Sweden
2574 from netlib.org, translated to C, and modified for matmul.m4. */
2576 const GFC_REAL_4 *a, *b;
2577 GFC_REAL_4 *c;
2578 const index_type m = xcount, n = ycount, k = count;
2580 /* System generated locals */
2581 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2582 i1, i2, i3, i4, i5, i6;
2584 /* Local variables */
2585 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
2586 f13, f14, f23, f24, f33, f34, f43, f44;
2587 index_type i, j, l, ii, jj, ll;
2588 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2589 GFC_REAL_4 *t1;
2591 a = abase;
2592 b = bbase;
2593 c = retarray->base_addr;
2595 /* Parameter adjustments */
2596 c_dim1 = rystride;
2597 c_offset = 1 + c_dim1;
2598 c -= c_offset;
2599 a_dim1 = aystride;
2600 a_offset = 1 + a_dim1;
2601 a -= a_offset;
2602 b_dim1 = bystride;
2603 b_offset = 1 + b_dim1;
2604 b -= b_offset;
2606 /* Empty c first. */
2607 for (j=1; j<=n; j++)
2608 for (i=1; i<=m; i++)
2609 c[i + j * c_dim1] = (GFC_REAL_4)0;
2611 /* Early exit if possible */
2612 if (m == 0 || n == 0 || k == 0)
2613 return;
2615 /* Adjust size of t1 to what is needed. */
2616 index_type t1_dim;
2617 t1_dim = (a_dim1-1) * 256 + b_dim1;
2618 if (t1_dim > 65536)
2619 t1_dim = 65536;
2621 t1 = malloc (t1_dim * sizeof(GFC_REAL_4));
2623 /* Start turning the crank. */
2624 i1 = n;
2625 for (jj = 1; jj <= i1; jj += 512)
2627 /* Computing MIN */
2628 i2 = 512;
2629 i3 = n - jj + 1;
2630 jsec = min(i2,i3);
2631 ujsec = jsec - jsec % 4;
2632 i2 = k;
2633 for (ll = 1; ll <= i2; ll += 256)
2635 /* Computing MIN */
2636 i3 = 256;
2637 i4 = k - ll + 1;
2638 lsec = min(i3,i4);
2639 ulsec = lsec - lsec % 2;
2641 i3 = m;
2642 for (ii = 1; ii <= i3; ii += 256)
2644 /* Computing MIN */
2645 i4 = 256;
2646 i5 = m - ii + 1;
2647 isec = min(i4,i5);
2648 uisec = isec - isec % 2;
2649 i4 = ll + ulsec - 1;
2650 for (l = ll; l <= i4; l += 2)
2652 i5 = ii + uisec - 1;
2653 for (i = ii; i <= i5; i += 2)
2655 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2656 a[i + l * a_dim1];
2657 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2658 a[i + (l + 1) * a_dim1];
2659 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2660 a[i + 1 + l * a_dim1];
2661 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2662 a[i + 1 + (l + 1) * a_dim1];
2664 if (uisec < isec)
2666 t1[l - ll + 1 + (isec << 8) - 257] =
2667 a[ii + isec - 1 + l * a_dim1];
2668 t1[l - ll + 2 + (isec << 8) - 257] =
2669 a[ii + isec - 1 + (l + 1) * a_dim1];
2672 if (ulsec < lsec)
2674 i4 = ii + isec - 1;
2675 for (i = ii; i<= i4; ++i)
2677 t1[lsec + ((i - ii + 1) << 8) - 257] =
2678 a[i + (ll + lsec - 1) * a_dim1];
2682 uisec = isec - isec % 4;
2683 i4 = jj + ujsec - 1;
2684 for (j = jj; j <= i4; j += 4)
2686 i5 = ii + uisec - 1;
2687 for (i = ii; i <= i5; i += 4)
2689 f11 = c[i + j * c_dim1];
2690 f21 = c[i + 1 + j * c_dim1];
2691 f12 = c[i + (j + 1) * c_dim1];
2692 f22 = c[i + 1 + (j + 1) * c_dim1];
2693 f13 = c[i + (j + 2) * c_dim1];
2694 f23 = c[i + 1 + (j + 2) * c_dim1];
2695 f14 = c[i + (j + 3) * c_dim1];
2696 f24 = c[i + 1 + (j + 3) * c_dim1];
2697 f31 = c[i + 2 + j * c_dim1];
2698 f41 = c[i + 3 + j * c_dim1];
2699 f32 = c[i + 2 + (j + 1) * c_dim1];
2700 f42 = c[i + 3 + (j + 1) * c_dim1];
2701 f33 = c[i + 2 + (j + 2) * c_dim1];
2702 f43 = c[i + 3 + (j + 2) * c_dim1];
2703 f34 = c[i + 2 + (j + 3) * c_dim1];
2704 f44 = c[i + 3 + (j + 3) * c_dim1];
2705 i6 = ll + lsec - 1;
2706 for (l = ll; l <= i6; ++l)
2708 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2709 * b[l + j * b_dim1];
2710 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2711 * b[l + j * b_dim1];
2712 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2713 * b[l + (j + 1) * b_dim1];
2714 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2715 * b[l + (j + 1) * b_dim1];
2716 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2717 * b[l + (j + 2) * b_dim1];
2718 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2719 * b[l + (j + 2) * b_dim1];
2720 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2721 * b[l + (j + 3) * b_dim1];
2722 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2723 * b[l + (j + 3) * b_dim1];
2724 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2725 * b[l + j * b_dim1];
2726 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2727 * b[l + j * b_dim1];
2728 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2729 * b[l + (j + 1) * b_dim1];
2730 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2731 * b[l + (j + 1) * b_dim1];
2732 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2733 * b[l + (j + 2) * b_dim1];
2734 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2735 * b[l + (j + 2) * b_dim1];
2736 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2737 * b[l + (j + 3) * b_dim1];
2738 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2739 * b[l + (j + 3) * b_dim1];
2741 c[i + j * c_dim1] = f11;
2742 c[i + 1 + j * c_dim1] = f21;
2743 c[i + (j + 1) * c_dim1] = f12;
2744 c[i + 1 + (j + 1) * c_dim1] = f22;
2745 c[i + (j + 2) * c_dim1] = f13;
2746 c[i + 1 + (j + 2) * c_dim1] = f23;
2747 c[i + (j + 3) * c_dim1] = f14;
2748 c[i + 1 + (j + 3) * c_dim1] = f24;
2749 c[i + 2 + j * c_dim1] = f31;
2750 c[i + 3 + j * c_dim1] = f41;
2751 c[i + 2 + (j + 1) * c_dim1] = f32;
2752 c[i + 3 + (j + 1) * c_dim1] = f42;
2753 c[i + 2 + (j + 2) * c_dim1] = f33;
2754 c[i + 3 + (j + 2) * c_dim1] = f43;
2755 c[i + 2 + (j + 3) * c_dim1] = f34;
2756 c[i + 3 + (j + 3) * c_dim1] = f44;
2758 if (uisec < isec)
2760 i5 = ii + isec - 1;
2761 for (i = ii + uisec; i <= i5; ++i)
2763 f11 = c[i + j * c_dim1];
2764 f12 = c[i + (j + 1) * c_dim1];
2765 f13 = c[i + (j + 2) * c_dim1];
2766 f14 = c[i + (j + 3) * c_dim1];
2767 i6 = ll + lsec - 1;
2768 for (l = ll; l <= i6; ++l)
2770 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2771 257] * b[l + j * b_dim1];
2772 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2773 257] * b[l + (j + 1) * b_dim1];
2774 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2775 257] * b[l + (j + 2) * b_dim1];
2776 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2777 257] * b[l + (j + 3) * b_dim1];
2779 c[i + j * c_dim1] = f11;
2780 c[i + (j + 1) * c_dim1] = f12;
2781 c[i + (j + 2) * c_dim1] = f13;
2782 c[i + (j + 3) * c_dim1] = f14;
2786 if (ujsec < jsec)
2788 i4 = jj + jsec - 1;
2789 for (j = jj + ujsec; j <= i4; ++j)
2791 i5 = ii + uisec - 1;
2792 for (i = ii; i <= i5; i += 4)
2794 f11 = c[i + j * c_dim1];
2795 f21 = c[i + 1 + j * c_dim1];
2796 f31 = c[i + 2 + j * c_dim1];
2797 f41 = c[i + 3 + j * c_dim1];
2798 i6 = ll + lsec - 1;
2799 for (l = ll; l <= i6; ++l)
2801 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2802 257] * b[l + j * b_dim1];
2803 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2804 257] * b[l + j * b_dim1];
2805 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2806 257] * b[l + j * b_dim1];
2807 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2808 257] * b[l + j * b_dim1];
2810 c[i + j * c_dim1] = f11;
2811 c[i + 1 + j * c_dim1] = f21;
2812 c[i + 2 + j * c_dim1] = f31;
2813 c[i + 3 + j * c_dim1] = f41;
2815 i5 = ii + isec - 1;
2816 for (i = ii + uisec; i <= i5; ++i)
2818 f11 = c[i + j * c_dim1];
2819 i6 = ll + lsec - 1;
2820 for (l = ll; l <= i6; ++l)
2822 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2823 257] * b[l + j * b_dim1];
2825 c[i + j * c_dim1] = f11;
2832 free(t1);
2833 return;
2835 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2837 if (GFC_DESCRIPTOR_RANK (a) != 1)
2839 const GFC_REAL_4 *restrict abase_x;
2840 const GFC_REAL_4 *restrict bbase_y;
2841 GFC_REAL_4 *restrict dest_y;
2842 GFC_REAL_4 s;
2844 for (y = 0; y < ycount; y++)
2846 bbase_y = &bbase[y*bystride];
2847 dest_y = &dest[y*rystride];
2848 for (x = 0; x < xcount; x++)
2850 abase_x = &abase[x*axstride];
2851 s = (GFC_REAL_4) 0;
2852 for (n = 0; n < count; n++)
2853 s += abase_x[n] * bbase_y[n];
2854 dest_y[x] = s;
2858 else
2860 const GFC_REAL_4 *restrict bbase_y;
2861 GFC_REAL_4 s;
2863 for (y = 0; y < ycount; y++)
2865 bbase_y = &bbase[y*bystride];
2866 s = (GFC_REAL_4) 0;
2867 for (n = 0; n < count; n++)
2868 s += abase[n*axstride] * bbase_y[n];
2869 dest[y*rystride] = s;
2873 else if (axstride < aystride)
2875 for (y = 0; y < ycount; y++)
2876 for (x = 0; x < xcount; x++)
2877 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2879 for (y = 0; y < ycount; y++)
2880 for (n = 0; n < count; n++)
2881 for (x = 0; x < xcount; x++)
2882 /* dest[x,y] += a[x,n] * b[n,y] */
2883 dest[x*rxstride + y*rystride] +=
2884 abase[x*axstride + n*aystride] *
2885 bbase[n*bxstride + y*bystride];
2887 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2889 const GFC_REAL_4 *restrict bbase_y;
2890 GFC_REAL_4 s;
2892 for (y = 0; y < ycount; y++)
2894 bbase_y = &bbase[y*bystride];
2895 s = (GFC_REAL_4) 0;
2896 for (n = 0; n < count; n++)
2897 s += abase[n*axstride] * bbase_y[n*bxstride];
2898 dest[y*rxstride] = s;
2901 else
2903 const GFC_REAL_4 *restrict abase_x;
2904 const GFC_REAL_4 *restrict bbase_y;
2905 GFC_REAL_4 *restrict dest_y;
2906 GFC_REAL_4 s;
2908 for (y = 0; y < ycount; y++)
2910 bbase_y = &bbase[y*bystride];
2911 dest_y = &dest[y*rystride];
2912 for (x = 0; x < xcount; x++)
2914 abase_x = &abase[x*axstride];
2915 s = (GFC_REAL_4) 0;
2916 for (n = 0; n < count; n++)
2917 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2918 dest_y[x*rxstride] = s;
2923 #undef POW3
2924 #undef min
2925 #undef max
2927 #endif
2928 #endif