gcc: libgfortran/generated/matmul_c10.c
blob 462d71e23f5aa95d656aee8771f2aa9327964406
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
5 This file is part of the GNU Fortran runtime library (libgfortran).
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
31 #if defined (HAVE_GFC_COMPLEX_10)
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
35 matrices. */
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_COMPLEX_10 *, const GFC_COMPLEX_10 *,
39 const int *, const GFC_COMPLEX_10 *, const int *,
40 const GFC_COMPLEX_10 *, GFC_COMPLEX_10 *, const int *,
41 int, int);
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
49 The equivalent Fortran pseudo-code is:
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
 65        ENDIF  */
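/* For illustration only (not part of the generated code): a minimal C sketch
   of the two loop orders described above, written for plain column-major
   double matrices with leading dimensions lda, ldb and ldc.  The name
   matmul_sketch is hypothetical; the real code below operates on
   GFC_COMPLEX_10 array descriptors instead.  */
#if 0
static void
matmul_sketch (int m, int n, int count, int a_is_transposed_temp,
               const double *a, int lda, const double *b, int ldb,
               double *c, int ldc)
{
  if (!a_is_transposed_temp)
    {
      /* Plain case: clear C, then accumulate with the row index innermost.  */
      for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++)
          c[i + j * ldc] = 0.0;
      for (int j = 0; j < n; j++)
        for (int k = 0; k < count; k++)
          for (int i = 0; i < m; i++)
            c[i + j * ldc] += a[i + k * lda] * b[k + j * ldb];
    }
  else
    {
      /* When A is a TRANSPOSE temporary, its strides make the K direction
         the contiguous one, so each element of C is computed as a single
         dot product with K innermost.  */
      for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++)
          {
            double s = 0.0;
            for (int k = 0; k < count; k++)
              s += a[i + k * lda] * b[k + j * ldb];
            c[i + j * ldc] = s;
          }
    }
}
#endif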
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
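/* (Concretely, the code below only calls gemm when rxstride is 1, A and B
   each have a unit stride, and xcount*ycount*count exceeds blas_limit cubed
   -- the POW3(blas_limit) test -- plus a few sanity checks on the leading
   dimensions.)  */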
72 extern void matmul_c10 (gfc_array_c10 * const restrict retarray,
73 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75 export_proto(matmul_c10);
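/* A statement such as C = MATMUL(A, B) on COMPLEX(10) arrays is lowered by
   the front-end into a call to this routine, passing descriptors for the
   result and both operands; try_blas, blas_limit and gemm reflect the
   -fexternal-blas and -fblas-matmul-limit= options.  */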
 77 /* Put an exhaustive list of possible architectures here, ORed together. */
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
81 #ifdef HAVE_AVX
82 static void
83 matmul_c10_avx (gfc_array_c10 * const restrict retarray,
84 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86 static void
87 matmul_c10_avx (gfc_array_c10 * const restrict retarray,
88 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
89 int blas_limit, blas_call gemm)
91 const GFC_COMPLEX_10 * restrict abase;
92 const GFC_COMPLEX_10 * restrict bbase;
93 GFC_COMPLEX_10 * restrict dest;
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96 index_type x, y, n, count, xcount, ycount;
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
99 || GFC_DESCRIPTOR_RANK (b) == 2);
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
103 Either A or B (but not both) can be rank 1:
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
108 o One-dimensional argument B is implicitly treated as a column matrix
 109        dimensioned [count, 1], so ycount=1.  */
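  /* Example (illustrative): MATMUL of a rank-1 A with extent 3 and a 3x4 B
     gives xcount=1, count=3, ycount=4 and a rank-1 result of extent 4; with
     a 3x4 A and a rank-1 B of extent 4 the result has extent 3, i.e.
     GFC_DESCRIPTOR_EXTENT(a,0).  */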
112 if (retarray->base_addr == NULL)
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
124 else
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
134 retarray->base_addr
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
136 retarray->offset = 0;
138 else if (unlikely (compile_options.bounds_check))
140 index_type ret_extent, arg_extent;
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 if (arg_extent != ret_extent)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent, (long int) arg_extent);
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 if (arg_extent != ret_extent)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent, (long int) arg_extent);
160 else
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 if (arg_extent != ret_extent)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent, (long int) arg_extent);
170 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
171 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
172 if (arg_extent != ret_extent)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent, (long int) arg_extent);
181 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
185 work. */
186 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
188 else
190 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
191 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
195 if (GFC_DESCRIPTOR_RANK (a) == 1)
 197       /* Treat it as a row matrix A[1,count]. */
198 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
199 aystride = 1;
201 xcount = 1;
202 count = GFC_DESCRIPTOR_EXTENT(a,0);
204 else
206 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
207 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
209 count = GFC_DESCRIPTOR_EXTENT(a,1);
210 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
213 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
215 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
 221       /* Treat it as a column matrix B[count,1] */
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 224       /* bystride should never be used for 1-dimensional b.
 225          The value is only used for calculation of the
 226          memory needed for the buffer.  */
227 bystride = 256;
228 ycount = 1;
230 else
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
237 abase = a->base_addr;
238 bbase = b->base_addr;
239 dest = retarray->base_addr;
241 /* Now that everything is set up, we perform the multiplication
242 itself. */
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249 && (bxstride == 1 || bystride == 1)
250 && (((float) xcount) * ((float) ycount) * ((float) count)
251 > POW3(blas_limit)))
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
254 const GFC_COMPLEX_10 one = 1, zero = 0;
255 const int lda = (axstride == 1) ? aystride : axstride,
256 ldb = (bxstride == 1) ? bystride : bxstride;
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
260 assert (gemm != NULL);
261 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
262 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
263 &ldc, 1, 1);
264 return;
268 if (rxstride == 1 && axstride == 1 && bxstride == 1)
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
275 Umea University
276 S-901 87 Umea, Sweden
278 from netlib.org, translated to C, and modified for matmul.m4. */
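      /* Overview of the blocking below: columns of B/C are processed in
         blocks of 512 (jj), the inner dimension in blocks of 256 (ll) and
         rows of A/C in blocks of 256 (ii).  Each panel of A is packed into
         the contiguous buffer t1, and the innermost kernel keeps a 4x4
         block of C in the scalars f11..f44.  */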
280 const GFC_COMPLEX_10 *a, *b;
281 GFC_COMPLEX_10 *c;
282 const index_type m = xcount, n = ycount, k = count;
284 /* System generated locals */
285 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
286 i1, i2, i3, i4, i5, i6;
288 /* Local variables */
289 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
290 f13, f14, f23, f24, f33, f34, f43, f44;
291 index_type i, j, l, ii, jj, ll;
292 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
293 GFC_COMPLEX_10 *t1;
295 a = abase;
296 b = bbase;
297 c = retarray->base_addr;
299 /* Parameter adjustments */
300 c_dim1 = rystride;
301 c_offset = 1 + c_dim1;
302 c -= c_offset;
303 a_dim1 = aystride;
304 a_offset = 1 + a_dim1;
305 a -= a_offset;
306 b_dim1 = bystride;
307 b_offset = 1 + b_dim1;
308 b -= b_offset;
310 /* Empty c first. */
311 for (j=1; j<=n; j++)
312 for (i=1; i<=m; i++)
313 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
315 /* Early exit if possible */
316 if (m == 0 || n == 0 || k == 0)
317 return;
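      /* Note that c was already cleared above, so returning here for a zero
         inner dimension (k == 0) still leaves the zero result that MATMUL
         requires.  */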
319 /* Adjust size of t1 to what is needed. */
320 index_type t1_dim, a_sz;
321 if (aystride == 1)
322 a_sz = rystride;
323 else
324 a_sz = a_dim1;
326 t1_dim = a_sz * 256 + b_dim1;
327 if (t1_dim > 65536)
328 t1_dim = 65536;
330 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
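      /* t1 holds a packed copy of the current panel of a, laid out so that
         the l (inner dimension) index is contiguous: the element for
         (l - ll, i - ii) sits at t1[(l - ll) + 256*(i - ii)], which the
         1-based expressions below spell as
         t1[l - ll + 1 + ((i - ii + 1) << 8) - 257].  */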
332 /* Start turning the crank. */
333 i1 = n;
334 for (jj = 1; jj <= i1; jj += 512)
336 /* Computing MIN */
337 i2 = 512;
338 i3 = n - jj + 1;
339 jsec = min(i2,i3);
340 ujsec = jsec - jsec % 4;
341 i2 = k;
342 for (ll = 1; ll <= i2; ll += 256)
344 /* Computing MIN */
345 i3 = 256;
346 i4 = k - ll + 1;
347 lsec = min(i3,i4);
348 ulsec = lsec - lsec % 2;
350 i3 = m;
351 for (ii = 1; ii <= i3; ii += 256)
353 /* Computing MIN */
354 i4 = 256;
355 i5 = m - ii + 1;
356 isec = min(i4,i5);
357 uisec = isec - isec % 2;
358 i4 = ll + ulsec - 1;
359 for (l = ll; l <= i4; l += 2)
361 i5 = ii + uisec - 1;
362 for (i = ii; i <= i5; i += 2)
364 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
365 a[i + l * a_dim1];
366 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
367 a[i + (l + 1) * a_dim1];
368 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
369 a[i + 1 + l * a_dim1];
370 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
371 a[i + 1 + (l + 1) * a_dim1];
373 if (uisec < isec)
375 t1[l - ll + 1 + (isec << 8) - 257] =
376 a[ii + isec - 1 + l * a_dim1];
377 t1[l - ll + 2 + (isec << 8) - 257] =
378 a[ii + isec - 1 + (l + 1) * a_dim1];
381 if (ulsec < lsec)
383 i4 = ii + isec - 1;
384 for (i = ii; i<= i4; ++i)
386 t1[lsec + ((i - ii + 1) << 8) - 257] =
387 a[i + (ll + lsec - 1) * a_dim1];
391 uisec = isec - isec % 4;
392 i4 = jj + ujsec - 1;
393 for (j = jj; j <= i4; j += 4)
395 i5 = ii + uisec - 1;
396 for (i = ii; i <= i5; i += 4)
398 f11 = c[i + j * c_dim1];
399 f21 = c[i + 1 + j * c_dim1];
400 f12 = c[i + (j + 1) * c_dim1];
401 f22 = c[i + 1 + (j + 1) * c_dim1];
402 f13 = c[i + (j + 2) * c_dim1];
403 f23 = c[i + 1 + (j + 2) * c_dim1];
404 f14 = c[i + (j + 3) * c_dim1];
405 f24 = c[i + 1 + (j + 3) * c_dim1];
406 f31 = c[i + 2 + j * c_dim1];
407 f41 = c[i + 3 + j * c_dim1];
408 f32 = c[i + 2 + (j + 1) * c_dim1];
409 f42 = c[i + 3 + (j + 1) * c_dim1];
410 f33 = c[i + 2 + (j + 2) * c_dim1];
411 f43 = c[i + 3 + (j + 2) * c_dim1];
412 f34 = c[i + 2 + (j + 3) * c_dim1];
413 f44 = c[i + 3 + (j + 3) * c_dim1];
414 i6 = ll + lsec - 1;
415 for (l = ll; l <= i6; ++l)
417 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
418 * b[l + j * b_dim1];
419 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
420 * b[l + j * b_dim1];
421 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
422 * b[l + (j + 1) * b_dim1];
423 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
424 * b[l + (j + 1) * b_dim1];
425 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
426 * b[l + (j + 2) * b_dim1];
427 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
428 * b[l + (j + 2) * b_dim1];
429 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
430 * b[l + (j + 3) * b_dim1];
431 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
432 * b[l + (j + 3) * b_dim1];
433 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
434 * b[l + j * b_dim1];
435 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
436 * b[l + j * b_dim1];
437 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
438 * b[l + (j + 1) * b_dim1];
439 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
440 * b[l + (j + 1) * b_dim1];
441 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
442 * b[l + (j + 2) * b_dim1];
443 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
444 * b[l + (j + 2) * b_dim1];
445 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
446 * b[l + (j + 3) * b_dim1];
447 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
448 * b[l + (j + 3) * b_dim1];
450 c[i + j * c_dim1] = f11;
451 c[i + 1 + j * c_dim1] = f21;
452 c[i + (j + 1) * c_dim1] = f12;
453 c[i + 1 + (j + 1) * c_dim1] = f22;
454 c[i + (j + 2) * c_dim1] = f13;
455 c[i + 1 + (j + 2) * c_dim1] = f23;
456 c[i + (j + 3) * c_dim1] = f14;
457 c[i + 1 + (j + 3) * c_dim1] = f24;
458 c[i + 2 + j * c_dim1] = f31;
459 c[i + 3 + j * c_dim1] = f41;
460 c[i + 2 + (j + 1) * c_dim1] = f32;
461 c[i + 3 + (j + 1) * c_dim1] = f42;
462 c[i + 2 + (j + 2) * c_dim1] = f33;
463 c[i + 3 + (j + 2) * c_dim1] = f43;
464 c[i + 2 + (j + 3) * c_dim1] = f34;
465 c[i + 3 + (j + 3) * c_dim1] = f44;
467 if (uisec < isec)
469 i5 = ii + isec - 1;
470 for (i = ii + uisec; i <= i5; ++i)
472 f11 = c[i + j * c_dim1];
473 f12 = c[i + (j + 1) * c_dim1];
474 f13 = c[i + (j + 2) * c_dim1];
475 f14 = c[i + (j + 3) * c_dim1];
476 i6 = ll + lsec - 1;
477 for (l = ll; l <= i6; ++l)
479 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
480 257] * b[l + j * b_dim1];
481 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
482 257] * b[l + (j + 1) * b_dim1];
483 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
484 257] * b[l + (j + 2) * b_dim1];
485 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
486 257] * b[l + (j + 3) * b_dim1];
488 c[i + j * c_dim1] = f11;
489 c[i + (j + 1) * c_dim1] = f12;
490 c[i + (j + 2) * c_dim1] = f13;
491 c[i + (j + 3) * c_dim1] = f14;
495 if (ujsec < jsec)
497 i4 = jj + jsec - 1;
498 for (j = jj + ujsec; j <= i4; ++j)
500 i5 = ii + uisec - 1;
501 for (i = ii; i <= i5; i += 4)
503 f11 = c[i + j * c_dim1];
504 f21 = c[i + 1 + j * c_dim1];
505 f31 = c[i + 2 + j * c_dim1];
506 f41 = c[i + 3 + j * c_dim1];
507 i6 = ll + lsec - 1;
508 for (l = ll; l <= i6; ++l)
510 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
511 257] * b[l + j * b_dim1];
512 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
513 257] * b[l + j * b_dim1];
514 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
515 257] * b[l + j * b_dim1];
516 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
517 257] * b[l + j * b_dim1];
519 c[i + j * c_dim1] = f11;
520 c[i + 1 + j * c_dim1] = f21;
521 c[i + 2 + j * c_dim1] = f31;
522 c[i + 3 + j * c_dim1] = f41;
524 i5 = ii + isec - 1;
525 for (i = ii + uisec; i <= i5; ++i)
527 f11 = c[i + j * c_dim1];
528 i6 = ll + lsec - 1;
529 for (l = ll; l <= i6; ++l)
531 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
532 257] * b[l + j * b_dim1];
534 c[i + j * c_dim1] = f11;
541 free(t1);
542 return;
544 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
546 if (GFC_DESCRIPTOR_RANK (a) != 1)
548 const GFC_COMPLEX_10 *restrict abase_x;
549 const GFC_COMPLEX_10 *restrict bbase_y;
550 GFC_COMPLEX_10 *restrict dest_y;
551 GFC_COMPLEX_10 s;
553 for (y = 0; y < ycount; y++)
555 bbase_y = &bbase[y*bystride];
556 dest_y = &dest[y*rystride];
557 for (x = 0; x < xcount; x++)
559 abase_x = &abase[x*axstride];
560 s = (GFC_COMPLEX_10) 0;
561 for (n = 0; n < count; n++)
562 s += abase_x[n] * bbase_y[n];
563 dest_y[x] = s;
567 else
569 const GFC_COMPLEX_10 *restrict bbase_y;
570 GFC_COMPLEX_10 s;
572 for (y = 0; y < ycount; y++)
574 bbase_y = &bbase[y*bystride];
575 s = (GFC_COMPLEX_10) 0;
576 for (n = 0; n < count; n++)
577 s += abase[n*axstride] * bbase_y[n];
578 dest[y*rystride] = s;
582 else if (axstride < aystride)
584 for (y = 0; y < ycount; y++)
585 for (x = 0; x < xcount; x++)
586 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
588 for (y = 0; y < ycount; y++)
589 for (n = 0; n < count; n++)
590 for (x = 0; x < xcount; x++)
591 /* dest[x,y] += a[x,n] * b[n,y] */
592 dest[x*rxstride + y*rystride] +=
593 abase[x*axstride + n*aystride] *
594 bbase[n*bxstride + y*bystride];
596 else if (GFC_DESCRIPTOR_RANK (a) == 1)
598 const GFC_COMPLEX_10 *restrict bbase_y;
599 GFC_COMPLEX_10 s;
601 for (y = 0; y < ycount; y++)
603 bbase_y = &bbase[y*bystride];
604 s = (GFC_COMPLEX_10) 0;
605 for (n = 0; n < count; n++)
606 s += abase[n*axstride] * bbase_y[n*bxstride];
607 dest[y*rxstride] = s;
610 else
612 const GFC_COMPLEX_10 *restrict abase_x;
613 const GFC_COMPLEX_10 *restrict bbase_y;
614 GFC_COMPLEX_10 *restrict dest_y;
615 GFC_COMPLEX_10 s;
617 for (y = 0; y < ycount; y++)
619 bbase_y = &bbase[y*bystride];
620 dest_y = &dest[y*rystride];
621 for (x = 0; x < xcount; x++)
623 abase_x = &abase[x*axstride];
624 s = (GFC_COMPLEX_10) 0;
625 for (n = 0; n < count; n++)
626 s += abase_x[n*aystride] * bbase_y[n*bxstride];
627 dest_y[x*rxstride] = s;
632 #undef POW3
633 #undef min
634 #undef max
636 #endif /* HAVE_AVX */
638 #ifdef HAVE_AVX2
639 static void
640 matmul_c10_avx2 (gfc_array_c10 * const restrict retarray,
641 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
642 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
643 static void
644 matmul_c10_avx2 (gfc_array_c10 * const restrict retarray,
645 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
646 int blas_limit, blas_call gemm)
648 const GFC_COMPLEX_10 * restrict abase;
649 const GFC_COMPLEX_10 * restrict bbase;
650 GFC_COMPLEX_10 * restrict dest;
652 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
653 index_type x, y, n, count, xcount, ycount;
655 assert (GFC_DESCRIPTOR_RANK (a) == 2
656 || GFC_DESCRIPTOR_RANK (b) == 2);
658 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
660 Either A or B (but not both) can be rank 1:
662 o One-dimensional argument A is implicitly treated as a row matrix
663 dimensioned [1,count], so xcount=1.
665 o One-dimensional argument B is implicitly treated as a column matrix
 666        dimensioned [count, 1], so ycount=1.  */
669 if (retarray->base_addr == NULL)
671 if (GFC_DESCRIPTOR_RANK (a) == 1)
673 GFC_DIMENSION_SET(retarray->dim[0], 0,
674 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
676 else if (GFC_DESCRIPTOR_RANK (b) == 1)
678 GFC_DIMENSION_SET(retarray->dim[0], 0,
679 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
681 else
683 GFC_DIMENSION_SET(retarray->dim[0], 0,
684 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
686 GFC_DIMENSION_SET(retarray->dim[1], 0,
687 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
688 GFC_DESCRIPTOR_EXTENT(retarray,0));
691 retarray->base_addr
692 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
693 retarray->offset = 0;
695 else if (unlikely (compile_options.bounds_check))
697 index_type ret_extent, arg_extent;
699 if (GFC_DESCRIPTOR_RANK (a) == 1)
701 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
702 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
703 if (arg_extent != ret_extent)
704 runtime_error ("Incorrect extent in return array in"
705 " MATMUL intrinsic: is %ld, should be %ld",
706 (long int) ret_extent, (long int) arg_extent);
708 else if (GFC_DESCRIPTOR_RANK (b) == 1)
710 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
711 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
712 if (arg_extent != ret_extent)
713 runtime_error ("Incorrect extent in return array in"
714 " MATMUL intrinsic: is %ld, should be %ld",
715 (long int) ret_extent, (long int) arg_extent);
717 else
719 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
720 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
721 if (arg_extent != ret_extent)
722 runtime_error ("Incorrect extent in return array in"
723 " MATMUL intrinsic for dimension 1:"
724 " is %ld, should be %ld",
725 (long int) ret_extent, (long int) arg_extent);
727 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
728 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
729 if (arg_extent != ret_extent)
730 runtime_error ("Incorrect extent in return array in"
731 " MATMUL intrinsic for dimension 2:"
732 " is %ld, should be %ld",
733 (long int) ret_extent, (long int) arg_extent);
738 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
740 /* One-dimensional result may be addressed in the code below
741 either as a row or a column matrix. We want both cases to
742 work. */
743 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
745 else
747 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
748 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
752 if (GFC_DESCRIPTOR_RANK (a) == 1)
 754       /* Treat it as a row matrix A[1,count]. */
755 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
756 aystride = 1;
758 xcount = 1;
759 count = GFC_DESCRIPTOR_EXTENT(a,0);
761 else
763 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
764 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
766 count = GFC_DESCRIPTOR_EXTENT(a,1);
767 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
770 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
772 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
773 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
776 if (GFC_DESCRIPTOR_RANK (b) == 1)
 778       /* Treat it as a column matrix B[count,1] */
779 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 781       /* bystride should never be used for 1-dimensional b.
 782          The value is only used for calculation of the
 783          memory needed for the buffer.  */
784 bystride = 256;
785 ycount = 1;
787 else
789 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
790 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
791 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
794 abase = a->base_addr;
795 bbase = b->base_addr;
796 dest = retarray->base_addr;
798 /* Now that everything is set up, we perform the multiplication
799 itself. */
801 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
802 #define min(a,b) ((a) <= (b) ? (a) : (b))
803 #define max(a,b) ((a) >= (b) ? (a) : (b))
805 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
806 && (bxstride == 1 || bystride == 1)
807 && (((float) xcount) * ((float) ycount) * ((float) count)
808 > POW3(blas_limit)))
810 const int m = xcount, n = ycount, k = count, ldc = rystride;
811 const GFC_COMPLEX_10 one = 1, zero = 0;
812 const int lda = (axstride == 1) ? aystride : axstride,
813 ldb = (bxstride == 1) ? bystride : bxstride;
815 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
817 assert (gemm != NULL);
818 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
819 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
820 &ldc, 1, 1);
821 return;
825 if (rxstride == 1 && axstride == 1 && bxstride == 1)
827 /* This block of code implements a tuned matmul, derived from
828 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
830 Bo Kagstrom and Per Ling
831 Department of Computing Science
832 Umea University
833 S-901 87 Umea, Sweden
835 from netlib.org, translated to C, and modified for matmul.m4. */
837 const GFC_COMPLEX_10 *a, *b;
838 GFC_COMPLEX_10 *c;
839 const index_type m = xcount, n = ycount, k = count;
841 /* System generated locals */
842 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
843 i1, i2, i3, i4, i5, i6;
845 /* Local variables */
846 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
847 f13, f14, f23, f24, f33, f34, f43, f44;
848 index_type i, j, l, ii, jj, ll;
849 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
850 GFC_COMPLEX_10 *t1;
852 a = abase;
853 b = bbase;
854 c = retarray->base_addr;
856 /* Parameter adjustments */
857 c_dim1 = rystride;
858 c_offset = 1 + c_dim1;
859 c -= c_offset;
860 a_dim1 = aystride;
861 a_offset = 1 + a_dim1;
862 a -= a_offset;
863 b_dim1 = bystride;
864 b_offset = 1 + b_dim1;
865 b -= b_offset;
867 /* Empty c first. */
868 for (j=1; j<=n; j++)
869 for (i=1; i<=m; i++)
870 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
872 /* Early exit if possible */
873 if (m == 0 || n == 0 || k == 0)
874 return;
876 /* Adjust size of t1 to what is needed. */
877 index_type t1_dim, a_sz;
878 if (aystride == 1)
879 a_sz = rystride;
880 else
881 a_sz = a_dim1;
883 t1_dim = a_sz * 256 + b_dim1;
884 if (t1_dim > 65536)
885 t1_dim = 65536;
887 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
889 /* Start turning the crank. */
890 i1 = n;
891 for (jj = 1; jj <= i1; jj += 512)
893 /* Computing MIN */
894 i2 = 512;
895 i3 = n - jj + 1;
896 jsec = min(i2,i3);
897 ujsec = jsec - jsec % 4;
898 i2 = k;
899 for (ll = 1; ll <= i2; ll += 256)
901 /* Computing MIN */
902 i3 = 256;
903 i4 = k - ll + 1;
904 lsec = min(i3,i4);
905 ulsec = lsec - lsec % 2;
907 i3 = m;
908 for (ii = 1; ii <= i3; ii += 256)
910 /* Computing MIN */
911 i4 = 256;
912 i5 = m - ii + 1;
913 isec = min(i4,i5);
914 uisec = isec - isec % 2;
915 i4 = ll + ulsec - 1;
916 for (l = ll; l <= i4; l += 2)
918 i5 = ii + uisec - 1;
919 for (i = ii; i <= i5; i += 2)
921 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
922 a[i + l * a_dim1];
923 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
924 a[i + (l + 1) * a_dim1];
925 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
926 a[i + 1 + l * a_dim1];
927 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
928 a[i + 1 + (l + 1) * a_dim1];
930 if (uisec < isec)
932 t1[l - ll + 1 + (isec << 8) - 257] =
933 a[ii + isec - 1 + l * a_dim1];
934 t1[l - ll + 2 + (isec << 8) - 257] =
935 a[ii + isec - 1 + (l + 1) * a_dim1];
938 if (ulsec < lsec)
940 i4 = ii + isec - 1;
941 for (i = ii; i<= i4; ++i)
943 t1[lsec + ((i - ii + 1) << 8) - 257] =
944 a[i + (ll + lsec - 1) * a_dim1];
948 uisec = isec - isec % 4;
949 i4 = jj + ujsec - 1;
950 for (j = jj; j <= i4; j += 4)
952 i5 = ii + uisec - 1;
953 for (i = ii; i <= i5; i += 4)
955 f11 = c[i + j * c_dim1];
956 f21 = c[i + 1 + j * c_dim1];
957 f12 = c[i + (j + 1) * c_dim1];
958 f22 = c[i + 1 + (j + 1) * c_dim1];
959 f13 = c[i + (j + 2) * c_dim1];
960 f23 = c[i + 1 + (j + 2) * c_dim1];
961 f14 = c[i + (j + 3) * c_dim1];
962 f24 = c[i + 1 + (j + 3) * c_dim1];
963 f31 = c[i + 2 + j * c_dim1];
964 f41 = c[i + 3 + j * c_dim1];
965 f32 = c[i + 2 + (j + 1) * c_dim1];
966 f42 = c[i + 3 + (j + 1) * c_dim1];
967 f33 = c[i + 2 + (j + 2) * c_dim1];
968 f43 = c[i + 3 + (j + 2) * c_dim1];
969 f34 = c[i + 2 + (j + 3) * c_dim1];
970 f44 = c[i + 3 + (j + 3) * c_dim1];
971 i6 = ll + lsec - 1;
972 for (l = ll; l <= i6; ++l)
974 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
975 * b[l + j * b_dim1];
976 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
977 * b[l + j * b_dim1];
978 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
979 * b[l + (j + 1) * b_dim1];
980 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
981 * b[l + (j + 1) * b_dim1];
982 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
983 * b[l + (j + 2) * b_dim1];
984 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
985 * b[l + (j + 2) * b_dim1];
986 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
987 * b[l + (j + 3) * b_dim1];
988 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
989 * b[l + (j + 3) * b_dim1];
990 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
991 * b[l + j * b_dim1];
992 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
993 * b[l + j * b_dim1];
994 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
995 * b[l + (j + 1) * b_dim1];
996 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
997 * b[l + (j + 1) * b_dim1];
998 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
999 * b[l + (j + 2) * b_dim1];
1000 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1001 * b[l + (j + 2) * b_dim1];
1002 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1003 * b[l + (j + 3) * b_dim1];
1004 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1005 * b[l + (j + 3) * b_dim1];
1007 c[i + j * c_dim1] = f11;
1008 c[i + 1 + j * c_dim1] = f21;
1009 c[i + (j + 1) * c_dim1] = f12;
1010 c[i + 1 + (j + 1) * c_dim1] = f22;
1011 c[i + (j + 2) * c_dim1] = f13;
1012 c[i + 1 + (j + 2) * c_dim1] = f23;
1013 c[i + (j + 3) * c_dim1] = f14;
1014 c[i + 1 + (j + 3) * c_dim1] = f24;
1015 c[i + 2 + j * c_dim1] = f31;
1016 c[i + 3 + j * c_dim1] = f41;
1017 c[i + 2 + (j + 1) * c_dim1] = f32;
1018 c[i + 3 + (j + 1) * c_dim1] = f42;
1019 c[i + 2 + (j + 2) * c_dim1] = f33;
1020 c[i + 3 + (j + 2) * c_dim1] = f43;
1021 c[i + 2 + (j + 3) * c_dim1] = f34;
1022 c[i + 3 + (j + 3) * c_dim1] = f44;
1024 if (uisec < isec)
1026 i5 = ii + isec - 1;
1027 for (i = ii + uisec; i <= i5; ++i)
1029 f11 = c[i + j * c_dim1];
1030 f12 = c[i + (j + 1) * c_dim1];
1031 f13 = c[i + (j + 2) * c_dim1];
1032 f14 = c[i + (j + 3) * c_dim1];
1033 i6 = ll + lsec - 1;
1034 for (l = ll; l <= i6; ++l)
1036 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1037 257] * b[l + j * b_dim1];
1038 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1039 257] * b[l + (j + 1) * b_dim1];
1040 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1041 257] * b[l + (j + 2) * b_dim1];
1042 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1043 257] * b[l + (j + 3) * b_dim1];
1045 c[i + j * c_dim1] = f11;
1046 c[i + (j + 1) * c_dim1] = f12;
1047 c[i + (j + 2) * c_dim1] = f13;
1048 c[i + (j + 3) * c_dim1] = f14;
1052 if (ujsec < jsec)
1054 i4 = jj + jsec - 1;
1055 for (j = jj + ujsec; j <= i4; ++j)
1057 i5 = ii + uisec - 1;
1058 for (i = ii; i <= i5; i += 4)
1060 f11 = c[i + j * c_dim1];
1061 f21 = c[i + 1 + j * c_dim1];
1062 f31 = c[i + 2 + j * c_dim1];
1063 f41 = c[i + 3 + j * c_dim1];
1064 i6 = ll + lsec - 1;
1065 for (l = ll; l <= i6; ++l)
1067 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1068 257] * b[l + j * b_dim1];
1069 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1070 257] * b[l + j * b_dim1];
1071 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1072 257] * b[l + j * b_dim1];
1073 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1074 257] * b[l + j * b_dim1];
1076 c[i + j * c_dim1] = f11;
1077 c[i + 1 + j * c_dim1] = f21;
1078 c[i + 2 + j * c_dim1] = f31;
1079 c[i + 3 + j * c_dim1] = f41;
1081 i5 = ii + isec - 1;
1082 for (i = ii + uisec; i <= i5; ++i)
1084 f11 = c[i + j * c_dim1];
1085 i6 = ll + lsec - 1;
1086 for (l = ll; l <= i6; ++l)
1088 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1089 257] * b[l + j * b_dim1];
1091 c[i + j * c_dim1] = f11;
1098 free(t1);
1099 return;
1101 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1103 if (GFC_DESCRIPTOR_RANK (a) != 1)
1105 const GFC_COMPLEX_10 *restrict abase_x;
1106 const GFC_COMPLEX_10 *restrict bbase_y;
1107 GFC_COMPLEX_10 *restrict dest_y;
1108 GFC_COMPLEX_10 s;
1110 for (y = 0; y < ycount; y++)
1112 bbase_y = &bbase[y*bystride];
1113 dest_y = &dest[y*rystride];
1114 for (x = 0; x < xcount; x++)
1116 abase_x = &abase[x*axstride];
1117 s = (GFC_COMPLEX_10) 0;
1118 for (n = 0; n < count; n++)
1119 s += abase_x[n] * bbase_y[n];
1120 dest_y[x] = s;
1124 else
1126 const GFC_COMPLEX_10 *restrict bbase_y;
1127 GFC_COMPLEX_10 s;
1129 for (y = 0; y < ycount; y++)
1131 bbase_y = &bbase[y*bystride];
1132 s = (GFC_COMPLEX_10) 0;
1133 for (n = 0; n < count; n++)
1134 s += abase[n*axstride] * bbase_y[n];
1135 dest[y*rystride] = s;
1139 else if (axstride < aystride)
1141 for (y = 0; y < ycount; y++)
1142 for (x = 0; x < xcount; x++)
1143 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
1145 for (y = 0; y < ycount; y++)
1146 for (n = 0; n < count; n++)
1147 for (x = 0; x < xcount; x++)
1148 /* dest[x,y] += a[x,n] * b[n,y] */
1149 dest[x*rxstride + y*rystride] +=
1150 abase[x*axstride + n*aystride] *
1151 bbase[n*bxstride + y*bystride];
1153 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1155 const GFC_COMPLEX_10 *restrict bbase_y;
1156 GFC_COMPLEX_10 s;
1158 for (y = 0; y < ycount; y++)
1160 bbase_y = &bbase[y*bystride];
1161 s = (GFC_COMPLEX_10) 0;
1162 for (n = 0; n < count; n++)
1163 s += abase[n*axstride] * bbase_y[n*bxstride];
1164 dest[y*rxstride] = s;
1167 else
1169 const GFC_COMPLEX_10 *restrict abase_x;
1170 const GFC_COMPLEX_10 *restrict bbase_y;
1171 GFC_COMPLEX_10 *restrict dest_y;
1172 GFC_COMPLEX_10 s;
1174 for (y = 0; y < ycount; y++)
1176 bbase_y = &bbase[y*bystride];
1177 dest_y = &dest[y*rystride];
1178 for (x = 0; x < xcount; x++)
1180 abase_x = &abase[x*axstride];
1181 s = (GFC_COMPLEX_10) 0;
1182 for (n = 0; n < count; n++)
1183 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1184 dest_y[x*rxstride] = s;
1189 #undef POW3
1190 #undef min
1191 #undef max
1193 #endif /* HAVE_AVX2 */
1195 #ifdef HAVE_AVX512F
1196 static void
1197 matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
1198 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1199 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1200 static void
1201 matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
1202 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1203 int blas_limit, blas_call gemm)
1205 const GFC_COMPLEX_10 * restrict abase;
1206 const GFC_COMPLEX_10 * restrict bbase;
1207 GFC_COMPLEX_10 * restrict dest;
1209 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1210 index_type x, y, n, count, xcount, ycount;
1212 assert (GFC_DESCRIPTOR_RANK (a) == 2
1213 || GFC_DESCRIPTOR_RANK (b) == 2);
1215 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1217 Either A or B (but not both) can be rank 1:
1219 o One-dimensional argument A is implicitly treated as a row matrix
1220 dimensioned [1,count], so xcount=1.
1222 o One-dimensional argument B is implicitly treated as a column matrix
 1223        dimensioned [count, 1], so ycount=1.  */
1226 if (retarray->base_addr == NULL)
1228 if (GFC_DESCRIPTOR_RANK (a) == 1)
1230 GFC_DIMENSION_SET(retarray->dim[0], 0,
1231 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1233 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1235 GFC_DIMENSION_SET(retarray->dim[0], 0,
1236 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1238 else
1240 GFC_DIMENSION_SET(retarray->dim[0], 0,
1241 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1243 GFC_DIMENSION_SET(retarray->dim[1], 0,
1244 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1245 GFC_DESCRIPTOR_EXTENT(retarray,0));
1248 retarray->base_addr
1249 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
1250 retarray->offset = 0;
1252 else if (unlikely (compile_options.bounds_check))
1254 index_type ret_extent, arg_extent;
1256 if (GFC_DESCRIPTOR_RANK (a) == 1)
1258 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1259 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1260 if (arg_extent != ret_extent)
1261 runtime_error ("Incorrect extent in return array in"
1262 " MATMUL intrinsic: is %ld, should be %ld",
1263 (long int) ret_extent, (long int) arg_extent);
1265 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1267 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1268 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1269 if (arg_extent != ret_extent)
1270 runtime_error ("Incorrect extent in return array in"
1271 " MATMUL intrinsic: is %ld, should be %ld",
1272 (long int) ret_extent, (long int) arg_extent);
1274 else
1276 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1277 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1278 if (arg_extent != ret_extent)
1279 runtime_error ("Incorrect extent in return array in"
1280 " MATMUL intrinsic for dimension 1:"
1281 " is %ld, should be %ld",
1282 (long int) ret_extent, (long int) arg_extent);
1284 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1285 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1286 if (arg_extent != ret_extent)
1287 runtime_error ("Incorrect extent in return array in"
1288 " MATMUL intrinsic for dimension 2:"
1289 " is %ld, should be %ld",
1290 (long int) ret_extent, (long int) arg_extent);
1295 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1297 /* One-dimensional result may be addressed in the code below
1298 either as a row or a column matrix. We want both cases to
1299 work. */
1300 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1302 else
1304 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1305 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1309 if (GFC_DESCRIPTOR_RANK (a) == 1)
 1311       /* Treat it as a row matrix A[1,count]. */
1312 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1313 aystride = 1;
1315 xcount = 1;
1316 count = GFC_DESCRIPTOR_EXTENT(a,0);
1318 else
1320 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1321 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1323 count = GFC_DESCRIPTOR_EXTENT(a,1);
1324 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1327 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1329 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1330 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1333 if (GFC_DESCRIPTOR_RANK (b) == 1)
 1335       /* Treat it as a column matrix B[count,1] */
1336 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 1338       /* bystride should never be used for 1-dimensional b.
 1339          The value is only used for calculation of the
 1340          memory needed for the buffer.  */
1341 bystride = 256;
1342 ycount = 1;
1344 else
1346 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1347 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1348 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1351 abase = a->base_addr;
1352 bbase = b->base_addr;
1353 dest = retarray->base_addr;
1355 /* Now that everything is set up, we perform the multiplication
1356 itself. */
1358 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1359 #define min(a,b) ((a) <= (b) ? (a) : (b))
1360 #define max(a,b) ((a) >= (b) ? (a) : (b))
1362 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1363 && (bxstride == 1 || bystride == 1)
1364 && (((float) xcount) * ((float) ycount) * ((float) count)
1365 > POW3(blas_limit)))
1367 const int m = xcount, n = ycount, k = count, ldc = rystride;
1368 const GFC_COMPLEX_10 one = 1, zero = 0;
1369 const int lda = (axstride == 1) ? aystride : axstride,
1370 ldb = (bxstride == 1) ? bystride : bxstride;
1372 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1374 assert (gemm != NULL);
1375 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1376 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1377 &ldc, 1, 1);
1378 return;
1382 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1384 /* This block of code implements a tuned matmul, derived from
1385 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1387 Bo Kagstrom and Per Ling
1388 Department of Computing Science
1389 Umea University
1390 S-901 87 Umea, Sweden
1392 from netlib.org, translated to C, and modified for matmul.m4. */
1394 const GFC_COMPLEX_10 *a, *b;
1395 GFC_COMPLEX_10 *c;
1396 const index_type m = xcount, n = ycount, k = count;
1398 /* System generated locals */
1399 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1400 i1, i2, i3, i4, i5, i6;
1402 /* Local variables */
1403 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
1404 f13, f14, f23, f24, f33, f34, f43, f44;
1405 index_type i, j, l, ii, jj, ll;
1406 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1407 GFC_COMPLEX_10 *t1;
1409 a = abase;
1410 b = bbase;
1411 c = retarray->base_addr;
1413 /* Parameter adjustments */
1414 c_dim1 = rystride;
1415 c_offset = 1 + c_dim1;
1416 c -= c_offset;
1417 a_dim1 = aystride;
1418 a_offset = 1 + a_dim1;
1419 a -= a_offset;
1420 b_dim1 = bystride;
1421 b_offset = 1 + b_dim1;
1422 b -= b_offset;
1424 /* Empty c first. */
1425 for (j=1; j<=n; j++)
1426 for (i=1; i<=m; i++)
1427 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
1429 /* Early exit if possible */
1430 if (m == 0 || n == 0 || k == 0)
1431 return;
1433 /* Adjust size of t1 to what is needed. */
1434 index_type t1_dim, a_sz;
1435 if (aystride == 1)
1436 a_sz = rystride;
1437 else
1438 a_sz = a_dim1;
1440 t1_dim = a_sz * 256 + b_dim1;
1441 if (t1_dim > 65536)
1442 t1_dim = 65536;
1444 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
1446 /* Start turning the crank. */
1447 i1 = n;
1448 for (jj = 1; jj <= i1; jj += 512)
1450 /* Computing MIN */
1451 i2 = 512;
1452 i3 = n - jj + 1;
1453 jsec = min(i2,i3);
1454 ujsec = jsec - jsec % 4;
1455 i2 = k;
1456 for (ll = 1; ll <= i2; ll += 256)
1458 /* Computing MIN */
1459 i3 = 256;
1460 i4 = k - ll + 1;
1461 lsec = min(i3,i4);
1462 ulsec = lsec - lsec % 2;
1464 i3 = m;
1465 for (ii = 1; ii <= i3; ii += 256)
1467 /* Computing MIN */
1468 i4 = 256;
1469 i5 = m - ii + 1;
1470 isec = min(i4,i5);
1471 uisec = isec - isec % 2;
1472 i4 = ll + ulsec - 1;
1473 for (l = ll; l <= i4; l += 2)
1475 i5 = ii + uisec - 1;
1476 for (i = ii; i <= i5; i += 2)
1478 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1479 a[i + l * a_dim1];
1480 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1481 a[i + (l + 1) * a_dim1];
1482 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1483 a[i + 1 + l * a_dim1];
1484 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1485 a[i + 1 + (l + 1) * a_dim1];
1487 if (uisec < isec)
1489 t1[l - ll + 1 + (isec << 8) - 257] =
1490 a[ii + isec - 1 + l * a_dim1];
1491 t1[l - ll + 2 + (isec << 8) - 257] =
1492 a[ii + isec - 1 + (l + 1) * a_dim1];
1495 if (ulsec < lsec)
1497 i4 = ii + isec - 1;
1498 for (i = ii; i<= i4; ++i)
1500 t1[lsec + ((i - ii + 1) << 8) - 257] =
1501 a[i + (ll + lsec - 1) * a_dim1];
1505 uisec = isec - isec % 4;
1506 i4 = jj + ujsec - 1;
1507 for (j = jj; j <= i4; j += 4)
1509 i5 = ii + uisec - 1;
1510 for (i = ii; i <= i5; i += 4)
1512 f11 = c[i + j * c_dim1];
1513 f21 = c[i + 1 + j * c_dim1];
1514 f12 = c[i + (j + 1) * c_dim1];
1515 f22 = c[i + 1 + (j + 1) * c_dim1];
1516 f13 = c[i + (j + 2) * c_dim1];
1517 f23 = c[i + 1 + (j + 2) * c_dim1];
1518 f14 = c[i + (j + 3) * c_dim1];
1519 f24 = c[i + 1 + (j + 3) * c_dim1];
1520 f31 = c[i + 2 + j * c_dim1];
1521 f41 = c[i + 3 + j * c_dim1];
1522 f32 = c[i + 2 + (j + 1) * c_dim1];
1523 f42 = c[i + 3 + (j + 1) * c_dim1];
1524 f33 = c[i + 2 + (j + 2) * c_dim1];
1525 f43 = c[i + 3 + (j + 2) * c_dim1];
1526 f34 = c[i + 2 + (j + 3) * c_dim1];
1527 f44 = c[i + 3 + (j + 3) * c_dim1];
1528 i6 = ll + lsec - 1;
1529 for (l = ll; l <= i6; ++l)
1531 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1532 * b[l + j * b_dim1];
1533 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1534 * b[l + j * b_dim1];
1535 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1536 * b[l + (j + 1) * b_dim1];
1537 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1538 * b[l + (j + 1) * b_dim1];
1539 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1540 * b[l + (j + 2) * b_dim1];
1541 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1542 * b[l + (j + 2) * b_dim1];
1543 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1544 * b[l + (j + 3) * b_dim1];
1545 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1546 * b[l + (j + 3) * b_dim1];
1547 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1548 * b[l + j * b_dim1];
1549 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1550 * b[l + j * b_dim1];
1551 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1552 * b[l + (j + 1) * b_dim1];
1553 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1554 * b[l + (j + 1) * b_dim1];
1555 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1556 * b[l + (j + 2) * b_dim1];
1557 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1558 * b[l + (j + 2) * b_dim1];
1559 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1560 * b[l + (j + 3) * b_dim1];
1561 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1562 * b[l + (j + 3) * b_dim1];
1564 c[i + j * c_dim1] = f11;
1565 c[i + 1 + j * c_dim1] = f21;
1566 c[i + (j + 1) * c_dim1] = f12;
1567 c[i + 1 + (j + 1) * c_dim1] = f22;
1568 c[i + (j + 2) * c_dim1] = f13;
1569 c[i + 1 + (j + 2) * c_dim1] = f23;
1570 c[i + (j + 3) * c_dim1] = f14;
1571 c[i + 1 + (j + 3) * c_dim1] = f24;
1572 c[i + 2 + j * c_dim1] = f31;
1573 c[i + 3 + j * c_dim1] = f41;
1574 c[i + 2 + (j + 1) * c_dim1] = f32;
1575 c[i + 3 + (j + 1) * c_dim1] = f42;
1576 c[i + 2 + (j + 2) * c_dim1] = f33;
1577 c[i + 3 + (j + 2) * c_dim1] = f43;
1578 c[i + 2 + (j + 3) * c_dim1] = f34;
1579 c[i + 3 + (j + 3) * c_dim1] = f44;
1581 if (uisec < isec)
1583 i5 = ii + isec - 1;
1584 for (i = ii + uisec; i <= i5; ++i)
1586 f11 = c[i + j * c_dim1];
1587 f12 = c[i + (j + 1) * c_dim1];
1588 f13 = c[i + (j + 2) * c_dim1];
1589 f14 = c[i + (j + 3) * c_dim1];
1590 i6 = ll + lsec - 1;
1591 for (l = ll; l <= i6; ++l)
1593 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1594 257] * b[l + j * b_dim1];
1595 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1596 257] * b[l + (j + 1) * b_dim1];
1597 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1598 257] * b[l + (j + 2) * b_dim1];
1599 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1600 257] * b[l + (j + 3) * b_dim1];
1602 c[i + j * c_dim1] = f11;
1603 c[i + (j + 1) * c_dim1] = f12;
1604 c[i + (j + 2) * c_dim1] = f13;
1605 c[i + (j + 3) * c_dim1] = f14;
1609 if (ujsec < jsec)
1611 i4 = jj + jsec - 1;
1612 for (j = jj + ujsec; j <= i4; ++j)
1614 i5 = ii + uisec - 1;
1615 for (i = ii; i <= i5; i += 4)
1617 f11 = c[i + j * c_dim1];
1618 f21 = c[i + 1 + j * c_dim1];
1619 f31 = c[i + 2 + j * c_dim1];
1620 f41 = c[i + 3 + j * c_dim1];
1621 i6 = ll + lsec - 1;
1622 for (l = ll; l <= i6; ++l)
1624 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1625 257] * b[l + j * b_dim1];
1626 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1627 257] * b[l + j * b_dim1];
1628 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1629 257] * b[l + j * b_dim1];
1630 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1631 257] * b[l + j * b_dim1];
1633 c[i + j * c_dim1] = f11;
1634 c[i + 1 + j * c_dim1] = f21;
1635 c[i + 2 + j * c_dim1] = f31;
1636 c[i + 3 + j * c_dim1] = f41;
1638 i5 = ii + isec - 1;
1639 for (i = ii + uisec; i <= i5; ++i)
1641 f11 = c[i + j * c_dim1];
1642 i6 = ll + lsec - 1;
1643 for (l = ll; l <= i6; ++l)
1645 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1646 257] * b[l + j * b_dim1];
1648 c[i + j * c_dim1] = f11;
1655 free(t1);
1656 return;
1658 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1660 if (GFC_DESCRIPTOR_RANK (a) != 1)
1662 const GFC_COMPLEX_10 *restrict abase_x;
1663 const GFC_COMPLEX_10 *restrict bbase_y;
1664 GFC_COMPLEX_10 *restrict dest_y;
1665 GFC_COMPLEX_10 s;
1667 for (y = 0; y < ycount; y++)
1669 bbase_y = &bbase[y*bystride];
1670 dest_y = &dest[y*rystride];
1671 for (x = 0; x < xcount; x++)
1673 abase_x = &abase[x*axstride];
1674 s = (GFC_COMPLEX_10) 0;
1675 for (n = 0; n < count; n++)
1676 s += abase_x[n] * bbase_y[n];
1677 dest_y[x] = s;
1681 else
1683 const GFC_COMPLEX_10 *restrict bbase_y;
1684 GFC_COMPLEX_10 s;
1686 for (y = 0; y < ycount; y++)
1688 bbase_y = &bbase[y*bystride];
1689 s = (GFC_COMPLEX_10) 0;
1690 for (n = 0; n < count; n++)
1691 s += abase[n*axstride] * bbase_y[n];
1692 dest[y*rystride] = s;
1696 else if (axstride < aystride)
1698 for (y = 0; y < ycount; y++)
1699 for (x = 0; x < xcount; x++)
1700 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
1702 for (y = 0; y < ycount; y++)
1703 for (n = 0; n < count; n++)
1704 for (x = 0; x < xcount; x++)
1705 /* dest[x,y] += a[x,n] * b[n,y] */
1706 dest[x*rxstride + y*rystride] +=
1707 abase[x*axstride + n*aystride] *
1708 bbase[n*bxstride + y*bystride];
1710 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1712 const GFC_COMPLEX_10 *restrict bbase_y;
1713 GFC_COMPLEX_10 s;
1715 for (y = 0; y < ycount; y++)
1717 bbase_y = &bbase[y*bystride];
1718 s = (GFC_COMPLEX_10) 0;
1719 for (n = 0; n < count; n++)
1720 s += abase[n*axstride] * bbase_y[n*bxstride];
1721 dest[y*rxstride] = s;
1724 else
1726 const GFC_COMPLEX_10 *restrict abase_x;
1727 const GFC_COMPLEX_10 *restrict bbase_y;
1728 GFC_COMPLEX_10 *restrict dest_y;
1729 GFC_COMPLEX_10 s;
1731 for (y = 0; y < ycount; y++)
1733 bbase_y = &bbase[y*bystride];
1734 dest_y = &dest[y*rystride];
1735 for (x = 0; x < xcount; x++)
1737 abase_x = &abase[x*axstride];
1738 s = (GFC_COMPLEX_10) 0;
1739 for (n = 0; n < count; n++)
1740 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1741 dest_y[x*rxstride] = s;
1746 #undef POW3
1747 #undef min
1748 #undef max
1750 #endif /* HAVE_AVX512F */
 1752 /* AMD-specific functions with AVX128 and FMA3/FMA4. */
1754 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1755 void
1756 matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
1757 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1758 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1759 internal_proto(matmul_c10_avx128_fma3);
1760 #endif
1762 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1763 void
1764 matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
1765 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1766 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1767 internal_proto(matmul_c10_avx128_fma4);
1768 #endif
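/* Only prototypes appear here: these AVX128/FMA variants are declared with
   internal_proto and are presumably defined in a separate, AVX128-specific
   source file, so that the processor-specific dispatch can select them at
   runtime.  */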
1770 /* Function to fall back to if there is no special processor-specific version. */
1771 static void
1772 matmul_c10_vanilla (gfc_array_c10 * const restrict retarray,
1773 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1774 int blas_limit, blas_call gemm)
1776 const GFC_COMPLEX_10 * restrict abase;
1777 const GFC_COMPLEX_10 * restrict bbase;
1778 GFC_COMPLEX_10 * restrict dest;
1780 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1781 index_type x, y, n, count, xcount, ycount;
1783 assert (GFC_DESCRIPTOR_RANK (a) == 2
1784 || GFC_DESCRIPTOR_RANK (b) == 2);
1786 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1788 Either A or B (but not both) can be rank 1:
1790 o One-dimensional argument A is implicitly treated as a row matrix
1791 dimensioned [1,count], so xcount=1.
1793 o One-dimensional argument B is implicitly treated as a column matrix
 1794        dimensioned [count, 1], so ycount=1.  */
1797 if (retarray->base_addr == NULL)
1799 if (GFC_DESCRIPTOR_RANK (a) == 1)
1801 GFC_DIMENSION_SET(retarray->dim[0], 0,
1802 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1804 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1806 GFC_DIMENSION_SET(retarray->dim[0], 0,
1807 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1809 else
1811 GFC_DIMENSION_SET(retarray->dim[0], 0,
1812 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1814 GFC_DIMENSION_SET(retarray->dim[1], 0,
1815 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1816 GFC_DESCRIPTOR_EXTENT(retarray,0));
1819 retarray->base_addr
1820 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
1821 retarray->offset = 0;
1823 else if (unlikely (compile_options.bounds_check))
1825 index_type ret_extent, arg_extent;
1827 if (GFC_DESCRIPTOR_RANK (a) == 1)
1829 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1830 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1831 if (arg_extent != ret_extent)
1832 runtime_error ("Incorrect extent in return array in"
1833 " MATMUL intrinsic: is %ld, should be %ld",
1834 (long int) ret_extent, (long int) arg_extent);
1836 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1838 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1839 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1840 if (arg_extent != ret_extent)
1841 runtime_error ("Incorrect extent in return array in"
1842 " MATMUL intrinsic: is %ld, should be %ld",
1843 (long int) ret_extent, (long int) arg_extent);
1845 else
1847 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1848 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1849 if (arg_extent != ret_extent)
1850 runtime_error ("Incorrect extent in return array in"
1851 " MATMUL intrinsic for dimension 1:"
1852 " is %ld, should be %ld",
1853 (long int) ret_extent, (long int) arg_extent);
1855 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1856 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1857 if (arg_extent != ret_extent)
1858 runtime_error ("Incorrect extent in return array in"
1859 " MATMUL intrinsic for dimension 2:"
1860 " is %ld, should be %ld",
1861 (long int) ret_extent, (long int) arg_extent);
1866 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1868 /* One-dimensional result may be addressed in the code below
1869 either as a row or a column matrix. We want both cases to
1870 work. */
1871 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1873 else
1875 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1876 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1880 if (GFC_DESCRIPTOR_RANK (a) == 1)
 1882       /* Treat it as a row matrix A[1,count]. */
1883 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1884 aystride = 1;
1886 xcount = 1;
1887 count = GFC_DESCRIPTOR_EXTENT(a,0);
1889 else
1891 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1892 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1894 count = GFC_DESCRIPTOR_EXTENT(a,1);
1895 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1898 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1900 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1901 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1904 if (GFC_DESCRIPTOR_RANK (b) == 1)
 1906       /* Treat it as a column matrix B[count,1] */
1907 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 1909       /* bystride should never be used for 1-dimensional b.
 1910          The value is only used for calculation of the
 1911          memory needed for the buffer.  */
1912 bystride = 256;
1913 ycount = 1;
1915 else
1917 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1918 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1919 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1922 abase = a->base_addr;
1923 bbase = b->base_addr;
1924 dest = retarray->base_addr;
1926 /* Now that everything is set up, we perform the multiplication
1927 itself. */
1929 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1930 #define min(a,b) ((a) <= (b) ? (a) : (b))
1931 #define max(a,b) ((a) >= (b) ? (a) : (b))
1933 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1934 && (bxstride == 1 || bystride == 1)
1935 && (((float) xcount) * ((float) ycount) * ((float) count)
1936 > POW3(blas_limit)))
1938 const int m = xcount, n = ycount, k = count, ldc = rystride;
1939 const GFC_COMPLEX_10 one = 1, zero = 0;
1940 const int lda = (axstride == 1) ? aystride : axstride,
1941 ldb = (bxstride == 1) ? bystride : bxstride;
1943 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1945 assert (gemm != NULL);
1946 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1947 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1948 &ldc, 1, 1);
1949 return;
1953 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1955 /* This block of code implements a tuned matmul, derived from
1956 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1958 Bo Kagstrom and Per Ling
1959 Department of Computing Science
1960 Umea University
1961 S-901 87 Umea, Sweden
1963 from netlib.org, translated to C, and modified for matmul.m4. */
1965 const GFC_COMPLEX_10 *a, *b;
1966 GFC_COMPLEX_10 *c;
1967 const index_type m = xcount, n = ycount, k = count;
1969 /* System generated locals */
1970 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1971 i1, i2, i3, i4, i5, i6;
1973 /* Local variables */
1974 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
1975 f13, f14, f23, f24, f33, f34, f43, f44;
1976 index_type i, j, l, ii, jj, ll;
1977 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1978 GFC_COMPLEX_10 *t1;
1980 a = abase;
1981 b = bbase;
1982 c = retarray->base_addr;
1984 /* Parameter adjustments */
1985 c_dim1 = rystride;
1986 c_offset = 1 + c_dim1;
1987 c -= c_offset;
1988 a_dim1 = aystride;
1989 a_offset = 1 + a_dim1;
1990 a -= a_offset;
1991 b_dim1 = bystride;
1992 b_offset = 1 + b_dim1;
1993 b -= b_offset;
1995 /* Empty c first. */
1996 for (j=1; j<=n; j++)
1997 for (i=1; i<=m; i++)
1998 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
2000 /* Early exit if possible */
2001 if (m == 0 || n == 0 || k == 0)
2002 return;
2004 /* Adjust size of t1 to what is needed. */
2005 index_type t1_dim, a_sz;
2006 if (aystride == 1)
2007 a_sz = rystride;
2008 else
2009 a_sz = a_dim1;
2011 t1_dim = a_sz * 256 + b_dim1;
2012 if (t1_dim > 65536)
2013 t1_dim = 65536;
2015 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
2017 /* Start turning the crank. */
2018 i1 = n;
2019 for (jj = 1; jj <= i1; jj += 512)
2021 /* Computing MIN */
2022 i2 = 512;
2023 i3 = n - jj + 1;
2024 jsec = min(i2,i3);
2025 ujsec = jsec - jsec % 4;
2026 i2 = k;
2027 for (ll = 1; ll <= i2; ll += 256)
2029 /* Computing MIN */
2030 i3 = 256;
2031 i4 = k - ll + 1;
2032 lsec = min(i3,i4);
2033 ulsec = lsec - lsec % 2;
2035 i3 = m;
2036 for (ii = 1; ii <= i3; ii += 256)
2038 /* Computing MIN */
2039 i4 = 256;
2040 i5 = m - ii + 1;
2041 isec = min(i4,i5);
2042 uisec = isec - isec % 2;
2043 i4 = ll + ulsec - 1;
2044 for (l = ll; l <= i4; l += 2)
2046 i5 = ii + uisec - 1;
2047 for (i = ii; i <= i5; i += 2)
2049 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2050 a[i + l * a_dim1];
2051 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2052 a[i + (l + 1) * a_dim1];
2053 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2054 a[i + 1 + l * a_dim1];
2055 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2056 a[i + 1 + (l + 1) * a_dim1];
2058 if (uisec < isec)
2060 t1[l - ll + 1 + (isec << 8) - 257] =
2061 a[ii + isec - 1 + l * a_dim1];
2062 t1[l - ll + 2 + (isec << 8) - 257] =
2063 a[ii + isec - 1 + (l + 1) * a_dim1];
2066 if (ulsec < lsec)
2068 i4 = ii + isec - 1;
2069 for (i = ii; i<= i4; ++i)
2071 t1[lsec + ((i - ii + 1) << 8) - 257] =
2072 a[i + (ll + lsec - 1) * a_dim1];
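/* Main kernel for this block: uisec and ujsec are the parts of the
   block that are multiples of 4; within them f11..f44 hold the 4x4
   tile c(i..i+3, j..j+3) in registers across the l loop, while the
   uisec < isec and ujsec < jsec branches handle the leftover rows
   and columns.  */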
2076 uisec = isec - isec % 4;
2077 i4 = jj + ujsec - 1;
2078 for (j = jj; j <= i4; j += 4)
2080 i5 = ii + uisec - 1;
2081 for (i = ii; i <= i5; i += 4)
2083 f11 = c[i + j * c_dim1];
2084 f21 = c[i + 1 + j * c_dim1];
2085 f12 = c[i + (j + 1) * c_dim1];
2086 f22 = c[i + 1 + (j + 1) * c_dim1];
2087 f13 = c[i + (j + 2) * c_dim1];
2088 f23 = c[i + 1 + (j + 2) * c_dim1];
2089 f14 = c[i + (j + 3) * c_dim1];
2090 f24 = c[i + 1 + (j + 3) * c_dim1];
2091 f31 = c[i + 2 + j * c_dim1];
2092 f41 = c[i + 3 + j * c_dim1];
2093 f32 = c[i + 2 + (j + 1) * c_dim1];
2094 f42 = c[i + 3 + (j + 1) * c_dim1];
2095 f33 = c[i + 2 + (j + 2) * c_dim1];
2096 f43 = c[i + 3 + (j + 2) * c_dim1];
2097 f34 = c[i + 2 + (j + 3) * c_dim1];
2098 f44 = c[i + 3 + (j + 3) * c_dim1];
2099 i6 = ll + lsec - 1;
2100 for (l = ll; l <= i6; ++l)
2102 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2103 * b[l + j * b_dim1];
2104 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2105 * b[l + j * b_dim1];
2106 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2107 * b[l + (j + 1) * b_dim1];
2108 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2109 * b[l + (j + 1) * b_dim1];
2110 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2111 * b[l + (j + 2) * b_dim1];
2112 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2113 * b[l + (j + 2) * b_dim1];
2114 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2115 * b[l + (j + 3) * b_dim1];
2116 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2117 * b[l + (j + 3) * b_dim1];
2118 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2119 * b[l + j * b_dim1];
2120 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2121 * b[l + j * b_dim1];
2122 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2123 * b[l + (j + 1) * b_dim1];
2124 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2125 * b[l + (j + 1) * b_dim1];
2126 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2127 * b[l + (j + 2) * b_dim1];
2128 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2129 * b[l + (j + 2) * b_dim1];
2130 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2131 * b[l + (j + 3) * b_dim1];
2132 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2133 * b[l + (j + 3) * b_dim1];
2135 c[i + j * c_dim1] = f11;
2136 c[i + 1 + j * c_dim1] = f21;
2137 c[i + (j + 1) * c_dim1] = f12;
2138 c[i + 1 + (j + 1) * c_dim1] = f22;
2139 c[i + (j + 2) * c_dim1] = f13;
2140 c[i + 1 + (j + 2) * c_dim1] = f23;
2141 c[i + (j + 3) * c_dim1] = f14;
2142 c[i + 1 + (j + 3) * c_dim1] = f24;
2143 c[i + 2 + j * c_dim1] = f31;
2144 c[i + 3 + j * c_dim1] = f41;
2145 c[i + 2 + (j + 1) * c_dim1] = f32;
2146 c[i + 3 + (j + 1) * c_dim1] = f42;
2147 c[i + 2 + (j + 2) * c_dim1] = f33;
2148 c[i + 3 + (j + 2) * c_dim1] = f43;
2149 c[i + 2 + (j + 3) * c_dim1] = f34;
2150 c[i + 3 + (j + 3) * c_dim1] = f44;
2152 if (uisec < isec)
2154 i5 = ii + isec - 1;
2155 for (i = ii + uisec; i <= i5; ++i)
2157 f11 = c[i + j * c_dim1];
2158 f12 = c[i + (j + 1) * c_dim1];
2159 f13 = c[i + (j + 2) * c_dim1];
2160 f14 = c[i + (j + 3) * c_dim1];
2161 i6 = ll + lsec - 1;
2162 for (l = ll; l <= i6; ++l)
2164 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2165 257] * b[l + j * b_dim1];
2166 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2167 257] * b[l + (j + 1) * b_dim1];
2168 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2169 257] * b[l + (j + 2) * b_dim1];
2170 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2171 257] * b[l + (j + 3) * b_dim1];
2173 c[i + j * c_dim1] = f11;
2174 c[i + (j + 1) * c_dim1] = f12;
2175 c[i + (j + 2) * c_dim1] = f13;
2176 c[i + (j + 3) * c_dim1] = f14;
2180 if (ujsec < jsec)
2182 i4 = jj + jsec - 1;
2183 for (j = jj + ujsec; j <= i4; ++j)
2185 i5 = ii + uisec - 1;
2186 for (i = ii; i <= i5; i += 4)
2188 f11 = c[i + j * c_dim1];
2189 f21 = c[i + 1 + j * c_dim1];
2190 f31 = c[i + 2 + j * c_dim1];
2191 f41 = c[i + 3 + j * c_dim1];
2192 i6 = ll + lsec - 1;
2193 for (l = ll; l <= i6; ++l)
2195 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2196 257] * b[l + j * b_dim1];
2197 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2198 257] * b[l + j * b_dim1];
2199 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2200 257] * b[l + j * b_dim1];
2201 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2202 257] * b[l + j * b_dim1];
2204 c[i + j * c_dim1] = f11;
2205 c[i + 1 + j * c_dim1] = f21;
2206 c[i + 2 + j * c_dim1] = f31;
2207 c[i + 3 + j * c_dim1] = f41;
2209 i5 = ii + isec - 1;
2210 for (i = ii + uisec; i <= i5; ++i)
2212 f11 = c[i + j * c_dim1];
2213 i6 = ll + lsec - 1;
2214 for (l = ll; l <= i6; ++l)
2216 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2217 257] * b[l + j * b_dim1];
2219 c[i + j * c_dim1] = f11;
2226 free(t1);
2227 return;
2229 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2231 if (GFC_DESCRIPTOR_RANK (a) != 1)
2233 const GFC_COMPLEX_10 *restrict abase_x;
2234 const GFC_COMPLEX_10 *restrict bbase_y;
2235 GFC_COMPLEX_10 *restrict dest_y;
2236 GFC_COMPLEX_10 s;
2238 for (y = 0; y < ycount; y++)
2240 bbase_y = &bbase[y*bystride];
2241 dest_y = &dest[y*rystride];
2242 for (x = 0; x < xcount; x++)
2244 abase_x = &abase[x*axstride];
2245 s = (GFC_COMPLEX_10) 0;
2246 for (n = 0; n < count; n++)
2247 s += abase_x[n] * bbase_y[n];
2248 dest_y[x] = s;
2252 else
2254 const GFC_COMPLEX_10 *restrict bbase_y;
2255 GFC_COMPLEX_10 s;
2257 for (y = 0; y < ycount; y++)
2259 bbase_y = &bbase[y*bystride];
2260 s = (GFC_COMPLEX_10) 0;
2261 for (n = 0; n < count; n++)
2262 s += abase[n*axstride] * bbase_y[n];
2263 dest[y*rystride] = s;
2267 else if (axstride < aystride)
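/* Here the first dimension of a varies fastest in memory, so after
   zeroing dest the x loop is kept innermost: consecutive iterations
   step through a by its smaller stride (axstride).  */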
2269 for (y = 0; y < ycount; y++)
2270 for (x = 0; x < xcount; x++)
2271 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
2273 for (y = 0; y < ycount; y++)
2274 for (n = 0; n < count; n++)
2275 for (x = 0; x < xcount; x++)
2276 /* dest[x,y] += a[x,n] * b[n,y] */
2277 dest[x*rxstride + y*rystride] +=
2278 abase[x*axstride + n*aystride] *
2279 bbase[n*bxstride + y*bystride];
2281 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2283 const GFC_COMPLEX_10 *restrict bbase_y;
2284 GFC_COMPLEX_10 s;
2286 for (y = 0; y < ycount; y++)
2288 bbase_y = &bbase[y*bystride];
2289 s = (GFC_COMPLEX_10) 0;
2290 for (n = 0; n < count; n++)
2291 s += abase[n*axstride] * bbase_y[n*bxstride];
2292 dest[y*rxstride] = s;
2295 else
2297 const GFC_COMPLEX_10 *restrict abase_x;
2298 const GFC_COMPLEX_10 *restrict bbase_y;
2299 GFC_COMPLEX_10 *restrict dest_y;
2300 GFC_COMPLEX_10 s;
2302 for (y = 0; y < ycount; y++)
2304 bbase_y = &bbase[y*bystride];
2305 dest_y = &dest[y*rystride];
2306 for (x = 0; x < xcount; x++)
2308 abase_x = &abase[x*axstride];
2309 s = (GFC_COMPLEX_10) 0;
2310 for (n = 0; n < count; n++)
2311 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2312 dest_y[x*rxstride] = s;
2317 #undef POW3
2318 #undef min
2319 #undef max
2322 /* Compile the main function, with selection code for the processor. */
2324 /* Currently, this is i386 only. Adjust for other architectures. */
2326 #include <config/i386/cpuinfo.h>
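/* The exported matmul_c10 below picks an implementation on first use:
   it consults __cpu_model (provided via libgcc's cpuinfo) for the best
   ISA-specific variant that was compiled in, defaults to
   matmul_c10_vanilla, and caches the result in matmul_p.  Relaxed
   atomics suffice here because any thread racing through the selection
   computes the same function pointer.  */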
2327 void matmul_c10 (gfc_array_c10 * const restrict retarray,
2328 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
2329 int blas_limit, blas_call gemm)
2331 static void (*matmul_p) (gfc_array_c10 * const restrict retarray,
2332 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
2333 int blas_limit, blas_call gemm);
2335 void (*matmul_fn) (gfc_array_c10 * const restrict retarray,
2336 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
2337 int blas_limit, blas_call gemm);
2339 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2340 if (matmul_fn == NULL)
2342 matmul_fn = matmul_c10_vanilla;
2343 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2345 /* Run down the available processors in order of preference. */
2346 #ifdef HAVE_AVX512F
2347 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2349 matmul_fn = matmul_c10_avx512f;
2350 goto store;
2353 #endif /* HAVE_AVX512F */
2355 #ifdef HAVE_AVX2
2356 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2357 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2359 matmul_fn = matmul_c10_avx2;
2360 goto store;
2363 #endif
2365 #ifdef HAVE_AVX
2366 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2368 matmul_fn = matmul_c10_avx;
2369 goto store;
2371 #endif /* HAVE_AVX */
2373 else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
2375 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2376 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2377 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2379 matmul_fn = matmul_c10_avx128_fma3;
2380 goto store;
2382 #endif
2383 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2384 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2385 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
2387 matmul_fn = matmul_c10_avx128_fma4;
2388 goto store;
2390 #endif
2393 store:
2394 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2397 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2400 #else /* Just the vanilla function. */
2402 void
2403 matmul_c10 (gfc_array_c10 * const restrict retarray,
2404 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
2405 int blas_limit, blas_call gemm)
2407 const GFC_COMPLEX_10 * restrict abase;
2408 const GFC_COMPLEX_10 * restrict bbase;
2409 GFC_COMPLEX_10 * restrict dest;
2411 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2412 index_type x, y, n, count, xcount, ycount;
2414 assert (GFC_DESCRIPTOR_RANK (a) == 2
2415 || GFC_DESCRIPTOR_RANK (b) == 2);
2417 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2419 Either A or B (but not both) can be rank 1:
2421 o One-dimensional argument A is implicitly treated as a row matrix
2422 dimensioned [1,count], so xcount=1.
2424 o One-dimensional argument B is implicitly treated as a column matrix
2425 dimensioned [count, 1], so ycount=1.
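/* For example, C = MATMUL(A, B) with A of shape (count) and B of
   shape (count, n) is computed as if A were the 1 x count matrix,
   i.e. xcount = 1, ycount = n and, in Fortran terms,
   C(J) = SUM(A(:) * B(:,J)) for J = 1, n.  */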
2428 if (retarray->base_addr == NULL)
2430 if (GFC_DESCRIPTOR_RANK (a) == 1)
2432 GFC_DIMENSION_SET(retarray->dim[0], 0,
2433 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2435 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2437 GFC_DIMENSION_SET(retarray->dim[0], 0,
2438 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2440 else
2442 GFC_DIMENSION_SET(retarray->dim[0], 0,
2443 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2445 GFC_DIMENSION_SET(retarray->dim[1], 0,
2446 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2447 GFC_DESCRIPTOR_EXTENT(retarray,0));
2450 retarray->base_addr
2451 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
2452 retarray->offset = 0;
2454 else if (unlikely (compile_options.bounds_check))
2456 index_type ret_extent, arg_extent;
2458 if (GFC_DESCRIPTOR_RANK (a) == 1)
2460 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2461 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2462 if (arg_extent != ret_extent)
2463 runtime_error ("Incorrect extent in return array in"
2464 " MATMUL intrinsic: is %ld, should be %ld",
2465 (long int) ret_extent, (long int) arg_extent);
2467 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2469 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2470 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2471 if (arg_extent != ret_extent)
2472 runtime_error ("Incorrect extent in return array in"
2473 " MATMUL intrinsic: is %ld, should be %ld",
2474 (long int) ret_extent, (long int) arg_extent);
2476 else
2478 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2479 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2480 if (arg_extent != ret_extent)
2481 runtime_error ("Incorrect extent in return array in"
2482 " MATMUL intrinsic for dimension 1:"
2483 " is %ld, should be %ld",
2484 (long int) ret_extent, (long int) arg_extent);
2486 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2487 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2488 if (arg_extent != ret_extent)
2489 runtime_error ("Incorrect extent in return array in"
2490 " MATMUL intrinsic for dimension 2:"
2491 " is %ld, should be %ld",
2492 (long int) ret_extent, (long int) arg_extent);
2497 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2499 /* A one-dimensional result may be addressed in the code below
2500 either as a row or a column matrix. We want both cases to
2501 work. */
2502 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2504 else
2506 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2507 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2511 if (GFC_DESCRIPTOR_RANK (a) == 1)
2513 /* Treat it as a row matrix A[1,count]. */
2514 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2515 aystride = 1;
2517 xcount = 1;
2518 count = GFC_DESCRIPTOR_EXTENT(a,0);
2520 else
2522 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2523 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2525 count = GFC_DESCRIPTOR_EXTENT(a,1);
2526 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2529 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2531 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2532 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2535 if (GFC_DESCRIPTOR_RANK (b) == 1)
2537 /* Treat it as a column matrix B[count,1] */
2538 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2540 /* bystride should never be used for 1-dimensional b.
2541 The value is only used in the calculation of memory
2542 needed for the buffer. */
2543 bystride = 256;
2544 ycount = 1;
2546 else
2548 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2549 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2550 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2553 abase = a->base_addr;
2554 bbase = b->base_addr;
2555 dest = retarray->base_addr;
2557 /* Now that everything is set up, we perform the multiplication
2558 itself. */
2560 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2561 #define min(a,b) ((a) <= (b) ? (a) : (b))
2562 #define max(a,b) ((a) >= (b) ? (a) : (b))
2564 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2565 && (bxstride == 1 || bystride == 1)
2566 && (((float) xcount) * ((float) ycount) * ((float) count)
2567 > POW3(blas_limit)))
2569 const int m = xcount, n = ycount, k = count, ldc = rystride;
2570 const GFC_COMPLEX_10 one = 1, zero = 0;
2571 const int lda = (axstride == 1) ? aystride : axstride,
2572 ldb = (bxstride == 1) ? bystride : bxstride;
2574 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2576 assert (gemm != NULL);
2577 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2578 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2579 &ldc, 1, 1);
2580 return;
2584 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2586 /* This block of code implements a tuned matmul, derived from
2587 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2589 Bo Kagstrom and Per Ling
2590 Department of Computing Science
2591 Umea University
2592 S-901 87 Umea, Sweden
2594 from netlib.org, translated to C, and modified for matmul.m4. */
2596 const GFC_COMPLEX_10 *a, *b;
2597 GFC_COMPLEX_10 *c;
2598 const index_type m = xcount, n = ycount, k = count;
2600 /* System generated locals */
2601 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2602 i1, i2, i3, i4, i5, i6;
2604 /* Local variables */
2605 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
2606 f13, f14, f23, f24, f33, f34, f43, f44;
2607 index_type i, j, l, ii, jj, ll;
2608 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2609 GFC_COMPLEX_10 *t1;
2611 a = abase;
2612 b = bbase;
2613 c = retarray->base_addr;
2615 /* Parameter adjustments */
2616 c_dim1 = rystride;
2617 c_offset = 1 + c_dim1;
2618 c -= c_offset;
2619 a_dim1 = aystride;
2620 a_offset = 1 + a_dim1;
2621 a -= a_offset;
2622 b_dim1 = bystride;
2623 b_offset = 1 + b_dim1;
2624 b -= b_offset;
2626 /* Empty c first. */
2627 for (j=1; j<=n; j++)
2628 for (i=1; i<=m; i++)
2629 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
2631 /* Early exit if possible */
2632 if (m == 0 || n == 0 || k == 0)
2633 return;
2635 /* Adjust size of t1 to what is needed. */
2636 index_type t1_dim, a_sz;
2637 if (aystride == 1)
2638 a_sz = rystride;
2639 else
2640 a_sz = a_dim1;
2642 t1_dim = a_sz * 256 + b_dim1;
2643 if (t1_dim > 65536)
2644 t1_dim = 65536;
2646 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
2648 /* Start turning the crank. */
2649 i1 = n;
2650 for (jj = 1; jj <= i1; jj += 512)
2652 /* Computing MIN */
2653 i2 = 512;
2654 i3 = n - jj + 1;
2655 jsec = min(i2,i3);
2656 ujsec = jsec - jsec % 4;
2657 i2 = k;
2658 for (ll = 1; ll <= i2; ll += 256)
2660 /* Computing MIN */
2661 i3 = 256;
2662 i4 = k - ll + 1;
2663 lsec = min(i3,i4);
2664 ulsec = lsec - lsec % 2;
2666 i3 = m;
2667 for (ii = 1; ii <= i3; ii += 256)
2669 /* Computing MIN */
2670 i4 = 256;
2671 i5 = m - ii + 1;
2672 isec = min(i4,i5);
2673 uisec = isec - isec % 2;
2674 i4 = ll + ulsec - 1;
2675 for (l = ll; l <= i4; l += 2)
2677 i5 = ii + uisec - 1;
2678 for (i = ii; i <= i5; i += 2)
2680 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2681 a[i + l * a_dim1];
2682 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2683 a[i + (l + 1) * a_dim1];
2684 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2685 a[i + 1 + l * a_dim1];
2686 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2687 a[i + 1 + (l + 1) * a_dim1];
2689 if (uisec < isec)
2691 t1[l - ll + 1 + (isec << 8) - 257] =
2692 a[ii + isec - 1 + l * a_dim1];
2693 t1[l - ll + 2 + (isec << 8) - 257] =
2694 a[ii + isec - 1 + (l + 1) * a_dim1];
2697 if (ulsec < lsec)
2699 i4 = ii + isec - 1;
2700 for (i = ii; i<= i4; ++i)
2702 t1[lsec + ((i - ii + 1) << 8) - 257] =
2703 a[i + (ll + lsec - 1) * a_dim1];
2707 uisec = isec - isec % 4;
2708 i4 = jj + ujsec - 1;
2709 for (j = jj; j <= i4; j += 4)
2711 i5 = ii + uisec - 1;
2712 for (i = ii; i <= i5; i += 4)
2714 f11 = c[i + j * c_dim1];
2715 f21 = c[i + 1 + j * c_dim1];
2716 f12 = c[i + (j + 1) * c_dim1];
2717 f22 = c[i + 1 + (j + 1) * c_dim1];
2718 f13 = c[i + (j + 2) * c_dim1];
2719 f23 = c[i + 1 + (j + 2) * c_dim1];
2720 f14 = c[i + (j + 3) * c_dim1];
2721 f24 = c[i + 1 + (j + 3) * c_dim1];
2722 f31 = c[i + 2 + j * c_dim1];
2723 f41 = c[i + 3 + j * c_dim1];
2724 f32 = c[i + 2 + (j + 1) * c_dim1];
2725 f42 = c[i + 3 + (j + 1) * c_dim1];
2726 f33 = c[i + 2 + (j + 2) * c_dim1];
2727 f43 = c[i + 3 + (j + 2) * c_dim1];
2728 f34 = c[i + 2 + (j + 3) * c_dim1];
2729 f44 = c[i + 3 + (j + 3) * c_dim1];
2730 i6 = ll + lsec - 1;
2731 for (l = ll; l <= i6; ++l)
2733 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2734 * b[l + j * b_dim1];
2735 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2736 * b[l + j * b_dim1];
2737 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2738 * b[l + (j + 1) * b_dim1];
2739 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2740 * b[l + (j + 1) * b_dim1];
2741 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2742 * b[l + (j + 2) * b_dim1];
2743 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2744 * b[l + (j + 2) * b_dim1];
2745 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2746 * b[l + (j + 3) * b_dim1];
2747 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2748 * b[l + (j + 3) * b_dim1];
2749 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2750 * b[l + j * b_dim1];
2751 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2752 * b[l + j * b_dim1];
2753 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2754 * b[l + (j + 1) * b_dim1];
2755 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2756 * b[l + (j + 1) * b_dim1];
2757 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2758 * b[l + (j + 2) * b_dim1];
2759 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2760 * b[l + (j + 2) * b_dim1];
2761 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2762 * b[l + (j + 3) * b_dim1];
2763 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2764 * b[l + (j + 3) * b_dim1];
2766 c[i + j * c_dim1] = f11;
2767 c[i + 1 + j * c_dim1] = f21;
2768 c[i + (j + 1) * c_dim1] = f12;
2769 c[i + 1 + (j + 1) * c_dim1] = f22;
2770 c[i + (j + 2) * c_dim1] = f13;
2771 c[i + 1 + (j + 2) * c_dim1] = f23;
2772 c[i + (j + 3) * c_dim1] = f14;
2773 c[i + 1 + (j + 3) * c_dim1] = f24;
2774 c[i + 2 + j * c_dim1] = f31;
2775 c[i + 3 + j * c_dim1] = f41;
2776 c[i + 2 + (j + 1) * c_dim1] = f32;
2777 c[i + 3 + (j + 1) * c_dim1] = f42;
2778 c[i + 2 + (j + 2) * c_dim1] = f33;
2779 c[i + 3 + (j + 2) * c_dim1] = f43;
2780 c[i + 2 + (j + 3) * c_dim1] = f34;
2781 c[i + 3 + (j + 3) * c_dim1] = f44;
2783 if (uisec < isec)
2785 i5 = ii + isec - 1;
2786 for (i = ii + uisec; i <= i5; ++i)
2788 f11 = c[i + j * c_dim1];
2789 f12 = c[i + (j + 1) * c_dim1];
2790 f13 = c[i + (j + 2) * c_dim1];
2791 f14 = c[i + (j + 3) * c_dim1];
2792 i6 = ll + lsec - 1;
2793 for (l = ll; l <= i6; ++l)
2795 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2796 257] * b[l + j * b_dim1];
2797 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2798 257] * b[l + (j + 1) * b_dim1];
2799 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2800 257] * b[l + (j + 2) * b_dim1];
2801 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2802 257] * b[l + (j + 3) * b_dim1];
2804 c[i + j * c_dim1] = f11;
2805 c[i + (j + 1) * c_dim1] = f12;
2806 c[i + (j + 2) * c_dim1] = f13;
2807 c[i + (j + 3) * c_dim1] = f14;
2811 if (ujsec < jsec)
2813 i4 = jj + jsec - 1;
2814 for (j = jj + ujsec; j <= i4; ++j)
2816 i5 = ii + uisec - 1;
2817 for (i = ii; i <= i5; i += 4)
2819 f11 = c[i + j * c_dim1];
2820 f21 = c[i + 1 + j * c_dim1];
2821 f31 = c[i + 2 + j * c_dim1];
2822 f41 = c[i + 3 + j * c_dim1];
2823 i6 = ll + lsec - 1;
2824 for (l = ll; l <= i6; ++l)
2826 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2827 257] * b[l + j * b_dim1];
2828 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2829 257] * b[l + j * b_dim1];
2830 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2831 257] * b[l + j * b_dim1];
2832 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2833 257] * b[l + j * b_dim1];
2835 c[i + j * c_dim1] = f11;
2836 c[i + 1 + j * c_dim1] = f21;
2837 c[i + 2 + j * c_dim1] = f31;
2838 c[i + 3 + j * c_dim1] = f41;
2840 i5 = ii + isec - 1;
2841 for (i = ii + uisec; i <= i5; ++i)
2843 f11 = c[i + j * c_dim1];
2844 i6 = ll + lsec - 1;
2845 for (l = ll; l <= i6; ++l)
2847 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2848 257] * b[l + j * b_dim1];
2850 c[i + j * c_dim1] = f11;
2857 free(t1);
2858 return;
2860 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2862 if (GFC_DESCRIPTOR_RANK (a) != 1)
2864 const GFC_COMPLEX_10 *restrict abase_x;
2865 const GFC_COMPLEX_10 *restrict bbase_y;
2866 GFC_COMPLEX_10 *restrict dest_y;
2867 GFC_COMPLEX_10 s;
2869 for (y = 0; y < ycount; y++)
2871 bbase_y = &bbase[y*bystride];
2872 dest_y = &dest[y*rystride];
2873 for (x = 0; x < xcount; x++)
2875 abase_x = &abase[x*axstride];
2876 s = (GFC_COMPLEX_10) 0;
2877 for (n = 0; n < count; n++)
2878 s += abase_x[n] * bbase_y[n];
2879 dest_y[x] = s;
2883 else
2885 const GFC_COMPLEX_10 *restrict bbase_y;
2886 GFC_COMPLEX_10 s;
2888 for (y = 0; y < ycount; y++)
2890 bbase_y = &bbase[y*bystride];
2891 s = (GFC_COMPLEX_10) 0;
2892 for (n = 0; n < count; n++)
2893 s += abase[n*axstride] * bbase_y[n];
2894 dest[y*rystride] = s;
2898 else if (axstride < aystride)
2900 for (y = 0; y < ycount; y++)
2901 for (x = 0; x < xcount; x++)
2902 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
2904 for (y = 0; y < ycount; y++)
2905 for (n = 0; n < count; n++)
2906 for (x = 0; x < xcount; x++)
2907 /* dest[x,y] += a[x,n] * b[n,y] */
2908 dest[x*rxstride + y*rystride] +=
2909 abase[x*axstride + n*aystride] *
2910 bbase[n*bxstride + y*bystride];
2912 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2914 const GFC_COMPLEX_10 *restrict bbase_y;
2915 GFC_COMPLEX_10 s;
2917 for (y = 0; y < ycount; y++)
2919 bbase_y = &bbase[y*bystride];
2920 s = (GFC_COMPLEX_10) 0;
2921 for (n = 0; n < count; n++)
2922 s += abase[n*axstride] * bbase_y[n*bxstride];
2923 dest[y*rxstride] = s;
2926 else
2928 const GFC_COMPLEX_10 *restrict abase_x;
2929 const GFC_COMPLEX_10 *restrict bbase_y;
2930 GFC_COMPLEX_10 *restrict dest_y;
2931 GFC_COMPLEX_10 s;
2933 for (y = 0; y < ycount; y++)
2935 bbase_y = &bbase[y*bystride];
2936 dest_y = &dest[y*rystride];
2937 for (x = 0; x < xcount; x++)
2939 abase_x = &abase[x*axstride];
2940 s = (GFC_COMPLEX_10) 0;
2941 for (n = 0; n < count; n++)
2942 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2943 dest_y[x*rxstride] = s;
2948 #undef POW3
2949 #undef min
2950 #undef max
2952 #endif
2953 #endif