1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2016 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
5 This file is part of the GNU Fortran runtime library (libgfortran).
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 #include "libgfortran.h"
32 #if defined (HAVE_GFC_REAL_4)
34 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
35 passed to us by the front-end, in which case we call it for large
38 typedef void (*blas_call
)(const char *, const char *, const int *, const int *,
39 const int *, const GFC_REAL_4
*, const GFC_REAL_4
*,
40 const int *, const GFC_REAL_4
*, const int *,
41 const GFC_REAL_4
*, GFC_REAL_4
*, const int *,
44 /* The order of loops is different in the case of plain matrix
45 multiplication C=MATMUL(A,B), and in the frequent special case where
46 the argument A is the temporary result of a TRANSPOSE intrinsic:
47 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
48 looking at their strides.
50 The equivalent Fortran pseudo-code is:
52 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
53 IF (.NOT.IS_TRANSPOSED(A)) THEN
58 C(I,J) = C(I,J)+A(I,K)*B(K,J)
69 /* If try_blas is set to a nonzero value, then the matmul function will
70 see if there is a way to perform the matrix multiplication by a call
71 to the BLAS gemm function. */
73 extern void matmul_r4 (gfc_array_r4
* const restrict retarray
,
74 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
75 int blas_limit
, blas_call gemm
);
76 export_proto(matmul_r4
);
78 #if defined(HAVE_AVX) && defined(HAVE_AVX2)
79 /* REAL types generate identical code for AVX and AVX2. Only generate
80 an AVX2 function if we are dealing with integer. */
85 /* Put exhaustive list of possible architectures here here, ORed together. */
87 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
91 matmul_r4_avx (gfc_array_r4
* const restrict retarray
,
92 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
93 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx")));
95 matmul_r4_avx (gfc_array_r4
* const restrict retarray
,
96 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
97 int blas_limit
, blas_call gemm
)
99 const GFC_REAL_4
* restrict abase
;
100 const GFC_REAL_4
* restrict bbase
;
101 GFC_REAL_4
* restrict dest
;
103 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
104 index_type x
, y
, n
, count
, xcount
, ycount
;
106 assert (GFC_DESCRIPTOR_RANK (a
) == 2
107 || GFC_DESCRIPTOR_RANK (b
) == 2);
109 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
111 Either A or B (but not both) can be rank 1:
113 o One-dimensional argument A is implicitly treated as a row matrix
114 dimensioned [1,count], so xcount=1.
116 o One-dimensional argument B is implicitly treated as a column matrix
117 dimensioned [count, 1], so ycount=1.
120 if (retarray
->base_addr
== NULL
)
122 if (GFC_DESCRIPTOR_RANK (a
) == 1)
124 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
125 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
127 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
129 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
130 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
134 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
135 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
137 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
138 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
139 GFC_DESCRIPTOR_EXTENT(retarray
,0));
143 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_REAL_4
));
144 retarray
->offset
= 0;
146 else if (unlikely (compile_options
.bounds_check
))
148 index_type ret_extent
, arg_extent
;
150 if (GFC_DESCRIPTOR_RANK (a
) == 1)
152 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
153 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
154 if (arg_extent
!= ret_extent
)
155 runtime_error ("Incorrect extent in return array in"
156 " MATMUL intrinsic: is %ld, should be %ld",
157 (long int) ret_extent
, (long int) arg_extent
);
159 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
161 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
162 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
163 if (arg_extent
!= ret_extent
)
164 runtime_error ("Incorrect extent in return array in"
165 " MATMUL intrinsic: is %ld, should be %ld",
166 (long int) ret_extent
, (long int) arg_extent
);
170 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
171 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
172 if (arg_extent
!= ret_extent
)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 1:"
175 " is %ld, should be %ld",
176 (long int) ret_extent
, (long int) arg_extent
);
178 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
179 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
180 if (arg_extent
!= ret_extent
)
181 runtime_error ("Incorrect extent in return array in"
182 " MATMUL intrinsic for dimension 2:"
183 " is %ld, should be %ld",
184 (long int) ret_extent
, (long int) arg_extent
);
189 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
191 /* One-dimensional result may be addressed in the code below
192 either as a row or a column matrix. We want both cases to
194 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
198 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
199 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
203 if (GFC_DESCRIPTOR_RANK (a
) == 1)
205 /* Treat it as a a row matrix A[1,count]. */
206 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
210 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
214 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
215 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
217 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
218 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
221 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
223 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
224 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
227 if (GFC_DESCRIPTOR_RANK (b
) == 1)
229 /* Treat it as a column matrix B[count,1] */
230 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
232 /* bystride should never be used for 1-dimensional b.
233 in case it is we want it to cause a segfault, rather than
234 an incorrect result. */
235 bystride
= 0xDEADBEEF;
240 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
241 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
242 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
245 abase
= a
->base_addr
;
246 bbase
= b
->base_addr
;
247 dest
= retarray
->base_addr
;
249 /* Now that everything is set up, we perform the multiplication
252 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
253 #define min(a,b) ((a) <= (b) ? (a) : (b))
254 #define max(a,b) ((a) >= (b) ? (a) : (b))
256 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
257 && (bxstride
== 1 || bystride
== 1)
258 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
261 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
262 const GFC_REAL_4 one
= 1, zero
= 0;
263 const int lda
= (axstride
== 1) ? aystride
: axstride
,
264 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
266 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
268 assert (gemm
!= NULL
);
269 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
270 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
276 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
278 /* This block of code implements a tuned matmul, derived from
279 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
281 Bo Kagstrom and Per Ling
282 Department of Computing Science
284 S-901 87 Umea, Sweden
286 from netlib.org, translated to C, and modified for matmul.m4. */
288 const GFC_REAL_4
*a
, *b
;
290 const index_type m
= xcount
, n
= ycount
, k
= count
;
292 /* System generated locals */
293 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
294 i1
, i2
, i3
, i4
, i5
, i6
;
296 /* Local variables */
297 GFC_REAL_4 t1
[65536], /* was [256][256] */
298 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
299 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
300 index_type i
, j
, l
, ii
, jj
, ll
;
301 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
305 c
= retarray
->base_addr
;
307 /* Parameter adjustments */
309 c_offset
= 1 + c_dim1
;
312 a_offset
= 1 + a_dim1
;
315 b_offset
= 1 + b_dim1
;
318 /* Early exit if possible */
319 if (m
== 0 || n
== 0 || k
== 0)
325 c
[i
+ j
* c_dim1
] = (GFC_REAL_4
)0;
327 /* Start turning the crank. */
329 for (jj
= 1; jj
<= i1
; jj
+= 512)
335 ujsec
= jsec
- jsec
% 4;
337 for (ll
= 1; ll
<= i2
; ll
+= 256)
343 ulsec
= lsec
- lsec
% 2;
346 for (ii
= 1; ii
<= i3
; ii
+= 256)
352 uisec
= isec
- isec
% 2;
354 for (l
= ll
; l
<= i4
; l
+= 2)
357 for (i
= ii
; i
<= i5
; i
+= 2)
359 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
361 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
362 a
[i
+ (l
+ 1) * a_dim1
];
363 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
364 a
[i
+ 1 + l
* a_dim1
];
365 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
366 a
[i
+ 1 + (l
+ 1) * a_dim1
];
370 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
371 a
[ii
+ isec
- 1 + l
* a_dim1
];
372 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
373 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
379 for (i
= ii
; i
<= i4
; ++i
)
381 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
382 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
386 uisec
= isec
- isec
% 4;
388 for (j
= jj
; j
<= i4
; j
+= 4)
391 for (i
= ii
; i
<= i5
; i
+= 4)
393 f11
= c
[i
+ j
* c_dim1
];
394 f21
= c
[i
+ 1 + j
* c_dim1
];
395 f12
= c
[i
+ (j
+ 1) * c_dim1
];
396 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
397 f13
= c
[i
+ (j
+ 2) * c_dim1
];
398 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
399 f14
= c
[i
+ (j
+ 3) * c_dim1
];
400 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
401 f31
= c
[i
+ 2 + j
* c_dim1
];
402 f41
= c
[i
+ 3 + j
* c_dim1
];
403 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
404 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
405 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
406 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
407 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
408 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
410 for (l
= ll
; l
<= i6
; ++l
)
412 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
414 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
416 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
417 * b
[l
+ (j
+ 1) * b_dim1
];
418 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
419 * b
[l
+ (j
+ 1) * b_dim1
];
420 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
421 * b
[l
+ (j
+ 2) * b_dim1
];
422 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
423 * b
[l
+ (j
+ 2) * b_dim1
];
424 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
425 * b
[l
+ (j
+ 3) * b_dim1
];
426 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
427 * b
[l
+ (j
+ 3) * b_dim1
];
428 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
430 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
432 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
433 * b
[l
+ (j
+ 1) * b_dim1
];
434 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
435 * b
[l
+ (j
+ 1) * b_dim1
];
436 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
437 * b
[l
+ (j
+ 2) * b_dim1
];
438 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
439 * b
[l
+ (j
+ 2) * b_dim1
];
440 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
441 * b
[l
+ (j
+ 3) * b_dim1
];
442 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
443 * b
[l
+ (j
+ 3) * b_dim1
];
445 c
[i
+ j
* c_dim1
] = f11
;
446 c
[i
+ 1 + j
* c_dim1
] = f21
;
447 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
448 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
449 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
450 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
451 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
452 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
453 c
[i
+ 2 + j
* c_dim1
] = f31
;
454 c
[i
+ 3 + j
* c_dim1
] = f41
;
455 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
456 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
457 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
458 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
459 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
460 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
465 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
467 f11
= c
[i
+ j
* c_dim1
];
468 f12
= c
[i
+ (j
+ 1) * c_dim1
];
469 f13
= c
[i
+ (j
+ 2) * c_dim1
];
470 f14
= c
[i
+ (j
+ 3) * c_dim1
];
472 for (l
= ll
; l
<= i6
; ++l
)
474 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
475 257] * b
[l
+ j
* b_dim1
];
476 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
477 257] * b
[l
+ (j
+ 1) * b_dim1
];
478 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
479 257] * b
[l
+ (j
+ 2) * b_dim1
];
480 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
481 257] * b
[l
+ (j
+ 3) * b_dim1
];
483 c
[i
+ j
* c_dim1
] = f11
;
484 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
485 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
486 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
493 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
496 for (i
= ii
; i
<= i5
; i
+= 4)
498 f11
= c
[i
+ j
* c_dim1
];
499 f21
= c
[i
+ 1 + j
* c_dim1
];
500 f31
= c
[i
+ 2 + j
* c_dim1
];
501 f41
= c
[i
+ 3 + j
* c_dim1
];
503 for (l
= ll
; l
<= i6
; ++l
)
505 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
506 257] * b
[l
+ j
* b_dim1
];
507 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
508 257] * b
[l
+ j
* b_dim1
];
509 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
510 257] * b
[l
+ j
* b_dim1
];
511 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
512 257] * b
[l
+ j
* b_dim1
];
514 c
[i
+ j
* c_dim1
] = f11
;
515 c
[i
+ 1 + j
* c_dim1
] = f21
;
516 c
[i
+ 2 + j
* c_dim1
] = f31
;
517 c
[i
+ 3 + j
* c_dim1
] = f41
;
520 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
522 f11
= c
[i
+ j
* c_dim1
];
524 for (l
= ll
; l
<= i6
; ++l
)
526 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
527 257] * b
[l
+ j
* b_dim1
];
529 c
[i
+ j
* c_dim1
] = f11
;
538 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
540 if (GFC_DESCRIPTOR_RANK (a
) != 1)
542 const GFC_REAL_4
*restrict abase_x
;
543 const GFC_REAL_4
*restrict bbase_y
;
544 GFC_REAL_4
*restrict dest_y
;
547 for (y
= 0; y
< ycount
; y
++)
549 bbase_y
= &bbase
[y
*bystride
];
550 dest_y
= &dest
[y
*rystride
];
551 for (x
= 0; x
< xcount
; x
++)
553 abase_x
= &abase
[x
*axstride
];
555 for (n
= 0; n
< count
; n
++)
556 s
+= abase_x
[n
] * bbase_y
[n
];
563 const GFC_REAL_4
*restrict bbase_y
;
566 for (y
= 0; y
< ycount
; y
++)
568 bbase_y
= &bbase
[y
*bystride
];
570 for (n
= 0; n
< count
; n
++)
571 s
+= abase
[n
*axstride
] * bbase_y
[n
];
572 dest
[y
*rystride
] = s
;
576 else if (axstride
< aystride
)
578 for (y
= 0; y
< ycount
; y
++)
579 for (x
= 0; x
< xcount
; x
++)
580 dest
[x
*rxstride
+ y
*rystride
] = (GFC_REAL_4
)0;
582 for (y
= 0; y
< ycount
; y
++)
583 for (n
= 0; n
< count
; n
++)
584 for (x
= 0; x
< xcount
; x
++)
585 /* dest[x,y] += a[x,n] * b[n,y] */
586 dest
[x
*rxstride
+ y
*rystride
] +=
587 abase
[x
*axstride
+ n
*aystride
] *
588 bbase
[n
*bxstride
+ y
*bystride
];
590 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
592 const GFC_REAL_4
*restrict bbase_y
;
595 for (y
= 0; y
< ycount
; y
++)
597 bbase_y
= &bbase
[y
*bystride
];
599 for (n
= 0; n
< count
; n
++)
600 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
601 dest
[y
*rxstride
] = s
;
606 const GFC_REAL_4
*restrict abase_x
;
607 const GFC_REAL_4
*restrict bbase_y
;
608 GFC_REAL_4
*restrict dest_y
;
611 for (y
= 0; y
< ycount
; y
++)
613 bbase_y
= &bbase
[y
*bystride
];
614 dest_y
= &dest
[y
*rystride
];
615 for (x
= 0; x
< xcount
; x
++)
617 abase_x
= &abase
[x
*axstride
];
619 for (n
= 0; n
< count
; n
++)
620 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
621 dest_y
[x
*rxstride
] = s
;
630 #endif /* HAVE_AVX */
634 matmul_r4_avx2 (gfc_array_r4
* const restrict retarray
,
635 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
636 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx2")));
638 matmul_r4_avx2 (gfc_array_r4
* const restrict retarray
,
639 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
640 int blas_limit
, blas_call gemm
)
642 const GFC_REAL_4
* restrict abase
;
643 const GFC_REAL_4
* restrict bbase
;
644 GFC_REAL_4
* restrict dest
;
646 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
647 index_type x
, y
, n
, count
, xcount
, ycount
;
649 assert (GFC_DESCRIPTOR_RANK (a
) == 2
650 || GFC_DESCRIPTOR_RANK (b
) == 2);
652 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
654 Either A or B (but not both) can be rank 1:
656 o One-dimensional argument A is implicitly treated as a row matrix
657 dimensioned [1,count], so xcount=1.
659 o One-dimensional argument B is implicitly treated as a column matrix
660 dimensioned [count, 1], so ycount=1.
663 if (retarray
->base_addr
== NULL
)
665 if (GFC_DESCRIPTOR_RANK (a
) == 1)
667 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
668 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
670 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
672 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
673 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
677 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
678 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
680 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
681 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
682 GFC_DESCRIPTOR_EXTENT(retarray
,0));
686 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_REAL_4
));
687 retarray
->offset
= 0;
689 else if (unlikely (compile_options
.bounds_check
))
691 index_type ret_extent
, arg_extent
;
693 if (GFC_DESCRIPTOR_RANK (a
) == 1)
695 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
696 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
697 if (arg_extent
!= ret_extent
)
698 runtime_error ("Incorrect extent in return array in"
699 " MATMUL intrinsic: is %ld, should be %ld",
700 (long int) ret_extent
, (long int) arg_extent
);
702 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
704 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
705 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
706 if (arg_extent
!= ret_extent
)
707 runtime_error ("Incorrect extent in return array in"
708 " MATMUL intrinsic: is %ld, should be %ld",
709 (long int) ret_extent
, (long int) arg_extent
);
713 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
714 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
715 if (arg_extent
!= ret_extent
)
716 runtime_error ("Incorrect extent in return array in"
717 " MATMUL intrinsic for dimension 1:"
718 " is %ld, should be %ld",
719 (long int) ret_extent
, (long int) arg_extent
);
721 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
722 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
723 if (arg_extent
!= ret_extent
)
724 runtime_error ("Incorrect extent in return array in"
725 " MATMUL intrinsic for dimension 2:"
726 " is %ld, should be %ld",
727 (long int) ret_extent
, (long int) arg_extent
);
732 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
734 /* One-dimensional result may be addressed in the code below
735 either as a row or a column matrix. We want both cases to
737 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
741 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
742 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
746 if (GFC_DESCRIPTOR_RANK (a
) == 1)
748 /* Treat it as a a row matrix A[1,count]. */
749 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
753 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
757 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
758 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
760 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
761 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
764 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
766 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
767 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
770 if (GFC_DESCRIPTOR_RANK (b
) == 1)
772 /* Treat it as a column matrix B[count,1] */
773 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
775 /* bystride should never be used for 1-dimensional b.
776 in case it is we want it to cause a segfault, rather than
777 an incorrect result. */
778 bystride
= 0xDEADBEEF;
783 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
784 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
785 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
788 abase
= a
->base_addr
;
789 bbase
= b
->base_addr
;
790 dest
= retarray
->base_addr
;
792 /* Now that everything is set up, we perform the multiplication
795 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
796 #define min(a,b) ((a) <= (b) ? (a) : (b))
797 #define max(a,b) ((a) >= (b) ? (a) : (b))
799 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
800 && (bxstride
== 1 || bystride
== 1)
801 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
804 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
805 const GFC_REAL_4 one
= 1, zero
= 0;
806 const int lda
= (axstride
== 1) ? aystride
: axstride
,
807 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
809 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
811 assert (gemm
!= NULL
);
812 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
813 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
819 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
821 /* This block of code implements a tuned matmul, derived from
822 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
824 Bo Kagstrom and Per Ling
825 Department of Computing Science
827 S-901 87 Umea, Sweden
829 from netlib.org, translated to C, and modified for matmul.m4. */
831 const GFC_REAL_4
*a
, *b
;
833 const index_type m
= xcount
, n
= ycount
, k
= count
;
835 /* System generated locals */
836 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
837 i1
, i2
, i3
, i4
, i5
, i6
;
839 /* Local variables */
840 GFC_REAL_4 t1
[65536], /* was [256][256] */
841 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
842 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
843 index_type i
, j
, l
, ii
, jj
, ll
;
844 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
848 c
= retarray
->base_addr
;
850 /* Parameter adjustments */
852 c_offset
= 1 + c_dim1
;
855 a_offset
= 1 + a_dim1
;
858 b_offset
= 1 + b_dim1
;
861 /* Early exit if possible */
862 if (m
== 0 || n
== 0 || k
== 0)
868 c
[i
+ j
* c_dim1
] = (GFC_REAL_4
)0;
870 /* Start turning the crank. */
872 for (jj
= 1; jj
<= i1
; jj
+= 512)
878 ujsec
= jsec
- jsec
% 4;
880 for (ll
= 1; ll
<= i2
; ll
+= 256)
886 ulsec
= lsec
- lsec
% 2;
889 for (ii
= 1; ii
<= i3
; ii
+= 256)
895 uisec
= isec
- isec
% 2;
897 for (l
= ll
; l
<= i4
; l
+= 2)
900 for (i
= ii
; i
<= i5
; i
+= 2)
902 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
904 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
905 a
[i
+ (l
+ 1) * a_dim1
];
906 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
907 a
[i
+ 1 + l
* a_dim1
];
908 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
909 a
[i
+ 1 + (l
+ 1) * a_dim1
];
913 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
914 a
[ii
+ isec
- 1 + l
* a_dim1
];
915 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
916 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
922 for (i
= ii
; i
<= i4
; ++i
)
924 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
925 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
929 uisec
= isec
- isec
% 4;
931 for (j
= jj
; j
<= i4
; j
+= 4)
934 for (i
= ii
; i
<= i5
; i
+= 4)
936 f11
= c
[i
+ j
* c_dim1
];
937 f21
= c
[i
+ 1 + j
* c_dim1
];
938 f12
= c
[i
+ (j
+ 1) * c_dim1
];
939 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
940 f13
= c
[i
+ (j
+ 2) * c_dim1
];
941 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
942 f14
= c
[i
+ (j
+ 3) * c_dim1
];
943 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
944 f31
= c
[i
+ 2 + j
* c_dim1
];
945 f41
= c
[i
+ 3 + j
* c_dim1
];
946 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
947 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
948 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
949 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
950 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
951 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
953 for (l
= ll
; l
<= i6
; ++l
)
955 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
957 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
959 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
960 * b
[l
+ (j
+ 1) * b_dim1
];
961 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
962 * b
[l
+ (j
+ 1) * b_dim1
];
963 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
964 * b
[l
+ (j
+ 2) * b_dim1
];
965 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
966 * b
[l
+ (j
+ 2) * b_dim1
];
967 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
968 * b
[l
+ (j
+ 3) * b_dim1
];
969 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
970 * b
[l
+ (j
+ 3) * b_dim1
];
971 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
973 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
975 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
976 * b
[l
+ (j
+ 1) * b_dim1
];
977 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
978 * b
[l
+ (j
+ 1) * b_dim1
];
979 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
980 * b
[l
+ (j
+ 2) * b_dim1
];
981 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
982 * b
[l
+ (j
+ 2) * b_dim1
];
983 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
984 * b
[l
+ (j
+ 3) * b_dim1
];
985 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
986 * b
[l
+ (j
+ 3) * b_dim1
];
988 c
[i
+ j
* c_dim1
] = f11
;
989 c
[i
+ 1 + j
* c_dim1
] = f21
;
990 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
991 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
992 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
993 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
994 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
995 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
996 c
[i
+ 2 + j
* c_dim1
] = f31
;
997 c
[i
+ 3 + j
* c_dim1
] = f41
;
998 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
999 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
1000 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1001 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1002 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1003 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1008 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1010 f11
= c
[i
+ j
* c_dim1
];
1011 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1012 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1013 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1015 for (l
= ll
; l
<= i6
; ++l
)
1017 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1018 257] * b
[l
+ j
* b_dim1
];
1019 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1020 257] * b
[l
+ (j
+ 1) * b_dim1
];
1021 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1022 257] * b
[l
+ (j
+ 2) * b_dim1
];
1023 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1024 257] * b
[l
+ (j
+ 3) * b_dim1
];
1026 c
[i
+ j
* c_dim1
] = f11
;
1027 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1028 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1029 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1036 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1038 i5
= ii
+ uisec
- 1;
1039 for (i
= ii
; i
<= i5
; i
+= 4)
1041 f11
= c
[i
+ j
* c_dim1
];
1042 f21
= c
[i
+ 1 + j
* c_dim1
];
1043 f31
= c
[i
+ 2 + j
* c_dim1
];
1044 f41
= c
[i
+ 3 + j
* c_dim1
];
1046 for (l
= ll
; l
<= i6
; ++l
)
1048 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1049 257] * b
[l
+ j
* b_dim1
];
1050 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1051 257] * b
[l
+ j
* b_dim1
];
1052 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1053 257] * b
[l
+ j
* b_dim1
];
1054 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1055 257] * b
[l
+ j
* b_dim1
];
1057 c
[i
+ j
* c_dim1
] = f11
;
1058 c
[i
+ 1 + j
* c_dim1
] = f21
;
1059 c
[i
+ 2 + j
* c_dim1
] = f31
;
1060 c
[i
+ 3 + j
* c_dim1
] = f41
;
1063 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1065 f11
= c
[i
+ j
* c_dim1
];
1067 for (l
= ll
; l
<= i6
; ++l
)
1069 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1070 257] * b
[l
+ j
* b_dim1
];
1072 c
[i
+ j
* c_dim1
] = f11
;
1081 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1083 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1085 const GFC_REAL_4
*restrict abase_x
;
1086 const GFC_REAL_4
*restrict bbase_y
;
1087 GFC_REAL_4
*restrict dest_y
;
1090 for (y
= 0; y
< ycount
; y
++)
1092 bbase_y
= &bbase
[y
*bystride
];
1093 dest_y
= &dest
[y
*rystride
];
1094 for (x
= 0; x
< xcount
; x
++)
1096 abase_x
= &abase
[x
*axstride
];
1098 for (n
= 0; n
< count
; n
++)
1099 s
+= abase_x
[n
] * bbase_y
[n
];
1106 const GFC_REAL_4
*restrict bbase_y
;
1109 for (y
= 0; y
< ycount
; y
++)
1111 bbase_y
= &bbase
[y
*bystride
];
1113 for (n
= 0; n
< count
; n
++)
1114 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1115 dest
[y
*rystride
] = s
;
1119 else if (axstride
< aystride
)
1121 for (y
= 0; y
< ycount
; y
++)
1122 for (x
= 0; x
< xcount
; x
++)
1123 dest
[x
*rxstride
+ y
*rystride
] = (GFC_REAL_4
)0;
1125 for (y
= 0; y
< ycount
; y
++)
1126 for (n
= 0; n
< count
; n
++)
1127 for (x
= 0; x
< xcount
; x
++)
1128 /* dest[x,y] += a[x,n] * b[n,y] */
1129 dest
[x
*rxstride
+ y
*rystride
] +=
1130 abase
[x
*axstride
+ n
*aystride
] *
1131 bbase
[n
*bxstride
+ y
*bystride
];
1133 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1135 const GFC_REAL_4
*restrict bbase_y
;
1138 for (y
= 0; y
< ycount
; y
++)
1140 bbase_y
= &bbase
[y
*bystride
];
1142 for (n
= 0; n
< count
; n
++)
1143 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1144 dest
[y
*rxstride
] = s
;
1149 const GFC_REAL_4
*restrict abase_x
;
1150 const GFC_REAL_4
*restrict bbase_y
;
1151 GFC_REAL_4
*restrict dest_y
;
1154 for (y
= 0; y
< ycount
; y
++)
1156 bbase_y
= &bbase
[y
*bystride
];
1157 dest_y
= &dest
[y
*rystride
];
1158 for (x
= 0; x
< xcount
; x
++)
1160 abase_x
= &abase
[x
*axstride
];
1162 for (n
= 0; n
< count
; n
++)
1163 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1164 dest_y
[x
*rxstride
] = s
;
1173 #endif /* HAVE_AVX2 */
1177 matmul_r4_avx512f (gfc_array_r4
* const restrict retarray
,
1178 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
1179 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx512f")));
1181 matmul_r4_avx512f (gfc_array_r4
* const restrict retarray
,
1182 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
1183 int blas_limit
, blas_call gemm
)
1185 const GFC_REAL_4
* restrict abase
;
1186 const GFC_REAL_4
* restrict bbase
;
1187 GFC_REAL_4
* restrict dest
;
1189 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1190 index_type x
, y
, n
, count
, xcount
, ycount
;
1192 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1193 || GFC_DESCRIPTOR_RANK (b
) == 2);
1195 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1197 Either A or B (but not both) can be rank 1:
1199 o One-dimensional argument A is implicitly treated as a row matrix
1200 dimensioned [1,count], so xcount=1.
1202 o One-dimensional argument B is implicitly treated as a column matrix
1203 dimensioned [count, 1], so ycount=1.
1206 if (retarray
->base_addr
== NULL
)
1208 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1210 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1211 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1213 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1215 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1216 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1220 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1221 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1223 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1224 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1225 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1229 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_REAL_4
));
1230 retarray
->offset
= 0;
1232 else if (unlikely (compile_options
.bounds_check
))
1234 index_type ret_extent
, arg_extent
;
1236 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1238 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1239 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1240 if (arg_extent
!= ret_extent
)
1241 runtime_error ("Incorrect extent in return array in"
1242 " MATMUL intrinsic: is %ld, should be %ld",
1243 (long int) ret_extent
, (long int) arg_extent
);
1245 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1247 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1248 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1249 if (arg_extent
!= ret_extent
)
1250 runtime_error ("Incorrect extent in return array in"
1251 " MATMUL intrinsic: is %ld, should be %ld",
1252 (long int) ret_extent
, (long int) arg_extent
);
1256 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1257 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1258 if (arg_extent
!= ret_extent
)
1259 runtime_error ("Incorrect extent in return array in"
1260 " MATMUL intrinsic for dimension 1:"
1261 " is %ld, should be %ld",
1262 (long int) ret_extent
, (long int) arg_extent
);
1264 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1265 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1266 if (arg_extent
!= ret_extent
)
1267 runtime_error ("Incorrect extent in return array in"
1268 " MATMUL intrinsic for dimension 2:"
1269 " is %ld, should be %ld",
1270 (long int) ret_extent
, (long int) arg_extent
);
1275 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
1277 /* One-dimensional result may be addressed in the code below
1278 either as a row or a column matrix. We want both cases to
1280 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1284 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1285 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1289 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1291 /* Treat it as a a row matrix A[1,count]. */
1292 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1296 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1300 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1301 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1303 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1304 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1307 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1309 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1310 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1313 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1315 /* Treat it as a column matrix B[count,1] */
1316 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1318 /* bystride should never be used for 1-dimensional b.
1319 in case it is we want it to cause a segfault, rather than
1320 an incorrect result. */
1321 bystride
= 0xDEADBEEF;
1326 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1327 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1328 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1331 abase
= a
->base_addr
;
1332 bbase
= b
->base_addr
;
1333 dest
= retarray
->base_addr
;
1335 /* Now that everything is set up, we perform the multiplication
1338 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1339 #define min(a,b) ((a) <= (b) ? (a) : (b))
1340 #define max(a,b) ((a) >= (b) ? (a) : (b))
1342 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1343 && (bxstride
== 1 || bystride
== 1)
1344 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1345 > POW3(blas_limit
)))
1347 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1348 const GFC_REAL_4 one
= 1, zero
= 0;
1349 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1350 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1352 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1354 assert (gemm
!= NULL
);
1355 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1356 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1362 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1364 /* This block of code implements a tuned matmul, derived from
1365 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1367 Bo Kagstrom and Per Ling
1368 Department of Computing Science
1370 S-901 87 Umea, Sweden
1372 from netlib.org, translated to C, and modified for matmul.m4. */
1374 const GFC_REAL_4
*a
, *b
;
1376 const index_type m
= xcount
, n
= ycount
, k
= count
;
1378 /* System generated locals */
1379 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1380 i1
, i2
, i3
, i4
, i5
, i6
;
1382 /* Local variables */
1383 GFC_REAL_4 t1
[65536], /* was [256][256] */
1384 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1385 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1386 index_type i
, j
, l
, ii
, jj
, ll
;
1387 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1391 c
= retarray
->base_addr
;
1393 /* Parameter adjustments */
1395 c_offset
= 1 + c_dim1
;
1398 a_offset
= 1 + a_dim1
;
1401 b_offset
= 1 + b_dim1
;
1404 /* Early exit if possible */
1405 if (m
== 0 || n
== 0 || k
== 0)
1408 /* Empty c first. */
1409 for (j
=1; j
<=n
; j
++)
1410 for (i
=1; i
<=m
; i
++)
1411 c
[i
+ j
* c_dim1
] = (GFC_REAL_4
)0;
1413 /* Start turning the crank. */
1415 for (jj
= 1; jj
<= i1
; jj
+= 512)
1421 ujsec
= jsec
- jsec
% 4;
1423 for (ll
= 1; ll
<= i2
; ll
+= 256)
1429 ulsec
= lsec
- lsec
% 2;
1432 for (ii
= 1; ii
<= i3
; ii
+= 256)
1438 uisec
= isec
- isec
% 2;
1439 i4
= ll
+ ulsec
- 1;
1440 for (l
= ll
; l
<= i4
; l
+= 2)
1442 i5
= ii
+ uisec
- 1;
1443 for (i
= ii
; i
<= i5
; i
+= 2)
1445 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
1447 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
1448 a
[i
+ (l
+ 1) * a_dim1
];
1449 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
1450 a
[i
+ 1 + l
* a_dim1
];
1451 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
1452 a
[i
+ 1 + (l
+ 1) * a_dim1
];
1456 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
1457 a
[ii
+ isec
- 1 + l
* a_dim1
];
1458 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
1459 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
1465 for (i
= ii
; i
<= i4
; ++i
)
1467 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
1468 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
1472 uisec
= isec
- isec
% 4;
1473 i4
= jj
+ ujsec
- 1;
1474 for (j
= jj
; j
<= i4
; j
+= 4)
1476 i5
= ii
+ uisec
- 1;
1477 for (i
= ii
; i
<= i5
; i
+= 4)
1479 f11
= c
[i
+ j
* c_dim1
];
1480 f21
= c
[i
+ 1 + j
* c_dim1
];
1481 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1482 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
1483 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1484 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
1485 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1486 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
1487 f31
= c
[i
+ 2 + j
* c_dim1
];
1488 f41
= c
[i
+ 3 + j
* c_dim1
];
1489 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
1490 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
1491 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
1492 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
1493 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
1494 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
1496 for (l
= ll
; l
<= i6
; ++l
)
1498 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1499 * b
[l
+ j
* b_dim1
];
1500 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1501 * b
[l
+ j
* b_dim1
];
1502 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1503 * b
[l
+ (j
+ 1) * b_dim1
];
1504 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1505 * b
[l
+ (j
+ 1) * b_dim1
];
1506 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1507 * b
[l
+ (j
+ 2) * b_dim1
];
1508 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1509 * b
[l
+ (j
+ 2) * b_dim1
];
1510 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1511 * b
[l
+ (j
+ 3) * b_dim1
];
1512 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1513 * b
[l
+ (j
+ 3) * b_dim1
];
1514 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1515 * b
[l
+ j
* b_dim1
];
1516 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1517 * b
[l
+ j
* b_dim1
];
1518 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1519 * b
[l
+ (j
+ 1) * b_dim1
];
1520 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1521 * b
[l
+ (j
+ 1) * b_dim1
];
1522 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1523 * b
[l
+ (j
+ 2) * b_dim1
];
1524 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1525 * b
[l
+ (j
+ 2) * b_dim1
];
1526 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1527 * b
[l
+ (j
+ 3) * b_dim1
];
1528 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1529 * b
[l
+ (j
+ 3) * b_dim1
];
1531 c
[i
+ j
* c_dim1
] = f11
;
1532 c
[i
+ 1 + j
* c_dim1
] = f21
;
1533 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1534 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
1535 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1536 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
1537 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1538 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
1539 c
[i
+ 2 + j
* c_dim1
] = f31
;
1540 c
[i
+ 3 + j
* c_dim1
] = f41
;
1541 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
1542 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
1543 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1544 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1545 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1546 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1551 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1553 f11
= c
[i
+ j
* c_dim1
];
1554 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1555 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1556 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1558 for (l
= ll
; l
<= i6
; ++l
)
1560 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1561 257] * b
[l
+ j
* b_dim1
];
1562 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1563 257] * b
[l
+ (j
+ 1) * b_dim1
];
1564 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1565 257] * b
[l
+ (j
+ 2) * b_dim1
];
1566 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1567 257] * b
[l
+ (j
+ 3) * b_dim1
];
1569 c
[i
+ j
* c_dim1
] = f11
;
1570 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1571 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1572 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1579 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1581 i5
= ii
+ uisec
- 1;
1582 for (i
= ii
; i
<= i5
; i
+= 4)
1584 f11
= c
[i
+ j
* c_dim1
];
1585 f21
= c
[i
+ 1 + j
* c_dim1
];
1586 f31
= c
[i
+ 2 + j
* c_dim1
];
1587 f41
= c
[i
+ 3 + j
* c_dim1
];
1589 for (l
= ll
; l
<= i6
; ++l
)
1591 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1592 257] * b
[l
+ j
* b_dim1
];
1593 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1594 257] * b
[l
+ j
* b_dim1
];
1595 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1596 257] * b
[l
+ j
* b_dim1
];
1597 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1598 257] * b
[l
+ j
* b_dim1
];
1600 c
[i
+ j
* c_dim1
] = f11
;
1601 c
[i
+ 1 + j
* c_dim1
] = f21
;
1602 c
[i
+ 2 + j
* c_dim1
] = f31
;
1603 c
[i
+ 3 + j
* c_dim1
] = f41
;
1606 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1608 f11
= c
[i
+ j
* c_dim1
];
1610 for (l
= ll
; l
<= i6
; ++l
)
1612 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1613 257] * b
[l
+ j
* b_dim1
];
1615 c
[i
+ j
* c_dim1
] = f11
;
1624 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1626 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1628 const GFC_REAL_4
*restrict abase_x
;
1629 const GFC_REAL_4
*restrict bbase_y
;
1630 GFC_REAL_4
*restrict dest_y
;
1633 for (y
= 0; y
< ycount
; y
++)
1635 bbase_y
= &bbase
[y
*bystride
];
1636 dest_y
= &dest
[y
*rystride
];
1637 for (x
= 0; x
< xcount
; x
++)
1639 abase_x
= &abase
[x
*axstride
];
1641 for (n
= 0; n
< count
; n
++)
1642 s
+= abase_x
[n
] * bbase_y
[n
];
1649 const GFC_REAL_4
*restrict bbase_y
;
1652 for (y
= 0; y
< ycount
; y
++)
1654 bbase_y
= &bbase
[y
*bystride
];
1656 for (n
= 0; n
< count
; n
++)
1657 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1658 dest
[y
*rystride
] = s
;
1662 else if (axstride
< aystride
)
1664 for (y
= 0; y
< ycount
; y
++)
1665 for (x
= 0; x
< xcount
; x
++)
1666 dest
[x
*rxstride
+ y
*rystride
] = (GFC_REAL_4
)0;
1668 for (y
= 0; y
< ycount
; y
++)
1669 for (n
= 0; n
< count
; n
++)
1670 for (x
= 0; x
< xcount
; x
++)
1671 /* dest[x,y] += a[x,n] * b[n,y] */
1672 dest
[x
*rxstride
+ y
*rystride
] +=
1673 abase
[x
*axstride
+ n
*aystride
] *
1674 bbase
[n
*bxstride
+ y
*bystride
];
1676 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1678 const GFC_REAL_4
*restrict bbase_y
;
1681 for (y
= 0; y
< ycount
; y
++)
1683 bbase_y
= &bbase
[y
*bystride
];
1685 for (n
= 0; n
< count
; n
++)
1686 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1687 dest
[y
*rxstride
] = s
;
1692 const GFC_REAL_4
*restrict abase_x
;
1693 const GFC_REAL_4
*restrict bbase_y
;
1694 GFC_REAL_4
*restrict dest_y
;
1697 for (y
= 0; y
< ycount
; y
++)
1699 bbase_y
= &bbase
[y
*bystride
];
1700 dest_y
= &dest
[y
*rystride
];
1701 for (x
= 0; x
< xcount
; x
++)
1703 abase_x
= &abase
[x
*axstride
];
1705 for (n
= 0; n
< count
; n
++)
1706 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1707 dest_y
[x
*rxstride
] = s
;
1716 #endif /* HAVE_AVX512F */
1718 /* Function to fall back to if there is no special processor-specific version. */
1720 matmul_r4_vanilla (gfc_array_r4
* const restrict retarray
,
1721 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
1722 int blas_limit
, blas_call gemm
)
1724 const GFC_REAL_4
* restrict abase
;
1725 const GFC_REAL_4
* restrict bbase
;
1726 GFC_REAL_4
* restrict dest
;
1728 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1729 index_type x
, y
, n
, count
, xcount
, ycount
;
1731 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1732 || GFC_DESCRIPTOR_RANK (b
) == 2);
1734 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1736 Either A or B (but not both) can be rank 1:
1738 o One-dimensional argument A is implicitly treated as a row matrix
1739 dimensioned [1,count], so xcount=1.
1741 o One-dimensional argument B is implicitly treated as a column matrix
1742 dimensioned [count, 1], so ycount=1.
1745 if (retarray
->base_addr
== NULL
)
1747 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1749 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1750 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1752 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1754 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1755 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1759 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1760 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1762 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1763 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1764 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1768 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_REAL_4
));
1769 retarray
->offset
= 0;
1771 else if (unlikely (compile_options
.bounds_check
))
1773 index_type ret_extent
, arg_extent
;
1775 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1777 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1778 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1779 if (arg_extent
!= ret_extent
)
1780 runtime_error ("Incorrect extent in return array in"
1781 " MATMUL intrinsic: is %ld, should be %ld",
1782 (long int) ret_extent
, (long int) arg_extent
);
1784 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1786 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1787 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1788 if (arg_extent
!= ret_extent
)
1789 runtime_error ("Incorrect extent in return array in"
1790 " MATMUL intrinsic: is %ld, should be %ld",
1791 (long int) ret_extent
, (long int) arg_extent
);
1795 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1796 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1797 if (arg_extent
!= ret_extent
)
1798 runtime_error ("Incorrect extent in return array in"
1799 " MATMUL intrinsic for dimension 1:"
1800 " is %ld, should be %ld",
1801 (long int) ret_extent
, (long int) arg_extent
);
1803 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1804 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1805 if (arg_extent
!= ret_extent
)
1806 runtime_error ("Incorrect extent in return array in"
1807 " MATMUL intrinsic for dimension 2:"
1808 " is %ld, should be %ld",
1809 (long int) ret_extent
, (long int) arg_extent
);
1814 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
1816 /* One-dimensional result may be addressed in the code below
1817 either as a row or a column matrix. We want both cases to
1819 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1823 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1824 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1828 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1830 /* Treat it as a a row matrix A[1,count]. */
1831 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1835 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1839 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1840 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1842 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1843 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1846 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1848 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1849 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1852 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1854 /* Treat it as a column matrix B[count,1] */
1855 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1857 /* bystride should never be used for 1-dimensional b.
1858 in case it is we want it to cause a segfault, rather than
1859 an incorrect result. */
1860 bystride
= 0xDEADBEEF;
1865 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1866 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1867 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1870 abase
= a
->base_addr
;
1871 bbase
= b
->base_addr
;
1872 dest
= retarray
->base_addr
;
1874 /* Now that everything is set up, we perform the multiplication
1877 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1878 #define min(a,b) ((a) <= (b) ? (a) : (b))
1879 #define max(a,b) ((a) >= (b) ? (a) : (b))
1881 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1882 && (bxstride
== 1 || bystride
== 1)
1883 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1884 > POW3(blas_limit
)))
1886 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1887 const GFC_REAL_4 one
= 1, zero
= 0;
1888 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1889 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1891 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1893 assert (gemm
!= NULL
);
1894 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1895 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1901 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1903 /* This block of code implements a tuned matmul, derived from
1904 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1906 Bo Kagstrom and Per Ling
1907 Department of Computing Science
1909 S-901 87 Umea, Sweden
1911 from netlib.org, translated to C, and modified for matmul.m4. */
1913 const GFC_REAL_4
*a
, *b
;
1915 const index_type m
= xcount
, n
= ycount
, k
= count
;
1917 /* System generated locals */
1918 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1919 i1
, i2
, i3
, i4
, i5
, i6
;
1921 /* Local variables */
1922 GFC_REAL_4 t1
[65536], /* was [256][256] */
1923 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1924 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1925 index_type i
, j
, l
, ii
, jj
, ll
;
1926 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1930 c
= retarray
->base_addr
;
1932 /* Parameter adjustments */
1934 c_offset
= 1 + c_dim1
;
1937 a_offset
= 1 + a_dim1
;
1940 b_offset
= 1 + b_dim1
;
1943 /* Early exit if possible */
1944 if (m
== 0 || n
== 0 || k
== 0)
1947 /* Empty c first. */
1948 for (j
=1; j
<=n
; j
++)
1949 for (i
=1; i
<=m
; i
++)
1950 c
[i
+ j
* c_dim1
] = (GFC_REAL_4
)0;
1952 /* Start turning the crank. */
1954 for (jj
= 1; jj
<= i1
; jj
+= 512)
1960 ujsec
= jsec
- jsec
% 4;
1962 for (ll
= 1; ll
<= i2
; ll
+= 256)
1968 ulsec
= lsec
- lsec
% 2;
1971 for (ii
= 1; ii
<= i3
; ii
+= 256)
1977 uisec
= isec
- isec
% 2;
1978 i4
= ll
+ ulsec
- 1;
1979 for (l
= ll
; l
<= i4
; l
+= 2)
1981 i5
= ii
+ uisec
- 1;
1982 for (i
= ii
; i
<= i5
; i
+= 2)
1984 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
1986 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
1987 a
[i
+ (l
+ 1) * a_dim1
];
1988 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
1989 a
[i
+ 1 + l
* a_dim1
];
1990 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
1991 a
[i
+ 1 + (l
+ 1) * a_dim1
];
1995 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
1996 a
[ii
+ isec
- 1 + l
* a_dim1
];
1997 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
1998 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
2004 for (i
= ii
; i
<= i4
; ++i
)
2006 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2007 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2011 uisec
= isec
- isec
% 4;
2012 i4
= jj
+ ujsec
- 1;
2013 for (j
= jj
; j
<= i4
; j
+= 4)
2015 i5
= ii
+ uisec
- 1;
2016 for (i
= ii
; i
<= i5
; i
+= 4)
2018 f11
= c
[i
+ j
* c_dim1
];
2019 f21
= c
[i
+ 1 + j
* c_dim1
];
2020 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2021 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2022 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2023 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2024 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2025 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2026 f31
= c
[i
+ 2 + j
* c_dim1
];
2027 f41
= c
[i
+ 3 + j
* c_dim1
];
2028 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2029 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2030 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2031 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2032 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2033 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2035 for (l
= ll
; l
<= i6
; ++l
)
2037 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2038 * b
[l
+ j
* b_dim1
];
2039 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2040 * b
[l
+ j
* b_dim1
];
2041 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2042 * b
[l
+ (j
+ 1) * b_dim1
];
2043 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2044 * b
[l
+ (j
+ 1) * b_dim1
];
2045 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2046 * b
[l
+ (j
+ 2) * b_dim1
];
2047 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2048 * b
[l
+ (j
+ 2) * b_dim1
];
2049 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2050 * b
[l
+ (j
+ 3) * b_dim1
];
2051 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2052 * b
[l
+ (j
+ 3) * b_dim1
];
2053 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2054 * b
[l
+ j
* b_dim1
];
2055 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2056 * b
[l
+ j
* b_dim1
];
2057 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2058 * b
[l
+ (j
+ 1) * b_dim1
];
2059 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2060 * b
[l
+ (j
+ 1) * b_dim1
];
2061 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2062 * b
[l
+ (j
+ 2) * b_dim1
];
2063 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2064 * b
[l
+ (j
+ 2) * b_dim1
];
2065 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2066 * b
[l
+ (j
+ 3) * b_dim1
];
2067 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2068 * b
[l
+ (j
+ 3) * b_dim1
];
2070 c
[i
+ j
* c_dim1
] = f11
;
2071 c
[i
+ 1 + j
* c_dim1
] = f21
;
2072 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2073 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2074 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2075 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2076 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2077 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2078 c
[i
+ 2 + j
* c_dim1
] = f31
;
2079 c
[i
+ 3 + j
* c_dim1
] = f41
;
2080 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2081 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2082 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2083 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2084 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2085 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2090 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2092 f11
= c
[i
+ j
* c_dim1
];
2093 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2094 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2095 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2097 for (l
= ll
; l
<= i6
; ++l
)
2099 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2100 257] * b
[l
+ j
* b_dim1
];
2101 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2102 257] * b
[l
+ (j
+ 1) * b_dim1
];
2103 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2104 257] * b
[l
+ (j
+ 2) * b_dim1
];
2105 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2106 257] * b
[l
+ (j
+ 3) * b_dim1
];
2108 c
[i
+ j
* c_dim1
] = f11
;
2109 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2110 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2111 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2118 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2120 i5
= ii
+ uisec
- 1;
2121 for (i
= ii
; i
<= i5
; i
+= 4)
2123 f11
= c
[i
+ j
* c_dim1
];
2124 f21
= c
[i
+ 1 + j
* c_dim1
];
2125 f31
= c
[i
+ 2 + j
* c_dim1
];
2126 f41
= c
[i
+ 3 + j
* c_dim1
];
2128 for (l
= ll
; l
<= i6
; ++l
)
2130 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2131 257] * b
[l
+ j
* b_dim1
];
2132 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2133 257] * b
[l
+ j
* b_dim1
];
2134 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2135 257] * b
[l
+ j
* b_dim1
];
2136 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2137 257] * b
[l
+ j
* b_dim1
];
2139 c
[i
+ j
* c_dim1
] = f11
;
2140 c
[i
+ 1 + j
* c_dim1
] = f21
;
2141 c
[i
+ 2 + j
* c_dim1
] = f31
;
2142 c
[i
+ 3 + j
* c_dim1
] = f41
;
2145 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2147 f11
= c
[i
+ j
* c_dim1
];
2149 for (l
= ll
; l
<= i6
; ++l
)
2151 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2152 257] * b
[l
+ j
* b_dim1
];
2154 c
[i
+ j
* c_dim1
] = f11
;
2163 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
2165 if (GFC_DESCRIPTOR_RANK (a
) != 1)
2167 const GFC_REAL_4
*restrict abase_x
;
2168 const GFC_REAL_4
*restrict bbase_y
;
2169 GFC_REAL_4
*restrict dest_y
;
2172 for (y
= 0; y
< ycount
; y
++)
2174 bbase_y
= &bbase
[y
*bystride
];
2175 dest_y
= &dest
[y
*rystride
];
2176 for (x
= 0; x
< xcount
; x
++)
2178 abase_x
= &abase
[x
*axstride
];
2180 for (n
= 0; n
< count
; n
++)
2181 s
+= abase_x
[n
] * bbase_y
[n
];
2188 const GFC_REAL_4
*restrict bbase_y
;
2191 for (y
= 0; y
< ycount
; y
++)
2193 bbase_y
= &bbase
[y
*bystride
];
2195 for (n
= 0; n
< count
; n
++)
2196 s
+= abase
[n
*axstride
] * bbase_y
[n
];
2197 dest
[y
*rystride
] = s
;
2201 else if (axstride
< aystride
)
2203 for (y
= 0; y
< ycount
; y
++)
2204 for (x
= 0; x
< xcount
; x
++)
2205 dest
[x
*rxstride
+ y
*rystride
] = (GFC_REAL_4
)0;
2207 for (y
= 0; y
< ycount
; y
++)
2208 for (n
= 0; n
< count
; n
++)
2209 for (x
= 0; x
< xcount
; x
++)
2210 /* dest[x,y] += a[x,n] * b[n,y] */
2211 dest
[x
*rxstride
+ y
*rystride
] +=
2212 abase
[x
*axstride
+ n
*aystride
] *
2213 bbase
[n
*bxstride
+ y
*bystride
];
2215 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
2217 const GFC_REAL_4
*restrict bbase_y
;
2220 for (y
= 0; y
< ycount
; y
++)
2222 bbase_y
= &bbase
[y
*bystride
];
2224 for (n
= 0; n
< count
; n
++)
2225 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
2226 dest
[y
*rxstride
] = s
;
2231 const GFC_REAL_4
*restrict abase_x
;
2232 const GFC_REAL_4
*restrict bbase_y
;
2233 GFC_REAL_4
*restrict dest_y
;
2236 for (y
= 0; y
< ycount
; y
++)
2238 bbase_y
= &bbase
[y
*bystride
];
2239 dest_y
= &dest
[y
*rystride
];
2240 for (x
= 0; x
< xcount
; x
++)
2242 abase_x
= &abase
[x
*axstride
];
2244 for (n
= 0; n
< count
; n
++)
2245 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
2246 dest_y
[x
*rxstride
] = s
;
2256 /* Compiling main function, with selection code for the processor. */
2258 /* Currently, this is i386 only. Adjust for other architectures. */
2260 #include <config/i386/cpuinfo.h>
2261 void matmul_r4 (gfc_array_r4
* const restrict retarray
,
2262 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
2263 int blas_limit
, blas_call gemm
)
2265 static void (*matmul_p
) (gfc_array_r4
* const restrict retarray
,
2266 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
2267 int blas_limit
, blas_call gemm
) = NULL
;
2269 if (matmul_p
== NULL
)
2271 matmul_p
= matmul_r4_vanilla
;
2272 if (__cpu_model
.__cpu_vendor
== VENDOR_INTEL
)
2274 /* Run down the available processors in order of preference. */
2276 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX512F
))
2278 matmul_p
= matmul_r4_avx512f
;
2282 #endif /* HAVE_AVX512F */
2285 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX2
))
2287 matmul_p
= matmul_r4_avx2
;
2294 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX
))
2296 matmul_p
= matmul_r4_avx
;
2299 #endif /* HAVE_AVX */
2304 (*matmul_p
) (retarray
, a
, b
, try_blas
, blas_limit
, gemm
);
2307 #else /* Just the vanilla function. */
2310 matmul_r4 (gfc_array_r4
* const restrict retarray
,
2311 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
2312 int blas_limit
, blas_call gemm
)
2314 const GFC_REAL_4
* restrict abase
;
2315 const GFC_REAL_4
* restrict bbase
;
2316 GFC_REAL_4
* restrict dest
;
2318 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
2319 index_type x
, y
, n
, count
, xcount
, ycount
;
2321 assert (GFC_DESCRIPTOR_RANK (a
) == 2
2322 || GFC_DESCRIPTOR_RANK (b
) == 2);
2324 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2326 Either A or B (but not both) can be rank 1:
2328 o One-dimensional argument A is implicitly treated as a row matrix
2329 dimensioned [1,count], so xcount=1.
2331 o One-dimensional argument B is implicitly treated as a column matrix
2332 dimensioned [count, 1], so ycount=1.
2335 if (retarray
->base_addr
== NULL
)
2337 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2339 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2340 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
2342 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
2344 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2345 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
2349 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2350 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
2352 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
2353 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
2354 GFC_DESCRIPTOR_EXTENT(retarray
,0));
2358 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_REAL_4
));
2359 retarray
->offset
= 0;
2361 else if (unlikely (compile_options
.bounds_check
))
2363 index_type ret_extent
, arg_extent
;
2365 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2367 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
2368 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2369 if (arg_extent
!= ret_extent
)
2370 runtime_error ("Incorrect extent in return array in"
2371 " MATMUL intrinsic: is %ld, should be %ld",
2372 (long int) ret_extent
, (long int) arg_extent
);
2374 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
2376 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
2377 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2378 if (arg_extent
!= ret_extent
)
2379 runtime_error ("Incorrect extent in return array in"
2380 " MATMUL intrinsic: is %ld, should be %ld",
2381 (long int) ret_extent
, (long int) arg_extent
);
2385 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
2386 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2387 if (arg_extent
!= ret_extent
)
2388 runtime_error ("Incorrect extent in return array in"
2389 " MATMUL intrinsic for dimension 1:"
2390 " is %ld, should be %ld",
2391 (long int) ret_extent
, (long int) arg_extent
);
2393 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
2394 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
2395 if (arg_extent
!= ret_extent
)
2396 runtime_error ("Incorrect extent in return array in"
2397 " MATMUL intrinsic for dimension 2:"
2398 " is %ld, should be %ld",
2399 (long int) ret_extent
, (long int) arg_extent
);
2404 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
2406 /* One-dimensional result may be addressed in the code below
2407 either as a row or a column matrix. We want both cases to
2409 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
2413 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
2414 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
2418 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2420 /* Treat it as a a row matrix A[1,count]. */
2421 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
2425 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
2429 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
2430 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
2432 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
2433 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
2436 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
2438 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
2439 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2442 if (GFC_DESCRIPTOR_RANK (b
) == 1)
2444 /* Treat it as a column matrix B[count,1] */
2445 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
2447 /* bystride should never be used for 1-dimensional b.
2448 in case it is we want it to cause a segfault, rather than
2449 an incorrect result. */
2450 bystride
= 0xDEADBEEF;
2455 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
2456 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
2457 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
2460 abase
= a
->base_addr
;
2461 bbase
= b
->base_addr
;
2462 dest
= retarray
->base_addr
;
2464 /* Now that everything is set up, we perform the multiplication
2467 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2468 #define min(a,b) ((a) <= (b) ? (a) : (b))
2469 #define max(a,b) ((a) >= (b) ? (a) : (b))
2471 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
2472 && (bxstride
== 1 || bystride
== 1)
2473 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
2474 > POW3(blas_limit
)))
2476 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
2477 const GFC_REAL_4 one
= 1, zero
= 0;
2478 const int lda
= (axstride
== 1) ? aystride
: axstride
,
2479 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
2481 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
2483 assert (gemm
!= NULL
);
2484 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
2485 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
2491 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
2493 /* This block of code implements a tuned matmul, derived from
2494 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2496 Bo Kagstrom and Per Ling
2497 Department of Computing Science
2499 S-901 87 Umea, Sweden
2501 from netlib.org, translated to C, and modified for matmul.m4. */
2503 const GFC_REAL_4
*a
, *b
;
2505 const index_type m
= xcount
, n
= ycount
, k
= count
;
2507 /* System generated locals */
2508 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
2509 i1
, i2
, i3
, i4
, i5
, i6
;
2511 /* Local variables */
2512 GFC_REAL_4 t1
[65536], /* was [256][256] */
2513 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
2514 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
2515 index_type i
, j
, l
, ii
, jj
, ll
;
2516 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
2520 c
= retarray
->base_addr
;
2522 /* Parameter adjustments */
2524 c_offset
= 1 + c_dim1
;
2527 a_offset
= 1 + a_dim1
;
2530 b_offset
= 1 + b_dim1
;
2533 /* Early exit if possible */
2534 if (m
== 0 || n
== 0 || k
== 0)
2537 /* Empty c first. */
2538 for (j
=1; j
<=n
; j
++)
2539 for (i
=1; i
<=m
; i
++)
2540 c
[i
+ j
* c_dim1
] = (GFC_REAL_4
)0;
2542 /* Start turning the crank. */
2544 for (jj
= 1; jj
<= i1
; jj
+= 512)
2550 ujsec
= jsec
- jsec
% 4;
2552 for (ll
= 1; ll
<= i2
; ll
+= 256)
2558 ulsec
= lsec
- lsec
% 2;
2561 for (ii
= 1; ii
<= i3
; ii
+= 256)
2567 uisec
= isec
- isec
% 2;
2568 i4
= ll
+ ulsec
- 1;
2569 for (l
= ll
; l
<= i4
; l
+= 2)
2571 i5
= ii
+ uisec
- 1;
2572 for (i
= ii
; i
<= i5
; i
+= 2)
2574 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
2576 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
2577 a
[i
+ (l
+ 1) * a_dim1
];
2578 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
2579 a
[i
+ 1 + l
* a_dim1
];
2580 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
2581 a
[i
+ 1 + (l
+ 1) * a_dim1
];
2585 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
2586 a
[ii
+ isec
- 1 + l
* a_dim1
];
2587 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
2588 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
2594 for (i
= ii
; i
<= i4
; ++i
)
2596 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2597 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2601 uisec
= isec
- isec
% 4;
2602 i4
= jj
+ ujsec
- 1;
2603 for (j
= jj
; j
<= i4
; j
+= 4)
2605 i5
= ii
+ uisec
- 1;
2606 for (i
= ii
; i
<= i5
; i
+= 4)
2608 f11
= c
[i
+ j
* c_dim1
];
2609 f21
= c
[i
+ 1 + j
* c_dim1
];
2610 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2611 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2612 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2613 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2614 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2615 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2616 f31
= c
[i
+ 2 + j
* c_dim1
];
2617 f41
= c
[i
+ 3 + j
* c_dim1
];
2618 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2619 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2620 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2621 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2622 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2623 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2625 for (l
= ll
; l
<= i6
; ++l
)
2627 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2628 * b
[l
+ j
* b_dim1
];
2629 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2630 * b
[l
+ j
* b_dim1
];
2631 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2632 * b
[l
+ (j
+ 1) * b_dim1
];
2633 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2634 * b
[l
+ (j
+ 1) * b_dim1
];
2635 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2636 * b
[l
+ (j
+ 2) * b_dim1
];
2637 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2638 * b
[l
+ (j
+ 2) * b_dim1
];
2639 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2640 * b
[l
+ (j
+ 3) * b_dim1
];
2641 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2642 * b
[l
+ (j
+ 3) * b_dim1
];
2643 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2644 * b
[l
+ j
* b_dim1
];
2645 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2646 * b
[l
+ j
* b_dim1
];
2647 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2648 * b
[l
+ (j
+ 1) * b_dim1
];
2649 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2650 * b
[l
+ (j
+ 1) * b_dim1
];
2651 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2652 * b
[l
+ (j
+ 2) * b_dim1
];
2653 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2654 * b
[l
+ (j
+ 2) * b_dim1
];
2655 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2656 * b
[l
+ (j
+ 3) * b_dim1
];
2657 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2658 * b
[l
+ (j
+ 3) * b_dim1
];
2660 c
[i
+ j
* c_dim1
] = f11
;
2661 c
[i
+ 1 + j
* c_dim1
] = f21
;
2662 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2663 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2664 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2665 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2666 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2667 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2668 c
[i
+ 2 + j
* c_dim1
] = f31
;
2669 c
[i
+ 3 + j
* c_dim1
] = f41
;
2670 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2671 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2672 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2673 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2674 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2675 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2680 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2682 f11
= c
[i
+ j
* c_dim1
];
2683 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2684 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2685 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2687 for (l
= ll
; l
<= i6
; ++l
)
2689 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2690 257] * b
[l
+ j
* b_dim1
];
2691 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2692 257] * b
[l
+ (j
+ 1) * b_dim1
];
2693 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2694 257] * b
[l
+ (j
+ 2) * b_dim1
];
2695 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2696 257] * b
[l
+ (j
+ 3) * b_dim1
];
2698 c
[i
+ j
* c_dim1
] = f11
;
2699 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2700 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2701 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2708 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2710 i5
= ii
+ uisec
- 1;
2711 for (i
= ii
; i
<= i5
; i
+= 4)
2713 f11
= c
[i
+ j
* c_dim1
];
2714 f21
= c
[i
+ 1 + j
* c_dim1
];
2715 f31
= c
[i
+ 2 + j
* c_dim1
];
2716 f41
= c
[i
+ 3 + j
* c_dim1
];
2718 for (l
= ll
; l
<= i6
; ++l
)
2720 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2721 257] * b
[l
+ j
* b_dim1
];
2722 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2723 257] * b
[l
+ j
* b_dim1
];
2724 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2725 257] * b
[l
+ j
* b_dim1
];
2726 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2727 257] * b
[l
+ j
* b_dim1
];
2729 c
[i
+ j
* c_dim1
] = f11
;
2730 c
[i
+ 1 + j
* c_dim1
] = f21
;
2731 c
[i
+ 2 + j
* c_dim1
] = f31
;
2732 c
[i
+ 3 + j
* c_dim1
] = f41
;
2735 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2737 f11
= c
[i
+ j
* c_dim1
];
2739 for (l
= ll
; l
<= i6
; ++l
)
2741 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2742 257] * b
[l
+ j
* b_dim1
];
2744 c
[i
+ j
* c_dim1
] = f11
;
2753 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
2755 if (GFC_DESCRIPTOR_RANK (a
) != 1)
2757 const GFC_REAL_4
*restrict abase_x
;
2758 const GFC_REAL_4
*restrict bbase_y
;
2759 GFC_REAL_4
*restrict dest_y
;
2762 for (y
= 0; y
< ycount
; y
++)
2764 bbase_y
= &bbase
[y
*bystride
];
2765 dest_y
= &dest
[y
*rystride
];
2766 for (x
= 0; x
< xcount
; x
++)
2768 abase_x
= &abase
[x
*axstride
];
2770 for (n
= 0; n
< count
; n
++)
2771 s
+= abase_x
[n
] * bbase_y
[n
];
2778 const GFC_REAL_4
*restrict bbase_y
;
2781 for (y
= 0; y
< ycount
; y
++)
2783 bbase_y
= &bbase
[y
*bystride
];
2785 for (n
= 0; n
< count
; n
++)
2786 s
+= abase
[n
*axstride
] * bbase_y
[n
];
2787 dest
[y
*rystride
] = s
;
2791 else if (axstride
< aystride
)
2793 for (y
= 0; y
< ycount
; y
++)
2794 for (x
= 0; x
< xcount
; x
++)
2795 dest
[x
*rxstride
+ y
*rystride
] = (GFC_REAL_4
)0;
2797 for (y
= 0; y
< ycount
; y
++)
2798 for (n
= 0; n
< count
; n
++)
2799 for (x
= 0; x
< xcount
; x
++)
2800 /* dest[x,y] += a[x,n] * b[n,y] */
2801 dest
[x
*rxstride
+ y
*rystride
] +=
2802 abase
[x
*axstride
+ n
*aystride
] *
2803 bbase
[n
*bxstride
+ y
*bystride
];
2805 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
2807 const GFC_REAL_4
*restrict bbase_y
;
2810 for (y
= 0; y
< ycount
; y
++)
2812 bbase_y
= &bbase
[y
*bystride
];
2814 for (n
= 0; n
< count
; n
++)
2815 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
2816 dest
[y
*rxstride
] = s
;
2821 const GFC_REAL_4
*restrict abase_x
;
2822 const GFC_REAL_4
*restrict bbase_y
;
2823 GFC_REAL_4
*restrict dest_y
;
2826 for (y
= 0; y
< ycount
; y
++)
2828 bbase_y
= &bbase
[y
*bystride
];
2829 dest_y
= &dest
[y
*rystride
];
2830 for (x
= 0; x
< xcount
; x
++)
2832 abase_x
= &abase
[x
*axstride
];
2834 for (n
= 0; n
< count
; n
++)
2835 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
2836 dest_y
[x
*rxstride
] = s
;