/* Implementation of the MATMUL intrinsic
   Copyright (C) 2002-2017 Free Software Foundation, Inc.
   Contributed by Paul Brook <paul@nowt.org>

This file is part of the GNU Fortran runtime library (libgfortran).

Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
#include "libgfortran.h"
#include <string.h>
#include <assert.h>


#if defined (HAVE_GFC_REAL_4)

/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we call it for large
   matrices.  */

typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
                          const int *, const GFC_REAL_4 *, const int *,
                          const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
                          int, int);
/* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.

   The equivalent Fortran pseudo-code is:

   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
   IF (.NOT.IS_TRANSPOSED(A)) THEN
     C = 0
     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
         ENDDO
       ENDDO
     ENDDO
   ELSE
     DO J=1,N
       DO I=1,M
         S = 0
         DO K=1,COUNT
           S = S+A(I,K)*B(K,J)
         ENDDO
         C(I,J) = S
       ENDDO
     ENDDO
   ENDIF
*/
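
/* Added illustration (not part of the original matmul.m4 output): a minimal,
   unoptimized sketch of the two loop orders described in the comment above,
   written against plain column-major buffers with explicit leading
   dimensions.  The helper names are hypothetical and nothing in this file
   calls them.  */

static void __attribute__((unused))
matmul_sketch_plain (GFC_REAL_4 *c, const GFC_REAL_4 *a, const GFC_REAL_4 *b,
                     index_type m, index_type n, index_type count,
                     index_type lda, index_type ldb, index_type ldc)
{
  /* A is stored normally: the unit-stride direction of A and C is the row
     index i, so i is innermost and C is accumulated in place.  */
  index_type i, j, k;

  for (j = 0; j < n; j++)
    for (i = 0; i < m; i++)
      c[i + j * ldc] = 0;

  for (j = 0; j < n; j++)
    for (k = 0; k < count; k++)
      for (i = 0; i < m; i++)
        c[i + j * ldc] += a[i + k * lda] * b[k + j * ldb];
}

static void __attribute__((unused))
matmul_sketch_transposed_a (GFC_REAL_4 *c, const GFC_REAL_4 *at,
                            const GFC_REAL_4 *b, index_type m, index_type n,
                            index_type count, index_type lda, index_type ldb,
                            index_type ldc)
{
  /* A is a TRANSPOSE temporary, i.e. element A(i,k) lives at at[k + i*lda]:
     k is now the unit-stride direction of both operands, so each C(i,j) is
     best computed as one dot product.  */
  index_type i, j, k;

  for (j = 0; j < n; j++)
    for (i = 0; i < m; i++)
      {
        GFC_REAL_4 s = 0;
        for (k = 0; k < count; k++)
          s += at[k + i * lda] * b[k + j * ldb];
        c[i + j * ldc] = s;
      }
}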
/* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */

extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm);
export_proto(matmul_r4);
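
/* Added illustration (not in the original file): one way a caller could
   supply the "gemm" argument above.  The gfortran front-end passes the
   address of an sgemm-compatible routine when external BLAS support is
   requested (e.g. -fexternal-blas); the wrapper name and the parameter
   choices below are hypothetical.  */

static void __attribute__((unused))
matmul_r4_blas_example (gfc_array_r4 * const restrict c,
                        gfc_array_r4 * const restrict a,
                        gfc_array_r4 * const restrict b,
                        blas_call user_gemm)
{
  /* try_blas = 1, blas_limit = 30: user_gemm is only invoked once the
     product involves more than 30*30*30 scalar multiply-adds and the
     operand strides permit a direct gemm call.  */
  matmul_r4 (c, a, b, 1, 30, user_gemm);
}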
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
/* REAL types generate identical code for AVX and AVX2.  Only generate
   an AVX2 function if we are dealing with integer.  */
#undef HAVE_AVX2
#endif


/* Put exhaustive list of possible architectures here, ORed together.  */

#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
#ifdef HAVE_AVX
static void
matmul_r4_avx (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
matmul_r4_avx (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm)
{
  const GFC_REAL_4 * restrict abase;
  const GFC_REAL_4 * restrict bbase;
  GFC_REAL_4 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
          || GFC_DESCRIPTOR_RANK (b) == 2);

/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

   Either A or B (but not both) can be rank 1:

   o One-dimensional argument A is implicitly treated as a row matrix
     dimensioned [1,count], so xcount=1.

   o One-dimensional argument B is implicitly treated as a column matrix
     dimensioned [count, 1], so ycount=1.
  */
  if (retarray->base_addr == NULL)
    {
      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
        }
      else
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);

          GFC_DIMENSION_SET(retarray->dim[1], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1,
                            GFC_DESCRIPTOR_EXTENT(retarray,0));
        }

      retarray->base_addr
        = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
      retarray->offset = 0;
    }
  else if (unlikely (compile_options.bounds_check))
    {
      index_type ret_extent, arg_extent;

      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic: is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic: is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic for dimension 1:"
                           " is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);

          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic for dimension 2:"
                           " is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);
        }
    }
  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    {
      /* One-dimensional result may be addressed in the code below
         either as a row or a column matrix.  We want both cases to
         work.  */
      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    }
  else
    {
      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    }


  if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Treat it as a row matrix A[1,count].  */
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = 1;

      xcount = 1;
      count = GFC_DESCRIPTOR_EXTENT(a,0);
    }
  else
    {
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = GFC_DESCRIPTOR_STRIDE(a,1);

      count = GFC_DESCRIPTOR_EXTENT(a,1);
      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    }

  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    {
      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
        runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
    }

  if (GFC_DESCRIPTOR_RANK (b) == 1)
    {
      /* Treat it as a column matrix B[count,1].  */
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);

      /* bystride should never be used for 1-dimensional b.
         In case it is, we want it to cause a segfault rather than
         an incorrect result.  */
      bystride = 0xDEADBEEF;
      ycount = 1;
    }
  else
    {
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    }

  abase = a->base_addr;
  bbase = b->base_addr;
  dest = retarray->base_addr;
  /* Now that everything is set up, we perform the multiplication
     itself.  */

#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))

  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
    {
      const int m = xcount, n = ycount, k = count, ldc = rystride;
      const GFC_REAL_4 one = 1, zero = 0;
      const int lda = (axstride == 1) ? aystride : axstride,
                ldb = (bxstride == 1) ? bystride : bxstride;

      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
        {
          assert (gemm != NULL);
          gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
                &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
                &ldc, 1, 1);
          return;
        }
    }
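
  /* Worked example of the threshold above (added comment, numbers are
     illustrative only): with blas_limit == 30 the cutoff is POW3(30)
     = 27000 scalar multiply-adds, so multiplying two 31x31 matrices
     (31*31*31 = 29791 operations) goes to gemm, while a 30x30 product
     (exactly 27000) keeps using the inline code below.  */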
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      /* This block of code implements a tuned matmul, derived from
         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1

               Bo Kagstrom and Per Ling
           Department of Computing Science
                Umea University
           S-901 87 Umea, Sweden

         from netlib.org, translated to C, and modified for matmul.m4.  */

      const GFC_REAL_4 *a, *b;
      GFC_REAL_4 *c;
      const index_type m = xcount, n = ycount, k = count;

      /* System generated locals */
      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
                 i1, i2, i3, i4, i5, i6;

      /* Local variables */
      GFC_REAL_4 t1[65536], /* was [256][256] */
                 f11, f12, f21, f22, f31, f32, f41, f42,
                 f13, f14, f23, f24, f33, f34, f43, f44;
      index_type i, j, l, ii, jj, ll;
      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
      a = abase;
      b = bbase;
      c = retarray->base_addr;

      /* Parameter adjustments */
      c_dim1 = rystride;
      c_offset = 1 + c_dim1;
      c -= c_offset;
      a_dim1 = aystride;
      a_offset = 1 + a_dim1;
      a -= a_offset;
      b_dim1 = bystride;
      b_offset = 1 + b_dim1;
      b -= b_offset;

      /* Early exit if possible */
      if (m == 0 || n == 0 || k == 0)
        return;

      /* Empty c first.  */
      for (j=1; j<=n; j++)
        for (i=1; i<=m; i++)
          c[i + j * c_dim1] = (GFC_REAL_4)0;

      /* Start turning the crank. */
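      /* Reading aid (added, not in the original matmul.m4 output): t1 is a
         flattened 256x256 column-major scratch tile holding a transposed
         copy of the current block of a.  An index expression such as
         t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] is simply the
         Fortran-style element T1(l-ll+1, i-ii+1) of a 1-based 256x256
         array: row plus 256 times the column, minus 257, with << 8
         performing the multiplication by 256.  A hypothetical helper
         macro (defined only for illustration, never used below) would be:  */
#define T1_ELEM(row, col) (t1[(row) + ((col) << 8) - 257])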
328 for (jj
= 1; jj
<= i1
; jj
+= 512)
334 ujsec
= jsec
- jsec
% 4;
336 for (ll
= 1; ll
<= i2
; ll
+= 256)
342 ulsec
= lsec
- lsec
% 2;
345 for (ii
= 1; ii
<= i3
; ii
+= 256)
351 uisec
= isec
- isec
% 2;
353 for (l
= ll
; l
<= i4
; l
+= 2)
356 for (i
= ii
; i
<= i5
; i
+= 2)
358 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
360 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
361 a
[i
+ (l
+ 1) * a_dim1
];
362 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
363 a
[i
+ 1 + l
* a_dim1
];
364 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
365 a
[i
+ 1 + (l
+ 1) * a_dim1
];
369 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
370 a
[ii
+ isec
- 1 + l
* a_dim1
];
371 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
372 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
378 for (i
= ii
; i
<= i4
; ++i
)
380 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
381 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
385 uisec
= isec
- isec
% 4;
387 for (j
= jj
; j
<= i4
; j
+= 4)
390 for (i
= ii
; i
<= i5
; i
+= 4)
392 f11
= c
[i
+ j
* c_dim1
];
393 f21
= c
[i
+ 1 + j
* c_dim1
];
394 f12
= c
[i
+ (j
+ 1) * c_dim1
];
395 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
396 f13
= c
[i
+ (j
+ 2) * c_dim1
];
397 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
398 f14
= c
[i
+ (j
+ 3) * c_dim1
];
399 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
400 f31
= c
[i
+ 2 + j
* c_dim1
];
401 f41
= c
[i
+ 3 + j
* c_dim1
];
402 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
403 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
404 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
405 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
406 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
407 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
409 for (l
= ll
; l
<= i6
; ++l
)
411 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
413 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
415 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
416 * b
[l
+ (j
+ 1) * b_dim1
];
417 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
418 * b
[l
+ (j
+ 1) * b_dim1
];
419 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
420 * b
[l
+ (j
+ 2) * b_dim1
];
421 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
422 * b
[l
+ (j
+ 2) * b_dim1
];
423 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
424 * b
[l
+ (j
+ 3) * b_dim1
];
425 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
426 * b
[l
+ (j
+ 3) * b_dim1
];
427 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
429 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
431 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
432 * b
[l
+ (j
+ 1) * b_dim1
];
433 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
434 * b
[l
+ (j
+ 1) * b_dim1
];
435 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
436 * b
[l
+ (j
+ 2) * b_dim1
];
437 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
438 * b
[l
+ (j
+ 2) * b_dim1
];
439 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
440 * b
[l
+ (j
+ 3) * b_dim1
];
441 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
442 * b
[l
+ (j
+ 3) * b_dim1
];
444 c
[i
+ j
* c_dim1
] = f11
;
445 c
[i
+ 1 + j
* c_dim1
] = f21
;
446 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
447 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
448 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
449 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
450 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
451 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
452 c
[i
+ 2 + j
* c_dim1
] = f31
;
453 c
[i
+ 3 + j
* c_dim1
] = f41
;
454 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
455 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
456 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
457 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
458 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
459 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
464 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
466 f11
= c
[i
+ j
* c_dim1
];
467 f12
= c
[i
+ (j
+ 1) * c_dim1
];
468 f13
= c
[i
+ (j
+ 2) * c_dim1
];
469 f14
= c
[i
+ (j
+ 3) * c_dim1
];
471 for (l
= ll
; l
<= i6
; ++l
)
473 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
474 257] * b
[l
+ j
* b_dim1
];
475 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
476 257] * b
[l
+ (j
+ 1) * b_dim1
];
477 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
478 257] * b
[l
+ (j
+ 2) * b_dim1
];
479 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
480 257] * b
[l
+ (j
+ 3) * b_dim1
];
482 c
[i
+ j
* c_dim1
] = f11
;
483 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
484 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
485 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
492 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
495 for (i
= ii
; i
<= i5
; i
+= 4)
497 f11
= c
[i
+ j
* c_dim1
];
498 f21
= c
[i
+ 1 + j
* c_dim1
];
499 f31
= c
[i
+ 2 + j
* c_dim1
];
500 f41
= c
[i
+ 3 + j
* c_dim1
];
502 for (l
= ll
; l
<= i6
; ++l
)
504 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
505 257] * b
[l
+ j
* b_dim1
];
506 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
507 257] * b
[l
+ j
* b_dim1
];
508 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
509 257] * b
[l
+ j
* b_dim1
];
510 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
511 257] * b
[l
+ j
* b_dim1
];
513 c
[i
+ j
* c_dim1
] = f11
;
514 c
[i
+ 1 + j
* c_dim1
] = f21
;
515 c
[i
+ 2 + j
* c_dim1
] = f31
;
516 c
[i
+ 3 + j
* c_dim1
] = f41
;
519 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
521 f11
= c
[i
+ j
* c_dim1
];
523 for (l
= ll
; l
<= i6
; ++l
)
525 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
526 257] * b
[l
+ j
* b_dim1
];
528 c
[i
+ j
* c_dim1
] = f11
;
537 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
539 if (GFC_DESCRIPTOR_RANK (a
) != 1)
541 const GFC_REAL_4
*restrict abase_x
;
542 const GFC_REAL_4
*restrict bbase_y
;
543 GFC_REAL_4
*restrict dest_y
;
546 for (y
= 0; y
< ycount
; y
++)
548 bbase_y
= &bbase
[y
*bystride
];
549 dest_y
= &dest
[y
*rystride
];
550 for (x
= 0; x
< xcount
; x
++)
552 abase_x
= &abase
[x
*axstride
];
554 for (n
= 0; n
< count
; n
++)
555 s
+= abase_x
[n
] * bbase_y
[n
];
562 const GFC_REAL_4
*restrict bbase_y
;
565 for (y
= 0; y
< ycount
; y
++)
567 bbase_y
= &bbase
[y
*bystride
];
569 for (n
= 0; n
< count
; n
++)
570 s
+= abase
[n
*axstride
] * bbase_y
[n
];
571 dest
[y
*rystride
] = s
;
575 else if (axstride
< aystride
)
577 for (y
= 0; y
< ycount
; y
++)
578 for (x
= 0; x
< xcount
; x
++)
579 dest
[x
*rxstride
+ y
*rystride
] = (GFC_REAL_4
)0;
581 for (y
= 0; y
< ycount
; y
++)
582 for (n
= 0; n
< count
; n
++)
583 for (x
= 0; x
< xcount
; x
++)
584 /* dest[x,y] += a[x,n] * b[n,y] */
585 dest
[x
*rxstride
+ y
*rystride
] +=
586 abase
[x
*axstride
+ n
*aystride
] *
587 bbase
[n
*bxstride
+ y
*bystride
];
589 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
591 const GFC_REAL_4
*restrict bbase_y
;
594 for (y
= 0; y
< ycount
; y
++)
596 bbase_y
= &bbase
[y
*bystride
];
598 for (n
= 0; n
< count
; n
++)
599 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
600 dest
[y
*rxstride
] = s
;
605 const GFC_REAL_4
*restrict abase_x
;
606 const GFC_REAL_4
*restrict bbase_y
;
607 GFC_REAL_4
*restrict dest_y
;
610 for (y
= 0; y
< ycount
; y
++)
612 bbase_y
= &bbase
[y
*bystride
];
613 dest_y
= &dest
[y
*rystride
];
614 for (x
= 0; x
< xcount
; x
++)
616 abase_x
= &abase
[x
*axstride
];
618 for (n
= 0; n
< count
; n
++)
619 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
620 dest_y
[x
*rxstride
] = s
;
629 #endif /* HAVE_AVX */
633 matmul_r4_avx2 (gfc_array_r4
* const restrict retarray
,
634 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
635 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx2")));
637 matmul_r4_avx2 (gfc_array_r4
* const restrict retarray
,
638 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
639 int blas_limit
, blas_call gemm
)
641 const GFC_REAL_4
* restrict abase
;
642 const GFC_REAL_4
* restrict bbase
;
643 GFC_REAL_4
* restrict dest
;
645 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
646 index_type x
, y
, n
, count
, xcount
, ycount
;
648 assert (GFC_DESCRIPTOR_RANK (a
) == 2
649 || GFC_DESCRIPTOR_RANK (b
) == 2);
651 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
653 Either A or B (but not both) can be rank 1:
655 o One-dimensional argument A is implicitly treated as a row matrix
656 dimensioned [1,count], so xcount=1.
658 o One-dimensional argument B is implicitly treated as a column matrix
659 dimensioned [count, 1], so ycount=1.
662 if (retarray
->base_addr
== NULL
)
664 if (GFC_DESCRIPTOR_RANK (a
) == 1)
666 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
667 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
669 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
671 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
672 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
676 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
677 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
679 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
680 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
681 GFC_DESCRIPTOR_EXTENT(retarray
,0));
685 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_REAL_4
));
686 retarray
->offset
= 0;
688 else if (unlikely (compile_options
.bounds_check
))
690 index_type ret_extent
, arg_extent
;
692 if (GFC_DESCRIPTOR_RANK (a
) == 1)
694 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
695 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
696 if (arg_extent
!= ret_extent
)
697 runtime_error ("Incorrect extent in return array in"
698 " MATMUL intrinsic: is %ld, should be %ld",
699 (long int) ret_extent
, (long int) arg_extent
);
701 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
703 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
704 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
705 if (arg_extent
!= ret_extent
)
706 runtime_error ("Incorrect extent in return array in"
707 " MATMUL intrinsic: is %ld, should be %ld",
708 (long int) ret_extent
, (long int) arg_extent
);
712 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
713 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
714 if (arg_extent
!= ret_extent
)
715 runtime_error ("Incorrect extent in return array in"
716 " MATMUL intrinsic for dimension 1:"
717 " is %ld, should be %ld",
718 (long int) ret_extent
, (long int) arg_extent
);
720 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
721 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
722 if (arg_extent
!= ret_extent
)
723 runtime_error ("Incorrect extent in return array in"
724 " MATMUL intrinsic for dimension 2:"
725 " is %ld, should be %ld",
726 (long int) ret_extent
, (long int) arg_extent
);
731 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
      /* One-dimensional result may be addressed in the code below
         either as a row or a column matrix.  We want both cases to
         work.  */
736 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
740 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
741 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
745 if (GFC_DESCRIPTOR_RANK (a
) == 1)
      /* Treat it as a row matrix A[1,count].  */
748 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
752 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
756 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
757 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
759 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
760 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
763 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
765 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
766 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
769 if (GFC_DESCRIPTOR_RANK (b
) == 1)
771 /* Treat it as a column matrix B[count,1] */
772 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
      /* bystride should never be used for 1-dimensional b.
         In case it is, we want it to cause a segfault rather than
         an incorrect result.  */
777 bystride
= 0xDEADBEEF;
782 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
783 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
784 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
787 abase
= a
->base_addr
;
788 bbase
= b
->base_addr
;
789 dest
= retarray
->base_addr
;
  /* Now that everything is set up, we perform the multiplication
     itself.  */
794 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
795 #define min(a,b) ((a) <= (b) ? (a) : (b))
796 #define max(a,b) ((a) >= (b) ? (a) : (b))
798 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
799 && (bxstride
== 1 || bystride
== 1)
800 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
803 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
804 const GFC_REAL_4 one
= 1, zero
= 0;
805 const int lda
= (axstride
== 1) ? aystride
: axstride
,
806 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
808 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
810 assert (gemm
!= NULL
);
811 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
812 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
818 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
820 /* This block of code implements a tuned matmul, derived from
821 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
823 Bo Kagstrom and Per Ling
824 Department of Computing Science
826 S-901 87 Umea, Sweden
828 from netlib.org, translated to C, and modified for matmul.m4. */
830 const GFC_REAL_4
*a
, *b
;
832 const index_type m
= xcount
, n
= ycount
, k
= count
;
834 /* System generated locals */
835 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
836 i1
, i2
, i3
, i4
, i5
, i6
;
838 /* Local variables */
839 GFC_REAL_4 t1
[65536], /* was [256][256] */
840 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
841 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
842 index_type i
, j
, l
, ii
, jj
, ll
;
843 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
847 c
= retarray
->base_addr
;
849 /* Parameter adjustments */
851 c_offset
= 1 + c_dim1
;
854 a_offset
= 1 + a_dim1
;
857 b_offset
= 1 + b_dim1
;
860 /* Early exit if possible */
861 if (m
== 0 || n
== 0 || k
== 0)
867 c
[i
+ j
* c_dim1
] = (GFC_REAL_4
)0;
869 /* Start turning the crank. */
871 for (jj
= 1; jj
<= i1
; jj
+= 512)
877 ujsec
= jsec
- jsec
% 4;
879 for (ll
= 1; ll
<= i2
; ll
+= 256)
885 ulsec
= lsec
- lsec
% 2;
888 for (ii
= 1; ii
<= i3
; ii
+= 256)
894 uisec
= isec
- isec
% 2;
896 for (l
= ll
; l
<= i4
; l
+= 2)
899 for (i
= ii
; i
<= i5
; i
+= 2)
901 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
903 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
904 a
[i
+ (l
+ 1) * a_dim1
];
905 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
906 a
[i
+ 1 + l
* a_dim1
];
907 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
908 a
[i
+ 1 + (l
+ 1) * a_dim1
];
912 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
913 a
[ii
+ isec
- 1 + l
* a_dim1
];
914 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
915 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
921 for (i
= ii
; i
<= i4
; ++i
)
923 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
924 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
928 uisec
= isec
- isec
% 4;
930 for (j
= jj
; j
<= i4
; j
+= 4)
933 for (i
= ii
; i
<= i5
; i
+= 4)
935 f11
= c
[i
+ j
* c_dim1
];
936 f21
= c
[i
+ 1 + j
* c_dim1
];
937 f12
= c
[i
+ (j
+ 1) * c_dim1
];
938 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
939 f13
= c
[i
+ (j
+ 2) * c_dim1
];
940 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
941 f14
= c
[i
+ (j
+ 3) * c_dim1
];
942 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
943 f31
= c
[i
+ 2 + j
* c_dim1
];
944 f41
= c
[i
+ 3 + j
* c_dim1
];
945 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
946 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
947 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
948 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
949 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
950 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
952 for (l
= ll
; l
<= i6
; ++l
)
954 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
956 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
958 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
959 * b
[l
+ (j
+ 1) * b_dim1
];
960 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
961 * b
[l
+ (j
+ 1) * b_dim1
];
962 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
963 * b
[l
+ (j
+ 2) * b_dim1
];
964 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
965 * b
[l
+ (j
+ 2) * b_dim1
];
966 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
967 * b
[l
+ (j
+ 3) * b_dim1
];
968 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
969 * b
[l
+ (j
+ 3) * b_dim1
];
970 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
972 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
974 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
975 * b
[l
+ (j
+ 1) * b_dim1
];
976 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
977 * b
[l
+ (j
+ 1) * b_dim1
];
978 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
979 * b
[l
+ (j
+ 2) * b_dim1
];
980 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
981 * b
[l
+ (j
+ 2) * b_dim1
];
982 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
983 * b
[l
+ (j
+ 3) * b_dim1
];
984 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
985 * b
[l
+ (j
+ 3) * b_dim1
];
987 c
[i
+ j
* c_dim1
] = f11
;
988 c
[i
+ 1 + j
* c_dim1
] = f21
;
989 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
990 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
991 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
992 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
993 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
994 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
995 c
[i
+ 2 + j
* c_dim1
] = f31
;
996 c
[i
+ 3 + j
* c_dim1
] = f41
;
997 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
998 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
999 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1000 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1001 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1002 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1007 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1009 f11
= c
[i
+ j
* c_dim1
];
1010 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1011 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1012 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1014 for (l
= ll
; l
<= i6
; ++l
)
1016 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1017 257] * b
[l
+ j
* b_dim1
];
1018 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1019 257] * b
[l
+ (j
+ 1) * b_dim1
];
1020 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1021 257] * b
[l
+ (j
+ 2) * b_dim1
];
1022 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1023 257] * b
[l
+ (j
+ 3) * b_dim1
];
1025 c
[i
+ j
* c_dim1
] = f11
;
1026 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1027 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1028 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1035 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1037 i5
= ii
+ uisec
- 1;
1038 for (i
= ii
; i
<= i5
; i
+= 4)
1040 f11
= c
[i
+ j
* c_dim1
];
1041 f21
= c
[i
+ 1 + j
* c_dim1
];
1042 f31
= c
[i
+ 2 + j
* c_dim1
];
1043 f41
= c
[i
+ 3 + j
* c_dim1
];
1045 for (l
= ll
; l
<= i6
; ++l
)
1047 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1048 257] * b
[l
+ j
* b_dim1
];
1049 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1050 257] * b
[l
+ j
* b_dim1
];
1051 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1052 257] * b
[l
+ j
* b_dim1
];
1053 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1054 257] * b
[l
+ j
* b_dim1
];
1056 c
[i
+ j
* c_dim1
] = f11
;
1057 c
[i
+ 1 + j
* c_dim1
] = f21
;
1058 c
[i
+ 2 + j
* c_dim1
] = f31
;
1059 c
[i
+ 3 + j
* c_dim1
] = f41
;
1062 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1064 f11
= c
[i
+ j
* c_dim1
];
1066 for (l
= ll
; l
<= i6
; ++l
)
1068 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1069 257] * b
[l
+ j
* b_dim1
];
1071 c
[i
+ j
* c_dim1
] = f11
;
1080 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1082 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1084 const GFC_REAL_4
*restrict abase_x
;
1085 const GFC_REAL_4
*restrict bbase_y
;
1086 GFC_REAL_4
*restrict dest_y
;
1089 for (y
= 0; y
< ycount
; y
++)
1091 bbase_y
= &bbase
[y
*bystride
];
1092 dest_y
= &dest
[y
*rystride
];
1093 for (x
= 0; x
< xcount
; x
++)
1095 abase_x
= &abase
[x
*axstride
];
1097 for (n
= 0; n
< count
; n
++)
1098 s
+= abase_x
[n
] * bbase_y
[n
];
1105 const GFC_REAL_4
*restrict bbase_y
;
1108 for (y
= 0; y
< ycount
; y
++)
1110 bbase_y
= &bbase
[y
*bystride
];
1112 for (n
= 0; n
< count
; n
++)
1113 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1114 dest
[y
*rystride
] = s
;
1118 else if (axstride
< aystride
)
1120 for (y
= 0; y
< ycount
; y
++)
1121 for (x
= 0; x
< xcount
; x
++)
1122 dest
[x
*rxstride
+ y
*rystride
] = (GFC_REAL_4
)0;
1124 for (y
= 0; y
< ycount
; y
++)
1125 for (n
= 0; n
< count
; n
++)
1126 for (x
= 0; x
< xcount
; x
++)
1127 /* dest[x,y] += a[x,n] * b[n,y] */
1128 dest
[x
*rxstride
+ y
*rystride
] +=
1129 abase
[x
*axstride
+ n
*aystride
] *
1130 bbase
[n
*bxstride
+ y
*bystride
];
1132 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1134 const GFC_REAL_4
*restrict bbase_y
;
1137 for (y
= 0; y
< ycount
; y
++)
1139 bbase_y
= &bbase
[y
*bystride
];
1141 for (n
= 0; n
< count
; n
++)
1142 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1143 dest
[y
*rxstride
] = s
;
1148 const GFC_REAL_4
*restrict abase_x
;
1149 const GFC_REAL_4
*restrict bbase_y
;
1150 GFC_REAL_4
*restrict dest_y
;
1153 for (y
= 0; y
< ycount
; y
++)
1155 bbase_y
= &bbase
[y
*bystride
];
1156 dest_y
= &dest
[y
*rystride
];
1157 for (x
= 0; x
< xcount
; x
++)
1159 abase_x
= &abase
[x
*axstride
];
1161 for (n
= 0; n
< count
; n
++)
1162 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1163 dest_y
[x
*rxstride
] = s
;
1172 #endif /* HAVE_AVX2 */
1176 matmul_r4_avx512f (gfc_array_r4
* const restrict retarray
,
1177 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
1178 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx512f")));
1180 matmul_r4_avx512f (gfc_array_r4
* const restrict retarray
,
1181 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
1182 int blas_limit
, blas_call gemm
)
1184 const GFC_REAL_4
* restrict abase
;
1185 const GFC_REAL_4
* restrict bbase
;
1186 GFC_REAL_4
* restrict dest
;
1188 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1189 index_type x
, y
, n
, count
, xcount
, ycount
;
1191 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1192 || GFC_DESCRIPTOR_RANK (b
) == 2);
1194 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1196 Either A or B (but not both) can be rank 1:
1198 o One-dimensional argument A is implicitly treated as a row matrix
1199 dimensioned [1,count], so xcount=1.
1201 o One-dimensional argument B is implicitly treated as a column matrix
1202 dimensioned [count, 1], so ycount=1.
1205 if (retarray
->base_addr
== NULL
)
1207 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1209 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1210 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1212 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1214 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1215 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1219 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1220 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1222 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1223 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1224 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1228 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_REAL_4
));
1229 retarray
->offset
= 0;
1231 else if (unlikely (compile_options
.bounds_check
))
1233 index_type ret_extent
, arg_extent
;
1235 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1237 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1238 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1239 if (arg_extent
!= ret_extent
)
1240 runtime_error ("Incorrect extent in return array in"
1241 " MATMUL intrinsic: is %ld, should be %ld",
1242 (long int) ret_extent
, (long int) arg_extent
);
1244 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1246 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1247 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1248 if (arg_extent
!= ret_extent
)
1249 runtime_error ("Incorrect extent in return array in"
1250 " MATMUL intrinsic: is %ld, should be %ld",
1251 (long int) ret_extent
, (long int) arg_extent
);
1255 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1256 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1257 if (arg_extent
!= ret_extent
)
1258 runtime_error ("Incorrect extent in return array in"
1259 " MATMUL intrinsic for dimension 1:"
1260 " is %ld, should be %ld",
1261 (long int) ret_extent
, (long int) arg_extent
);
1263 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1264 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1265 if (arg_extent
!= ret_extent
)
1266 runtime_error ("Incorrect extent in return array in"
1267 " MATMUL intrinsic for dimension 2:"
1268 " is %ld, should be %ld",
1269 (long int) ret_extent
, (long int) arg_extent
);
1274 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
      /* One-dimensional result may be addressed in the code below
         either as a row or a column matrix.  We want both cases to
         work.  */
1279 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1283 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1284 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1288 if (GFC_DESCRIPTOR_RANK (a
) == 1)
      /* Treat it as a row matrix A[1,count].  */
1291 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1295 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1299 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1300 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1302 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1303 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1306 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1308 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1309 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1312 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1314 /* Treat it as a column matrix B[count,1] */
1315 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
      /* bystride should never be used for 1-dimensional b.
         In case it is, we want it to cause a segfault rather than
         an incorrect result.  */
1320 bystride
= 0xDEADBEEF;
1325 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1326 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1327 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1330 abase
= a
->base_addr
;
1331 bbase
= b
->base_addr
;
1332 dest
= retarray
->base_addr
;
  /* Now that everything is set up, we perform the multiplication
     itself.  */
1337 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1338 #define min(a,b) ((a) <= (b) ? (a) : (b))
1339 #define max(a,b) ((a) >= (b) ? (a) : (b))
1341 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1342 && (bxstride
== 1 || bystride
== 1)
1343 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1344 > POW3(blas_limit
)))
1346 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1347 const GFC_REAL_4 one
= 1, zero
= 0;
1348 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1349 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1351 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1353 assert (gemm
!= NULL
);
1354 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1355 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1361 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1363 /* This block of code implements a tuned matmul, derived from
1364 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1366 Bo Kagstrom and Per Ling
1367 Department of Computing Science
1369 S-901 87 Umea, Sweden
1371 from netlib.org, translated to C, and modified for matmul.m4. */
1373 const GFC_REAL_4
*a
, *b
;
1375 const index_type m
= xcount
, n
= ycount
, k
= count
;
1377 /* System generated locals */
1378 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1379 i1
, i2
, i3
, i4
, i5
, i6
;
1381 /* Local variables */
1382 GFC_REAL_4 t1
[65536], /* was [256][256] */
1383 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1384 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1385 index_type i
, j
, l
, ii
, jj
, ll
;
1386 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1390 c
= retarray
->base_addr
;
1392 /* Parameter adjustments */
1394 c_offset
= 1 + c_dim1
;
1397 a_offset
= 1 + a_dim1
;
1400 b_offset
= 1 + b_dim1
;
1403 /* Early exit if possible */
1404 if (m
== 0 || n
== 0 || k
== 0)
1407 /* Empty c first. */
1408 for (j
=1; j
<=n
; j
++)
1409 for (i
=1; i
<=m
; i
++)
1410 c
[i
+ j
* c_dim1
] = (GFC_REAL_4
)0;
1412 /* Start turning the crank. */
1414 for (jj
= 1; jj
<= i1
; jj
+= 512)
1420 ujsec
= jsec
- jsec
% 4;
1422 for (ll
= 1; ll
<= i2
; ll
+= 256)
1428 ulsec
= lsec
- lsec
% 2;
1431 for (ii
= 1; ii
<= i3
; ii
+= 256)
1437 uisec
= isec
- isec
% 2;
1438 i4
= ll
+ ulsec
- 1;
1439 for (l
= ll
; l
<= i4
; l
+= 2)
1441 i5
= ii
+ uisec
- 1;
1442 for (i
= ii
; i
<= i5
; i
+= 2)
1444 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
1446 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
1447 a
[i
+ (l
+ 1) * a_dim1
];
1448 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
1449 a
[i
+ 1 + l
* a_dim1
];
1450 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
1451 a
[i
+ 1 + (l
+ 1) * a_dim1
];
1455 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
1456 a
[ii
+ isec
- 1 + l
* a_dim1
];
1457 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
1458 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
1464 for (i
= ii
; i
<= i4
; ++i
)
1466 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
1467 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
1471 uisec
= isec
- isec
% 4;
1472 i4
= jj
+ ujsec
- 1;
1473 for (j
= jj
; j
<= i4
; j
+= 4)
1475 i5
= ii
+ uisec
- 1;
1476 for (i
= ii
; i
<= i5
; i
+= 4)
1478 f11
= c
[i
+ j
* c_dim1
];
1479 f21
= c
[i
+ 1 + j
* c_dim1
];
1480 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1481 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
1482 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1483 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
1484 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1485 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
1486 f31
= c
[i
+ 2 + j
* c_dim1
];
1487 f41
= c
[i
+ 3 + j
* c_dim1
];
1488 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
1489 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
1490 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
1491 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
1492 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
1493 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
1495 for (l
= ll
; l
<= i6
; ++l
)
1497 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1498 * b
[l
+ j
* b_dim1
];
1499 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1500 * b
[l
+ j
* b_dim1
];
1501 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1502 * b
[l
+ (j
+ 1) * b_dim1
];
1503 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1504 * b
[l
+ (j
+ 1) * b_dim1
];
1505 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1506 * b
[l
+ (j
+ 2) * b_dim1
];
1507 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1508 * b
[l
+ (j
+ 2) * b_dim1
];
1509 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1510 * b
[l
+ (j
+ 3) * b_dim1
];
1511 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1512 * b
[l
+ (j
+ 3) * b_dim1
];
1513 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1514 * b
[l
+ j
* b_dim1
];
1515 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1516 * b
[l
+ j
* b_dim1
];
1517 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1518 * b
[l
+ (j
+ 1) * b_dim1
];
1519 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1520 * b
[l
+ (j
+ 1) * b_dim1
];
1521 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1522 * b
[l
+ (j
+ 2) * b_dim1
];
1523 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1524 * b
[l
+ (j
+ 2) * b_dim1
];
1525 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1526 * b
[l
+ (j
+ 3) * b_dim1
];
1527 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1528 * b
[l
+ (j
+ 3) * b_dim1
];
1530 c
[i
+ j
* c_dim1
] = f11
;
1531 c
[i
+ 1 + j
* c_dim1
] = f21
;
1532 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1533 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
1534 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1535 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
1536 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1537 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
1538 c
[i
+ 2 + j
* c_dim1
] = f31
;
1539 c
[i
+ 3 + j
* c_dim1
] = f41
;
1540 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
1541 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
1542 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1543 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1544 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1545 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1550 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1552 f11
= c
[i
+ j
* c_dim1
];
1553 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1554 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1555 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1557 for (l
= ll
; l
<= i6
; ++l
)
1559 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1560 257] * b
[l
+ j
* b_dim1
];
1561 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1562 257] * b
[l
+ (j
+ 1) * b_dim1
];
1563 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1564 257] * b
[l
+ (j
+ 2) * b_dim1
];
1565 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1566 257] * b
[l
+ (j
+ 3) * b_dim1
];
1568 c
[i
+ j
* c_dim1
] = f11
;
1569 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1570 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1571 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1578 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1580 i5
= ii
+ uisec
- 1;
1581 for (i
= ii
; i
<= i5
; i
+= 4)
1583 f11
= c
[i
+ j
* c_dim1
];
1584 f21
= c
[i
+ 1 + j
* c_dim1
];
1585 f31
= c
[i
+ 2 + j
* c_dim1
];
1586 f41
= c
[i
+ 3 + j
* c_dim1
];
1588 for (l
= ll
; l
<= i6
; ++l
)
1590 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1591 257] * b
[l
+ j
* b_dim1
];
1592 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1593 257] * b
[l
+ j
* b_dim1
];
1594 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1595 257] * b
[l
+ j
* b_dim1
];
1596 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1597 257] * b
[l
+ j
* b_dim1
];
1599 c
[i
+ j
* c_dim1
] = f11
;
1600 c
[i
+ 1 + j
* c_dim1
] = f21
;
1601 c
[i
+ 2 + j
* c_dim1
] = f31
;
1602 c
[i
+ 3 + j
* c_dim1
] = f41
;
1605 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1607 f11
= c
[i
+ j
* c_dim1
];
1609 for (l
= ll
; l
<= i6
; ++l
)
1611 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1612 257] * b
[l
+ j
* b_dim1
];
1614 c
[i
+ j
* c_dim1
] = f11
;
1623 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1625 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1627 const GFC_REAL_4
*restrict abase_x
;
1628 const GFC_REAL_4
*restrict bbase_y
;
1629 GFC_REAL_4
*restrict dest_y
;
1632 for (y
= 0; y
< ycount
; y
++)
1634 bbase_y
= &bbase
[y
*bystride
];
1635 dest_y
= &dest
[y
*rystride
];
1636 for (x
= 0; x
< xcount
; x
++)
1638 abase_x
= &abase
[x
*axstride
];
1640 for (n
= 0; n
< count
; n
++)
1641 s
+= abase_x
[n
] * bbase_y
[n
];
1648 const GFC_REAL_4
*restrict bbase_y
;
1651 for (y
= 0; y
< ycount
; y
++)
1653 bbase_y
= &bbase
[y
*bystride
];
1655 for (n
= 0; n
< count
; n
++)
1656 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1657 dest
[y
*rystride
] = s
;
1661 else if (axstride
< aystride
)
1663 for (y
= 0; y
< ycount
; y
++)
1664 for (x
= 0; x
< xcount
; x
++)
1665 dest
[x
*rxstride
+ y
*rystride
] = (GFC_REAL_4
)0;
1667 for (y
= 0; y
< ycount
; y
++)
1668 for (n
= 0; n
< count
; n
++)
1669 for (x
= 0; x
< xcount
; x
++)
1670 /* dest[x,y] += a[x,n] * b[n,y] */
1671 dest
[x
*rxstride
+ y
*rystride
] +=
1672 abase
[x
*axstride
+ n
*aystride
] *
1673 bbase
[n
*bxstride
+ y
*bystride
];
1675 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1677 const GFC_REAL_4
*restrict bbase_y
;
1680 for (y
= 0; y
< ycount
; y
++)
1682 bbase_y
= &bbase
[y
*bystride
];
1684 for (n
= 0; n
< count
; n
++)
1685 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1686 dest
[y
*rxstride
] = s
;
1691 const GFC_REAL_4
*restrict abase_x
;
1692 const GFC_REAL_4
*restrict bbase_y
;
1693 GFC_REAL_4
*restrict dest_y
;
1696 for (y
= 0; y
< ycount
; y
++)
1698 bbase_y
= &bbase
[y
*bystride
];
1699 dest_y
= &dest
[y
*rystride
];
1700 for (x
= 0; x
< xcount
; x
++)
1702 abase_x
= &abase
[x
*axstride
];
1704 for (n
= 0; n
< count
; n
++)
1705 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1706 dest_y
[x
*rxstride
] = s
;
1715 #endif /* HAVE_AVX512F */
1717 /* Function to fall back to if there is no special processor-specific version. */
1719 matmul_r4_vanilla (gfc_array_r4
* const restrict retarray
,
1720 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
1721 int blas_limit
, blas_call gemm
)
1723 const GFC_REAL_4
* restrict abase
;
1724 const GFC_REAL_4
* restrict bbase
;
1725 GFC_REAL_4
* restrict dest
;
1727 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1728 index_type x
, y
, n
, count
, xcount
, ycount
;
1730 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1731 || GFC_DESCRIPTOR_RANK (b
) == 2);
1733 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1735 Either A or B (but not both) can be rank 1:
1737 o One-dimensional argument A is implicitly treated as a row matrix
1738 dimensioned [1,count], so xcount=1.
1740 o One-dimensional argument B is implicitly treated as a column matrix
1741 dimensioned [count, 1], so ycount=1.
1744 if (retarray
->base_addr
== NULL
)
1746 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1748 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1749 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1751 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1753 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1754 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1758 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1759 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1761 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1762 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1763 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1767 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_REAL_4
));
1768 retarray
->offset
= 0;
1770 else if (unlikely (compile_options
.bounds_check
))
1772 index_type ret_extent
, arg_extent
;
1774 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1776 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1777 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1778 if (arg_extent
!= ret_extent
)
1779 runtime_error ("Incorrect extent in return array in"
1780 " MATMUL intrinsic: is %ld, should be %ld",
1781 (long int) ret_extent
, (long int) arg_extent
);
1783 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1785 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1786 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1787 if (arg_extent
!= ret_extent
)
1788 runtime_error ("Incorrect extent in return array in"
1789 " MATMUL intrinsic: is %ld, should be %ld",
1790 (long int) ret_extent
, (long int) arg_extent
);
1794 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1795 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1796 if (arg_extent
!= ret_extent
)
1797 runtime_error ("Incorrect extent in return array in"
1798 " MATMUL intrinsic for dimension 1:"
1799 " is %ld, should be %ld",
1800 (long int) ret_extent
, (long int) arg_extent
);
1802 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1803 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1804 if (arg_extent
!= ret_extent
)
1805 runtime_error ("Incorrect extent in return array in"
1806 " MATMUL intrinsic for dimension 2:"
1807 " is %ld, should be %ld",
1808 (long int) ret_extent
, (long int) arg_extent
);
1813 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
      /* One-dimensional result may be addressed in the code below
         either as a row or a column matrix.  We want both cases to
         work.  */
1818 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1822 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1823 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1827 if (GFC_DESCRIPTOR_RANK (a
) == 1)
      /* Treat it as a row matrix A[1,count].  */
1830 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1834 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1838 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1839 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1841 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1842 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1845 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1847 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1848 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1851 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1853 /* Treat it as a column matrix B[count,1] */
1854 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
      /* bystride should never be used for 1-dimensional b.
         In case it is, we want it to cause a segfault rather than
         an incorrect result.  */
1859 bystride
= 0xDEADBEEF;
1864 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1865 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1866 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1869 abase
= a
->base_addr
;
1870 bbase
= b
->base_addr
;
1871 dest
= retarray
->base_addr
;
  /* Now that everything is set up, we perform the multiplication
     itself.  */
1876 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1877 #define min(a,b) ((a) <= (b) ? (a) : (b))
1878 #define max(a,b) ((a) >= (b) ? (a) : (b))
1880 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1881 && (bxstride
== 1 || bystride
== 1)
1882 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1883 > POW3(blas_limit
)))
1885 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1886 const GFC_REAL_4 one
= 1, zero
= 0;
1887 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1888 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1890 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1892 assert (gemm
!= NULL
);
1893 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1894 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1900 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1902 /* This block of code implements a tuned matmul, derived from
1903 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1905 Bo Kagstrom and Per Ling
1906 Department of Computing Science
1908 S-901 87 Umea, Sweden
1910 from netlib.org, translated to C, and modified for matmul.m4. */
1912 const GFC_REAL_4
*a
, *b
;
1914 const index_type m
= xcount
, n
= ycount
, k
= count
;
1916 /* System generated locals */
1917 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1918 i1
, i2
, i3
, i4
, i5
, i6
;
1920 /* Local variables */
1921 GFC_REAL_4 t1
[65536], /* was [256][256] */
1922 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1923 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1924 index_type i
, j
, l
, ii
, jj
, ll
;
1925 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1929 c
= retarray
->base_addr
;
1931 /* Parameter adjustments */
1933 c_offset
= 1 + c_dim1
;
1936 a_offset
= 1 + a_dim1
;
1939 b_offset
= 1 + b_dim1
;
1942 /* Early exit if possible */
1943 if (m
== 0 || n
== 0 || k
== 0)
1946 /* Empty c first. */
1947 for (j
=1; j
<=n
; j
++)
1948 for (i
=1; i
<=m
; i
++)
1949 c
[i
+ j
* c_dim1
] = (GFC_REAL_4
)0;
1951 /* Start turning the crank. */
1953 for (jj
= 1; jj
<= i1
; jj
+= 512)
1959 ujsec
= jsec
- jsec
% 4;
1961 for (ll
= 1; ll
<= i2
; ll
+= 256)
1967 ulsec
= lsec
- lsec
% 2;
1970 for (ii
= 1; ii
<= i3
; ii
+= 256)
1976 uisec
= isec
- isec
% 2;
1977 i4
= ll
+ ulsec
- 1;
1978 for (l
= ll
; l
<= i4
; l
+= 2)
1980 i5
= ii
+ uisec
- 1;
1981 for (i
= ii
; i
<= i5
; i
+= 2)
1983 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
1985 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
1986 a
[i
+ (l
+ 1) * a_dim1
];
1987 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
1988 a
[i
+ 1 + l
* a_dim1
];
1989 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
1990 a
[i
+ 1 + (l
+ 1) * a_dim1
];
1994 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
1995 a
[ii
+ isec
- 1 + l
* a_dim1
];
1996 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
1997 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
2003 for (i
= ii
; i
<= i4
; ++i
)
2005 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2006 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2010 uisec
= isec
- isec
% 4;
2011 i4
= jj
+ ujsec
- 1;
2012 for (j
= jj
; j
<= i4
; j
+= 4)
2014 i5
= ii
+ uisec
- 1;
2015 for (i
= ii
; i
<= i5
; i
+= 4)
2017 f11
= c
[i
+ j
* c_dim1
];
2018 f21
= c
[i
+ 1 + j
* c_dim1
];
2019 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2020 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2021 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2022 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2023 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2024 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2025 f31
= c
[i
+ 2 + j
* c_dim1
];
2026 f41
= c
[i
+ 3 + j
* c_dim1
];
2027 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2028 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2029 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2030 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2031 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2032 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2034 for (l
= ll
; l
<= i6
; ++l
)
2036 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2037 * b
[l
+ j
* b_dim1
];
2038 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2039 * b
[l
+ j
* b_dim1
];
2040 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2041 * b
[l
+ (j
+ 1) * b_dim1
];
2042 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2043 * b
[l
+ (j
+ 1) * b_dim1
];
2044 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2045 * b
[l
+ (j
+ 2) * b_dim1
];
2046 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2047 * b
[l
+ (j
+ 2) * b_dim1
];
2048 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2049 * b
[l
+ (j
+ 3) * b_dim1
];
2050 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2051 * b
[l
+ (j
+ 3) * b_dim1
];
2052 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2053 * b
[l
+ j
* b_dim1
];
2054 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2055 * b
[l
+ j
* b_dim1
];
2056 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2057 * b
[l
+ (j
+ 1) * b_dim1
];
2058 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2059 * b
[l
+ (j
+ 1) * b_dim1
];
2060 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2061 * b
[l
+ (j
+ 2) * b_dim1
];
2062 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2063 * b
[l
+ (j
+ 2) * b_dim1
];
2064 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2065 * b
[l
+ (j
+ 3) * b_dim1
];
2066 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2067 * b
[l
+ (j
+ 3) * b_dim1
];
2069 c
[i
+ j
* c_dim1
] = f11
;
2070 c
[i
+ 1 + j
* c_dim1
] = f21
;
2071 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2072 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2073 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2074 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2075 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2076 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2077 c
[i
+ 2 + j
* c_dim1
] = f31
;
2078 c
[i
+ 3 + j
* c_dim1
] = f41
;
2079 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2080 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2081 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2082 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2083 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2084 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2089 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2091 f11
= c
[i
+ j
* c_dim1
];
2092 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2093 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2094 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2096 for (l
= ll
; l
<= i6
; ++l
)
2098 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2099 257] * b
[l
+ j
* b_dim1
];
2100 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2101 257] * b
[l
+ (j
+ 1) * b_dim1
];
2102 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2103 257] * b
[l
+ (j
+ 2) * b_dim1
];
2104 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2105 257] * b
[l
+ (j
+ 3) * b_dim1
];
2107 c
[i
+ j
* c_dim1
] = f11
;
2108 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2109 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2110 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2117 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2119 i5
= ii
+ uisec
- 1;
2120 for (i
= ii
; i
<= i5
; i
+= 4)
2122 f11
= c
[i
+ j
* c_dim1
];
2123 f21
= c
[i
+ 1 + j
* c_dim1
];
2124 f31
= c
[i
+ 2 + j
* c_dim1
];
2125 f41
= c
[i
+ 3 + j
* c_dim1
];
2127 for (l
= ll
; l
<= i6
; ++l
)
2129 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2130 257] * b
[l
+ j
* b_dim1
];
2131 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2132 257] * b
[l
+ j
* b_dim1
];
2133 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2134 257] * b
[l
+ j
* b_dim1
];
2135 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2136 257] * b
[l
+ j
* b_dim1
];
2138 c
[i
+ j
* c_dim1
] = f11
;
2139 c
[i
+ 1 + j
* c_dim1
] = f21
;
2140 c
[i
+ 2 + j
* c_dim1
] = f31
;
2141 c
[i
+ 3 + j
* c_dim1
] = f41
;
2144 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2146 f11
= c
[i
+ j
* c_dim1
];
2148 for (l
= ll
; l
<= i6
; ++l
)
2150 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2151 257] * b
[l
+ j
* b_dim1
];
2153 c
[i
+ j
* c_dim1
] = f11
;
  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      if (GFC_DESCRIPTOR_RANK (a) != 1)
        {
          const GFC_REAL_4 *restrict abase_x;
          const GFC_REAL_4 *restrict bbase_y;
          GFC_REAL_4 *restrict dest_y;
          GFC_REAL_4 s;

          for (y = 0; y < ycount; y++)
            {
              bbase_y = &bbase[y*bystride];
              dest_y = &dest[y*rystride];
              for (x = 0; x < xcount; x++)
                {
                  abase_x = &abase[x*axstride];
                  s = (GFC_REAL_4) 0;
                  for (n = 0; n < count; n++)
                    s += abase_x[n] * bbase_y[n];
                  dest_y[x] = s;
                }
            }
        }
      else
        {
          const GFC_REAL_4 *restrict bbase_y;
          GFC_REAL_4 s;

          for (y = 0; y < ycount; y++)
            {
              bbase_y = &bbase[y*bystride];
              s = (GFC_REAL_4) 0;
              for (n = 0; n < count; n++)
                s += abase[n*axstride] * bbase_y[n];
              dest[y*rystride] = s;
            }
        }
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
        for (x = 0; x < xcount; x++)
          dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;

      for (y = 0; y < ycount; y++)
        for (n = 0; n < count; n++)
          for (x = 0; x < xcount; x++)
            /* dest[x,y] += a[x,n] * b[n,y] */
            dest[x*rxstride + y*rystride] +=
                                        abase[x*axstride + n*aystride] *
                                        bbase[n*bxstride + y*bystride];
    }
  else if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 s;

      for (y = 0; y < ycount; y++)
        {
          bbase_y = &bbase[y*bystride];
          s = (GFC_REAL_4) 0;
          for (n = 0; n < count; n++)
            s += abase[n*axstride] * bbase_y[n*bxstride];
          dest[y*rxstride] = s;
        }
    }
  else
    {
      const GFC_REAL_4 *restrict abase_x;
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 *restrict dest_y;
      GFC_REAL_4 s;

      for (y = 0; y < ycount; y++)
        {
          bbase_y = &bbase[y*bystride];
          dest_y = &dest[y*rystride];
          for (x = 0; x < xcount; x++)
            {
              abase_x = &abase[x*axstride];
              s = (GFC_REAL_4) 0;
              for (n = 0; n < count; n++)
                s += abase_x[n*aystride] * bbase_y[n*bxstride];
              dest_y[x*rxstride] = s;
            }
        }
    }
}
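
/* The processor-specific variants above and matmul_r4_vanilla share the same
   algorithm body; they differ only in the __attribute__((__target__(...)))
   they are compiled with, so the dispatcher below can simply pick whichever
   routine the running CPU supports.  */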

/* Compiling main function, with selection code for the processor.  */

/* Currently, this is i386 only.  Adjust for other architectures.  */

#include <config/i386/cpuinfo.h>
void matmul_r4 (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm)
{
  static void (*matmul_p) (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm) = NULL;

  if (matmul_p == NULL)
    {
      matmul_p = matmul_r4_vanilla;
      if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
        {
          /* Run down the available processors in order of preference.  */
#ifdef HAVE_AVX512F
          if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
            {
              matmul_p = matmul_r4_avx512f;
              goto tailcall;
            }

#endif  /* HAVE_AVX512F */

#ifdef HAVE_AVX2
          if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
            {
              matmul_p = matmul_r4_avx2;
              goto tailcall;
            }

#endif  /* HAVE_AVX2 */

#ifdef HAVE_AVX
          if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
            {
              matmul_p = matmul_r4_avx;
              goto tailcall;
            }
#endif  /* HAVE_AVX */
        }
    }

tailcall:
  (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
}
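
/* Note: the selection above runs only on the first call; the chosen routine
   is cached in the static matmul_p pointer, so later calls dispatch directly.
   __cpu_model is the processor identification data provided by libgcc's i386
   cpuinfo support included above.  */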

#else  /* Just the vanilla function.  */

void
matmul_r4 (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm)
{
  const GFC_REAL_4 * restrict abase;
  const GFC_REAL_4 * restrict bbase;
  GFC_REAL_4 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
          || GFC_DESCRIPTOR_RANK (b) == 2);

/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

   Either A or B (but not both) can be rank 1:

   o One-dimensional argument A is implicitly treated as a row matrix
     dimensioned [1,count], so xcount=1.

   o One-dimensional argument B is implicitly treated as a column matrix
     dimensioned [count, 1], so ycount=1.
*/

  if (retarray->base_addr == NULL)
    {
      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
        }
      else
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);

          GFC_DIMENSION_SET(retarray->dim[1], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1,
                            GFC_DESCRIPTOR_EXTENT(retarray,0));
        }

      retarray->base_addr
        = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
      retarray->offset = 0;
    }
  else if (unlikely (compile_options.bounds_check))
    {
      index_type ret_extent, arg_extent;

      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic: is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic: is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic for dimension 1:"
                           " is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);

          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic for dimension 2:"
                           " is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);
        }
    }

  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    {
      /* One-dimensional result may be addressed in the code below
         either as a row or a column matrix.  We want both cases to
         work.  */
      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    }
  else
    {
      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    }

  if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Treat it as a row matrix A[1,count].  */
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = 1;

      xcount = 1;
      count = GFC_DESCRIPTOR_EXTENT(a,0);
    }
  else
    {
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = GFC_DESCRIPTOR_STRIDE(a,1);

      count = GFC_DESCRIPTOR_EXTENT(a,1);
      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    }

  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    {
      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
        runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
    }

  if (GFC_DESCRIPTOR_RANK (b) == 1)
    {
      /* Treat it as a column matrix B[count,1].  */
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);

      /* bystride should never be used for 1-dimensional b.
         In case it is, we want it to cause a segfault rather than
         an incorrect result.  */
      bystride = 0xDEADBEEF;
      ycount = 1;
    }
  else
    {
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    }

  abase = a->base_addr;
  bbase = b->base_addr;
  dest = retarray->base_addr;
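
  /* The strides computed above are in units of array elements.  For a
     contiguous, column-major 10x20 argument A, for example, axstride == 1
     and aystride == 10, while a transposed temporary of the same shape has
     axstride == 20 and aystride == 1.  */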

  /* Now that everything is set up, we perform the multiplication
     itself.  */

#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))

  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
    {
      const int m = xcount, n = ycount, k = count, ldc = rystride;
      const GFC_REAL_4 one = 1, zero = 0;
      const int lda = (axstride == 1) ? aystride : axstride,
                ldb = (bxstride == 1) ? bystride : bxstride;

      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
        {
          assert (gemm != NULL);
          gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
                &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
                &ldc, 1, 1);
          return;
        }
    }
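
  /* For example, with -fexternal-blas and the documented default of 30 for
     -fblas-matmul-limit, the external *GEMM call above is made only when
     xcount * ycount * count exceeds POW3 (30) = 27000; smaller products fall
     through to the inlined algorithm below.  */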

  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      /* This block of code implements a tuned matmul, derived from
         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1

               Bo Kagstrom and Per Ling
               Department of Computing Science
               Umea University
               S-901 87 Umea, Sweden

         from netlib.org, translated to C, and modified for matmul.m4.  */

      const GFC_REAL_4 *a, *b;
      GFC_REAL_4 *c;
      const index_type m = xcount, n = ycount, k = count;

      /* System generated locals */
      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
                 i1, i2, i3, i4, i5, i6;

      /* Local variables */
      GFC_REAL_4 t1[65536], /* was [256][256] */
                 f11, f12, f21, f22, f31, f32, f41, f42,
                 f13, f14, f23, f24, f33, f34, f43, f44;
      index_type i, j, l, ii, jj, ll;
      index_type isec, jsec, lsec, uisec, ujsec, ulsec;

      a = abase;
      b = bbase;
      c = retarray->base_addr;

      /* Parameter adjustments */
      c_dim1 = rystride;
      c_offset = 1 + c_dim1;
      c -= c_offset;
      a_dim1 = aystride;
      a_offset = 1 + a_dim1;
      a -= a_offset;
      b_dim1 = bystride;
      b_offset = 1 + b_dim1;
      b -= b_offset;

      /* Early exit if possible */
      if (m == 0 || n == 0 || k == 0)
        return;

      /* Empty c first.  */
      for (j=1; j<=n; j++)
        for (i=1; i<=m; i++)
          c[i + j * c_dim1] = (GFC_REAL_4)0;

      /* Start turning the crank.  */
      i1 = n;
      for (jj = 1; jj <= i1; jj += 512)
        {
          /* Computing MIN */
          i2 = n - jj + 1;
          jsec = min(i2,512);
          ujsec = jsec - jsec % 4;
          i2 = k;
          for (ll = 1; ll <= i2; ll += 256)
            {
              /* Computing MIN */
              i3 = k - ll + 1;
              lsec = min(i3,256);
              ulsec = lsec - lsec % 2;
              i3 = m;
              for (ii = 1; ii <= i3; ii += 256)
                {
                  /* Computing MIN */
                  i4 = m - ii + 1;
                  isec = min(i4,256);
                  uisec = isec - isec % 2;
                  i4 = ll + ulsec - 1;
                  for (l = ll; l <= i4; l += 2)
                    {
                      i5 = ii + uisec - 1;
                      for (i = ii; i <= i5; i += 2)
                        {
                          t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
                                        a[i + l * a_dim1];
                          t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
                                        a[i + (l + 1) * a_dim1];
                          t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
                                        a[i + 1 + l * a_dim1];
                          t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
                                        a[i + 1 + (l + 1) * a_dim1];
                        }
                      if (uisec < isec)
                        {
                          t1[l - ll + 1 + (isec << 8) - 257] =
                                        a[ii + isec - 1 + l * a_dim1];
                          t1[l - ll + 2 + (isec << 8) - 257] =
                                        a[ii + isec - 1 + (l + 1) * a_dim1];
                        }
                    }
                  if (ulsec < lsec)
                    {
                      i4 = ii + isec - 1;
                      for (i = ii; i <= i4; ++i)
                        {
                          t1[lsec + ((i - ii + 1) << 8) - 257] =
                                        a[i + (ll + lsec - 1) * a_dim1];
                        }
                    }
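
                  /* Note on the indexing above: the expression
                     l - ll + 1 + ((i - ii + 1) << 8) - 257 simplifies to
                     (l - ll) + (i - ii) * 256, so t1 holds the current block
                     of A with the l (reduction) index contiguous.  The
                     multiplication kernels below can therefore stream
                     through both t1 and b with unit stride.  */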
                  uisec = isec - isec % 4;
                  i4 = jj + ujsec - 1;
                  for (j = jj; j <= i4; j += 4)
                    {
                      i5 = ii + uisec - 1;
                      for (i = ii; i <= i5; i += 4)
                        {
                          f11 = c[i + j * c_dim1];
                          f21 = c[i + 1 + j * c_dim1];
                          f12 = c[i + (j + 1) * c_dim1];
                          f22 = c[i + 1 + (j + 1) * c_dim1];
                          f13 = c[i + (j + 2) * c_dim1];
                          f23 = c[i + 1 + (j + 2) * c_dim1];
                          f14 = c[i + (j + 3) * c_dim1];
                          f24 = c[i + 1 + (j + 3) * c_dim1];
                          f31 = c[i + 2 + j * c_dim1];
                          f41 = c[i + 3 + j * c_dim1];
                          f32 = c[i + 2 + (j + 1) * c_dim1];
                          f42 = c[i + 3 + (j + 1) * c_dim1];
                          f33 = c[i + 2 + (j + 2) * c_dim1];
                          f43 = c[i + 3 + (j + 2) * c_dim1];
                          f34 = c[i + 2 + (j + 3) * c_dim1];
                          f44 = c[i + 3 + (j + 3) * c_dim1];
                          i6 = ll + lsec - 1;
                          for (l = ll; l <= i6; ++l)
                            {
                              f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
                                      * b[l + j * b_dim1];
                              f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
                                      * b[l + j * b_dim1];
                              f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
                                      * b[l + (j + 1) * b_dim1];
                              f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
                                      * b[l + (j + 1) * b_dim1];
                              f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
                                      * b[l + (j + 2) * b_dim1];
                              f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
                                      * b[l + (j + 2) * b_dim1];
                              f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
                                      * b[l + (j + 3) * b_dim1];
                              f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
                                      * b[l + (j + 3) * b_dim1];
                              f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
                                      * b[l + j * b_dim1];
                              f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
                                      * b[l + j * b_dim1];
                              f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
                                      * b[l + (j + 1) * b_dim1];
                              f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
                                      * b[l + (j + 1) * b_dim1];
                              f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
                                      * b[l + (j + 2) * b_dim1];
                              f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
                                      * b[l + (j + 2) * b_dim1];
                              f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
                                      * b[l + (j + 3) * b_dim1];
                              f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
                                      * b[l + (j + 3) * b_dim1];
                            }
                          c[i + j * c_dim1] = f11;
                          c[i + 1 + j * c_dim1] = f21;
                          c[i + (j + 1) * c_dim1] = f12;
                          c[i + 1 + (j + 1) * c_dim1] = f22;
                          c[i + (j + 2) * c_dim1] = f13;
                          c[i + 1 + (j + 2) * c_dim1] = f23;
                          c[i + (j + 3) * c_dim1] = f14;
                          c[i + 1 + (j + 3) * c_dim1] = f24;
                          c[i + 2 + j * c_dim1] = f31;
                          c[i + 3 + j * c_dim1] = f41;
                          c[i + 2 + (j + 1) * c_dim1] = f32;
                          c[i + 3 + (j + 1) * c_dim1] = f42;
                          c[i + 2 + (j + 2) * c_dim1] = f33;
                          c[i + 3 + (j + 2) * c_dim1] = f43;
                          c[i + 2 + (j + 3) * c_dim1] = f34;
                          c[i + 3 + (j + 3) * c_dim1] = f44;
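                          /* The sixteen scalar temporaries above keep a full
                             4x4 block of C live across the whole l loop,
                             giving the compiler plenty of independent
                             multiply-adds to schedule (and, in the AVX
                             variants of this file, to vectorize).  */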
                        }
                      if (uisec < isec)
                        {
                          i5 = ii + isec - 1;
                          for (i = ii + uisec; i <= i5; ++i)
                            {
                              f11 = c[i + j * c_dim1];
                              f12 = c[i + (j + 1) * c_dim1];
                              f13 = c[i + (j + 2) * c_dim1];
                              f14 = c[i + (j + 3) * c_dim1];
                              i6 = ll + lsec - 1;
                              for (l = ll; l <= i6; ++l)
                                {
                                  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                          257] * b[l + j * b_dim1];
                                  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                          257] * b[l + (j + 1) * b_dim1];
                                  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                          257] * b[l + (j + 2) * b_dim1];
                                  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                          257] * b[l + (j + 3) * b_dim1];
                                }
                              c[i + j * c_dim1] = f11;
                              c[i + (j + 1) * c_dim1] = f12;
                              c[i + (j + 2) * c_dim1] = f13;
                              c[i + (j + 3) * c_dim1] = f14;
                            }
                        }
                    }
                  if (ujsec < jsec)
                    {
                      i4 = jj + jsec - 1;
                      for (j = jj + ujsec; j <= i4; ++j)
                        {
                          i5 = ii + uisec - 1;
                          for (i = ii; i <= i5; i += 4)
                            {
                              f11 = c[i + j * c_dim1];
                              f21 = c[i + 1 + j * c_dim1];
                              f31 = c[i + 2 + j * c_dim1];
                              f41 = c[i + 3 + j * c_dim1];
                              i6 = ll + lsec - 1;
                              for (l = ll; l <= i6; ++l)
                                {
                                  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                          257] * b[l + j * b_dim1];
                                  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
                                          257] * b[l + j * b_dim1];
                                  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
                                          257] * b[l + j * b_dim1];
                                  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
                                          257] * b[l + j * b_dim1];
                                }
                              c[i + j * c_dim1] = f11;
                              c[i + 1 + j * c_dim1] = f21;
                              c[i + 2 + j * c_dim1] = f31;
                              c[i + 3 + j * c_dim1] = f41;
                            }
                          i5 = ii + isec - 1;
                          for (i = ii + uisec; i <= i5; ++i)
                            {
                              f11 = c[i + j * c_dim1];
                              i6 = ll + lsec - 1;
                              for (l = ll; l <= i6; ++l)
                                {
                                  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                          257] * b[l + j * b_dim1];
                                }
                              c[i + j * c_dim1] = f11;
                            }
                        }
                    }
                }
            }
        }
      return;
    }
  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      if (GFC_DESCRIPTOR_RANK (a) != 1)
        {
          const GFC_REAL_4 *restrict abase_x;
          const GFC_REAL_4 *restrict bbase_y;
          GFC_REAL_4 *restrict dest_y;
          GFC_REAL_4 s;

          for (y = 0; y < ycount; y++)
            {
              bbase_y = &bbase[y*bystride];
              dest_y = &dest[y*rystride];
              for (x = 0; x < xcount; x++)
                {
                  abase_x = &abase[x*axstride];
                  s = (GFC_REAL_4) 0;
                  for (n = 0; n < count; n++)
                    s += abase_x[n] * bbase_y[n];
                  dest_y[x] = s;
                }
            }
        }
      else
        {
          const GFC_REAL_4 *restrict bbase_y;
          GFC_REAL_4 s;

          for (y = 0; y < ycount; y++)
            {
              bbase_y = &bbase[y*bystride];
              s = (GFC_REAL_4) 0;
              for (n = 0; n < count; n++)
                s += abase[n*axstride] * bbase_y[n];
              dest[y*rystride] = s;
            }
        }
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
        for (x = 0; x < xcount; x++)
          dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;

      for (y = 0; y < ycount; y++)
        for (n = 0; n < count; n++)
          for (x = 0; x < xcount; x++)
            /* dest[x,y] += a[x,n] * b[n,y] */
            dest[x*rxstride + y*rystride] +=
                                        abase[x*axstride + n*aystride] *
                                        bbase[n*bxstride + y*bystride];
    }
  else if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 s;

      for (y = 0; y < ycount; y++)
        {
          bbase_y = &bbase[y*bystride];
          s = (GFC_REAL_4) 0;
          for (n = 0; n < count; n++)
            s += abase[n*axstride] * bbase_y[n*bxstride];
          dest[y*rxstride] = s;
        }
    }
  else
    {
      const GFC_REAL_4 *restrict abase_x;
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 *restrict dest_y;
      GFC_REAL_4 s;

      for (y = 0; y < ycount; y++)
        {
          bbase_y = &bbase[y*bystride];
          dest_y = &dest[y*rystride];
          for (x = 0; x < xcount; x++)
            {
              abase_x = &abase[x*axstride];
              s = (GFC_REAL_4) 0;
              for (n = 0; n < count; n++)
                s += abase_x[n*aystride] * bbase_y[n*bxstride];
              dest_y[x*rxstride] = s;
            }
        }
    }
}

#endif  /* HAVE_AVX || HAVE_AVX2 || HAVE_AVX512F */
#endif  /* HAVE_GFC_REAL_4 */