1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
5 This file is part of the GNU Fortran runtime library (libgfortran).
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 #include "libgfortran.h"
31 #if defined (HAVE_GFC_COMPLEX_4)
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
37 typedef void (*blas_call
)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_COMPLEX_4
*, const GFC_COMPLEX_4
*,
39 const int *, const GFC_COMPLEX_4
*, const int *,
40 const GFC_COMPLEX_4
*, GFC_COMPLEX_4
*, const int *,
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
49 The equivalent Fortran pseudo-code is:
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
72 extern void matmul_c4 (gfc_array_c4
* const restrict retarray
,
73 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
74 int blas_limit
, blas_call gemm
);
75 export_proto(matmul_c4
);
77 /* Put exhaustive list of possible architectures here, ORed together. */
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
83 matmul_c4_avx (gfc_array_c4
* const restrict retarray
,
84 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
85 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx")));
87 matmul_c4_avx (gfc_array_c4
* const restrict retarray
,
88 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
89 int blas_limit
, blas_call gemm
)
91 const GFC_COMPLEX_4
* restrict abase
;
92 const GFC_COMPLEX_4
* restrict bbase
;
93 GFC_COMPLEX_4
* restrict dest
;
95 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
96 index_type x
, y
, n
, count
, xcount
, ycount
;
98 assert (GFC_DESCRIPTOR_RANK (a
) == 2
99 || GFC_DESCRIPTOR_RANK (b
) == 2);
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
103 Either A or B (but not both) can be rank 1:
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
112 if (retarray
->base_addr
== NULL
)
114 if (GFC_DESCRIPTOR_RANK (a
) == 1)
116 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
119 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
121 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
126 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
129 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray
,0));
135 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_4
));
136 retarray
->offset
= 0;
138 else if (unlikely (compile_options
.bounds_check
))
140 index_type ret_extent
, arg_extent
;
142 if (GFC_DESCRIPTOR_RANK (a
) == 1)
144 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
145 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
146 if (arg_extent
!= ret_extent
)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent
, (long int) arg_extent
);
151 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
153 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
154 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
155 if (arg_extent
!= ret_extent
)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent
, (long int) arg_extent
);
162 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
163 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
164 if (arg_extent
!= ret_extent
)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent
, (long int) arg_extent
);
170 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
171 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
172 if (arg_extent
!= ret_extent
)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent
, (long int) arg_extent
);
181 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
186 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
190 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
191 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
195 if (GFC_DESCRIPTOR_RANK (a
) == 1)
197 /* Treat it as a a row matrix A[1,count]. */
198 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
202 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
206 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
207 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
209 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
210 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
213 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
215 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
219 if (GFC_DESCRIPTOR_RANK (b
) == 1)
221 /* Treat it as a column matrix B[count,1] */
222 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
224 /* bystride should never be used for 1-dimensional b.
225 The value is only used for calculation of the
226 memory by the buffer. */
232 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
233 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
234 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
237 abase
= a
->base_addr
;
238 bbase
= b
->base_addr
;
239 dest
= retarray
->base_addr
;
241 /* Now that everything is set up, we perform the multiplication
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
248 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
249 && (bxstride
== 1 || bystride
== 1)
250 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
253 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
254 const GFC_COMPLEX_4 one
= 1, zero
= 0;
255 const int lda
= (axstride
== 1) ? aystride
: axstride
,
256 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
258 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
260 assert (gemm
!= NULL
);
261 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
262 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
268 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
276 S-901 87 Umea, Sweden
278 from netlib.org, translated to C, and modified for matmul.m4. */
280 const GFC_COMPLEX_4
*a
, *b
;
282 const index_type m
= xcount
, n
= ycount
, k
= count
;
284 /* System generated locals */
285 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
286 i1
, i2
, i3
, i4
, i5
, i6
;
288 /* Local variables */
289 GFC_COMPLEX_4 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
290 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
291 index_type i
, j
, l
, ii
, jj
, ll
;
292 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
297 c
= retarray
->base_addr
;
299 /* Parameter adjustments */
301 c_offset
= 1 + c_dim1
;
304 a_offset
= 1 + a_dim1
;
307 b_offset
= 1 + b_dim1
;
313 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_4
)0;
315 /* Early exit if possible */
316 if (m
== 0 || n
== 0 || k
== 0)
319 /* Adjust size of t1 to what is needed. */
321 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
325 t1
= malloc (t1_dim
* sizeof(GFC_COMPLEX_4
));
327 /* Start turning the crank. */
329 for (jj
= 1; jj
<= i1
; jj
+= 512)
335 ujsec
= jsec
- jsec
% 4;
337 for (ll
= 1; ll
<= i2
; ll
+= 256)
343 ulsec
= lsec
- lsec
% 2;
346 for (ii
= 1; ii
<= i3
; ii
+= 256)
352 uisec
= isec
- isec
% 2;
354 for (l
= ll
; l
<= i4
; l
+= 2)
357 for (i
= ii
; i
<= i5
; i
+= 2)
359 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
361 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
362 a
[i
+ (l
+ 1) * a_dim1
];
363 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
364 a
[i
+ 1 + l
* a_dim1
];
365 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
366 a
[i
+ 1 + (l
+ 1) * a_dim1
];
370 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
371 a
[ii
+ isec
- 1 + l
* a_dim1
];
372 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
373 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
379 for (i
= ii
; i
<= i4
; ++i
)
381 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
382 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
386 uisec
= isec
- isec
% 4;
388 for (j
= jj
; j
<= i4
; j
+= 4)
391 for (i
= ii
; i
<= i5
; i
+= 4)
393 f11
= c
[i
+ j
* c_dim1
];
394 f21
= c
[i
+ 1 + j
* c_dim1
];
395 f12
= c
[i
+ (j
+ 1) * c_dim1
];
396 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
397 f13
= c
[i
+ (j
+ 2) * c_dim1
];
398 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
399 f14
= c
[i
+ (j
+ 3) * c_dim1
];
400 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
401 f31
= c
[i
+ 2 + j
* c_dim1
];
402 f41
= c
[i
+ 3 + j
* c_dim1
];
403 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
404 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
405 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
406 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
407 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
408 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
410 for (l
= ll
; l
<= i6
; ++l
)
412 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
414 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
416 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
417 * b
[l
+ (j
+ 1) * b_dim1
];
418 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
419 * b
[l
+ (j
+ 1) * b_dim1
];
420 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
421 * b
[l
+ (j
+ 2) * b_dim1
];
422 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
423 * b
[l
+ (j
+ 2) * b_dim1
];
424 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
425 * b
[l
+ (j
+ 3) * b_dim1
];
426 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
427 * b
[l
+ (j
+ 3) * b_dim1
];
428 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
430 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
432 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
433 * b
[l
+ (j
+ 1) * b_dim1
];
434 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
435 * b
[l
+ (j
+ 1) * b_dim1
];
436 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
437 * b
[l
+ (j
+ 2) * b_dim1
];
438 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
439 * b
[l
+ (j
+ 2) * b_dim1
];
440 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
441 * b
[l
+ (j
+ 3) * b_dim1
];
442 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
443 * b
[l
+ (j
+ 3) * b_dim1
];
445 c
[i
+ j
* c_dim1
] = f11
;
446 c
[i
+ 1 + j
* c_dim1
] = f21
;
447 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
448 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
449 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
450 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
451 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
452 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
453 c
[i
+ 2 + j
* c_dim1
] = f31
;
454 c
[i
+ 3 + j
* c_dim1
] = f41
;
455 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
456 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
457 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
458 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
459 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
460 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
465 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
467 f11
= c
[i
+ j
* c_dim1
];
468 f12
= c
[i
+ (j
+ 1) * c_dim1
];
469 f13
= c
[i
+ (j
+ 2) * c_dim1
];
470 f14
= c
[i
+ (j
+ 3) * c_dim1
];
472 for (l
= ll
; l
<= i6
; ++l
)
474 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
475 257] * b
[l
+ j
* b_dim1
];
476 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
477 257] * b
[l
+ (j
+ 1) * b_dim1
];
478 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
479 257] * b
[l
+ (j
+ 2) * b_dim1
];
480 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
481 257] * b
[l
+ (j
+ 3) * b_dim1
];
483 c
[i
+ j
* c_dim1
] = f11
;
484 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
485 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
486 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
493 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
496 for (i
= ii
; i
<= i5
; i
+= 4)
498 f11
= c
[i
+ j
* c_dim1
];
499 f21
= c
[i
+ 1 + j
* c_dim1
];
500 f31
= c
[i
+ 2 + j
* c_dim1
];
501 f41
= c
[i
+ 3 + j
* c_dim1
];
503 for (l
= ll
; l
<= i6
; ++l
)
505 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
506 257] * b
[l
+ j
* b_dim1
];
507 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
508 257] * b
[l
+ j
* b_dim1
];
509 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
510 257] * b
[l
+ j
* b_dim1
];
511 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
512 257] * b
[l
+ j
* b_dim1
];
514 c
[i
+ j
* c_dim1
] = f11
;
515 c
[i
+ 1 + j
* c_dim1
] = f21
;
516 c
[i
+ 2 + j
* c_dim1
] = f31
;
517 c
[i
+ 3 + j
* c_dim1
] = f41
;
520 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
522 f11
= c
[i
+ j
* c_dim1
];
524 for (l
= ll
; l
<= i6
; ++l
)
526 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
527 257] * b
[l
+ j
* b_dim1
];
529 c
[i
+ j
* c_dim1
] = f11
;
539 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
541 if (GFC_DESCRIPTOR_RANK (a
) != 1)
543 const GFC_COMPLEX_4
*restrict abase_x
;
544 const GFC_COMPLEX_4
*restrict bbase_y
;
545 GFC_COMPLEX_4
*restrict dest_y
;
548 for (y
= 0; y
< ycount
; y
++)
550 bbase_y
= &bbase
[y
*bystride
];
551 dest_y
= &dest
[y
*rystride
];
552 for (x
= 0; x
< xcount
; x
++)
554 abase_x
= &abase
[x
*axstride
];
555 s
= (GFC_COMPLEX_4
) 0;
556 for (n
= 0; n
< count
; n
++)
557 s
+= abase_x
[n
] * bbase_y
[n
];
564 const GFC_COMPLEX_4
*restrict bbase_y
;
567 for (y
= 0; y
< ycount
; y
++)
569 bbase_y
= &bbase
[y
*bystride
];
570 s
= (GFC_COMPLEX_4
) 0;
571 for (n
= 0; n
< count
; n
++)
572 s
+= abase
[n
*axstride
] * bbase_y
[n
];
573 dest
[y
*rystride
] = s
;
577 else if (axstride
< aystride
)
579 for (y
= 0; y
< ycount
; y
++)
580 for (x
= 0; x
< xcount
; x
++)
581 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_4
)0;
583 for (y
= 0; y
< ycount
; y
++)
584 for (n
= 0; n
< count
; n
++)
585 for (x
= 0; x
< xcount
; x
++)
586 /* dest[x,y] += a[x,n] * b[n,y] */
587 dest
[x
*rxstride
+ y
*rystride
] +=
588 abase
[x
*axstride
+ n
*aystride
] *
589 bbase
[n
*bxstride
+ y
*bystride
];
591 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
593 const GFC_COMPLEX_4
*restrict bbase_y
;
596 for (y
= 0; y
< ycount
; y
++)
598 bbase_y
= &bbase
[y
*bystride
];
599 s
= (GFC_COMPLEX_4
) 0;
600 for (n
= 0; n
< count
; n
++)
601 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
602 dest
[y
*rxstride
] = s
;
607 const GFC_COMPLEX_4
*restrict abase_x
;
608 const GFC_COMPLEX_4
*restrict bbase_y
;
609 GFC_COMPLEX_4
*restrict dest_y
;
612 for (y
= 0; y
< ycount
; y
++)
614 bbase_y
= &bbase
[y
*bystride
];
615 dest_y
= &dest
[y
*rystride
];
616 for (x
= 0; x
< xcount
; x
++)
618 abase_x
= &abase
[x
*axstride
];
619 s
= (GFC_COMPLEX_4
) 0;
620 for (n
= 0; n
< count
; n
++)
621 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
622 dest_y
[x
*rxstride
] = s
;
631 #endif /* HAVE_AVX */
635 matmul_c4_avx2 (gfc_array_c4
* const restrict retarray
,
636 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
637 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx2,fma")));
639 matmul_c4_avx2 (gfc_array_c4
* const restrict retarray
,
640 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
641 int blas_limit
, blas_call gemm
)
643 const GFC_COMPLEX_4
* restrict abase
;
644 const GFC_COMPLEX_4
* restrict bbase
;
645 GFC_COMPLEX_4
* restrict dest
;
647 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
648 index_type x
, y
, n
, count
, xcount
, ycount
;
650 assert (GFC_DESCRIPTOR_RANK (a
) == 2
651 || GFC_DESCRIPTOR_RANK (b
) == 2);
653 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
655 Either A or B (but not both) can be rank 1:
657 o One-dimensional argument A is implicitly treated as a row matrix
658 dimensioned [1,count], so xcount=1.
660 o One-dimensional argument B is implicitly treated as a column matrix
661 dimensioned [count, 1], so ycount=1.
664 if (retarray
->base_addr
== NULL
)
666 if (GFC_DESCRIPTOR_RANK (a
) == 1)
668 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
669 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
671 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
673 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
674 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
678 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
679 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
681 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
682 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
683 GFC_DESCRIPTOR_EXTENT(retarray
,0));
687 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_4
));
688 retarray
->offset
= 0;
690 else if (unlikely (compile_options
.bounds_check
))
692 index_type ret_extent
, arg_extent
;
694 if (GFC_DESCRIPTOR_RANK (a
) == 1)
696 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
697 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
698 if (arg_extent
!= ret_extent
)
699 runtime_error ("Incorrect extent in return array in"
700 " MATMUL intrinsic: is %ld, should be %ld",
701 (long int) ret_extent
, (long int) arg_extent
);
703 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
705 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
706 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
707 if (arg_extent
!= ret_extent
)
708 runtime_error ("Incorrect extent in return array in"
709 " MATMUL intrinsic: is %ld, should be %ld",
710 (long int) ret_extent
, (long int) arg_extent
);
714 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
715 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
716 if (arg_extent
!= ret_extent
)
717 runtime_error ("Incorrect extent in return array in"
718 " MATMUL intrinsic for dimension 1:"
719 " is %ld, should be %ld",
720 (long int) ret_extent
, (long int) arg_extent
);
722 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
723 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
724 if (arg_extent
!= ret_extent
)
725 runtime_error ("Incorrect extent in return array in"
726 " MATMUL intrinsic for dimension 2:"
727 " is %ld, should be %ld",
728 (long int) ret_extent
, (long int) arg_extent
);
733 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
735 /* One-dimensional result may be addressed in the code below
736 either as a row or a column matrix. We want both cases to
738 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
742 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
743 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
747 if (GFC_DESCRIPTOR_RANK (a
) == 1)
749 /* Treat it as a a row matrix A[1,count]. */
750 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
754 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
758 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
759 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
761 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
762 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
765 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
767 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
768 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
771 if (GFC_DESCRIPTOR_RANK (b
) == 1)
773 /* Treat it as a column matrix B[count,1] */
774 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
776 /* bystride should never be used for 1-dimensional b.
777 The value is only used for calculation of the
778 memory by the buffer. */
784 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
785 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
786 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
789 abase
= a
->base_addr
;
790 bbase
= b
->base_addr
;
791 dest
= retarray
->base_addr
;
793 /* Now that everything is set up, we perform the multiplication
796 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
797 #define min(a,b) ((a) <= (b) ? (a) : (b))
798 #define max(a,b) ((a) >= (b) ? (a) : (b))
800 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
801 && (bxstride
== 1 || bystride
== 1)
802 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
805 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
806 const GFC_COMPLEX_4 one
= 1, zero
= 0;
807 const int lda
= (axstride
== 1) ? aystride
: axstride
,
808 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
810 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
812 assert (gemm
!= NULL
);
813 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
814 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
820 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
822 /* This block of code implements a tuned matmul, derived from
823 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
825 Bo Kagstrom and Per Ling
826 Department of Computing Science
828 S-901 87 Umea, Sweden
830 from netlib.org, translated to C, and modified for matmul.m4. */
832 const GFC_COMPLEX_4
*a
, *b
;
834 const index_type m
= xcount
, n
= ycount
, k
= count
;
836 /* System generated locals */
837 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
838 i1
, i2
, i3
, i4
, i5
, i6
;
840 /* Local variables */
841 GFC_COMPLEX_4 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
842 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
843 index_type i
, j
, l
, ii
, jj
, ll
;
844 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
849 c
= retarray
->base_addr
;
851 /* Parameter adjustments */
853 c_offset
= 1 + c_dim1
;
856 a_offset
= 1 + a_dim1
;
859 b_offset
= 1 + b_dim1
;
865 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_4
)0;
867 /* Early exit if possible */
868 if (m
== 0 || n
== 0 || k
== 0)
871 /* Adjust size of t1 to what is needed. */
873 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
877 t1
= malloc (t1_dim
* sizeof(GFC_COMPLEX_4
));
879 /* Start turning the crank. */
881 for (jj
= 1; jj
<= i1
; jj
+= 512)
887 ujsec
= jsec
- jsec
% 4;
889 for (ll
= 1; ll
<= i2
; ll
+= 256)
895 ulsec
= lsec
- lsec
% 2;
898 for (ii
= 1; ii
<= i3
; ii
+= 256)
904 uisec
= isec
- isec
% 2;
906 for (l
= ll
; l
<= i4
; l
+= 2)
909 for (i
= ii
; i
<= i5
; i
+= 2)
911 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
913 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
914 a
[i
+ (l
+ 1) * a_dim1
];
915 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
916 a
[i
+ 1 + l
* a_dim1
];
917 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
918 a
[i
+ 1 + (l
+ 1) * a_dim1
];
922 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
923 a
[ii
+ isec
- 1 + l
* a_dim1
];
924 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
925 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
931 for (i
= ii
; i
<= i4
; ++i
)
933 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
934 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
938 uisec
= isec
- isec
% 4;
940 for (j
= jj
; j
<= i4
; j
+= 4)
943 for (i
= ii
; i
<= i5
; i
+= 4)
945 f11
= c
[i
+ j
* c_dim1
];
946 f21
= c
[i
+ 1 + j
* c_dim1
];
947 f12
= c
[i
+ (j
+ 1) * c_dim1
];
948 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
949 f13
= c
[i
+ (j
+ 2) * c_dim1
];
950 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
951 f14
= c
[i
+ (j
+ 3) * c_dim1
];
952 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
953 f31
= c
[i
+ 2 + j
* c_dim1
];
954 f41
= c
[i
+ 3 + j
* c_dim1
];
955 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
956 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
957 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
958 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
959 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
960 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
962 for (l
= ll
; l
<= i6
; ++l
)
964 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
966 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
968 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
969 * b
[l
+ (j
+ 1) * b_dim1
];
970 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
971 * b
[l
+ (j
+ 1) * b_dim1
];
972 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
973 * b
[l
+ (j
+ 2) * b_dim1
];
974 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
975 * b
[l
+ (j
+ 2) * b_dim1
];
976 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
977 * b
[l
+ (j
+ 3) * b_dim1
];
978 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
979 * b
[l
+ (j
+ 3) * b_dim1
];
980 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
982 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
984 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
985 * b
[l
+ (j
+ 1) * b_dim1
];
986 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
987 * b
[l
+ (j
+ 1) * b_dim1
];
988 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
989 * b
[l
+ (j
+ 2) * b_dim1
];
990 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
991 * b
[l
+ (j
+ 2) * b_dim1
];
992 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
993 * b
[l
+ (j
+ 3) * b_dim1
];
994 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
995 * b
[l
+ (j
+ 3) * b_dim1
];
997 c
[i
+ j
* c_dim1
] = f11
;
998 c
[i
+ 1 + j
* c_dim1
] = f21
;
999 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1000 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
1001 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1002 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
1003 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1004 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
1005 c
[i
+ 2 + j
* c_dim1
] = f31
;
1006 c
[i
+ 3 + j
* c_dim1
] = f41
;
1007 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
1008 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
1009 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1010 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1011 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1012 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1017 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1019 f11
= c
[i
+ j
* c_dim1
];
1020 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1021 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1022 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1024 for (l
= ll
; l
<= i6
; ++l
)
1026 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1027 257] * b
[l
+ j
* b_dim1
];
1028 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1029 257] * b
[l
+ (j
+ 1) * b_dim1
];
1030 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1031 257] * b
[l
+ (j
+ 2) * b_dim1
];
1032 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1033 257] * b
[l
+ (j
+ 3) * b_dim1
];
1035 c
[i
+ j
* c_dim1
] = f11
;
1036 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1037 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1038 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1045 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1047 i5
= ii
+ uisec
- 1;
1048 for (i
= ii
; i
<= i5
; i
+= 4)
1050 f11
= c
[i
+ j
* c_dim1
];
1051 f21
= c
[i
+ 1 + j
* c_dim1
];
1052 f31
= c
[i
+ 2 + j
* c_dim1
];
1053 f41
= c
[i
+ 3 + j
* c_dim1
];
1055 for (l
= ll
; l
<= i6
; ++l
)
1057 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1058 257] * b
[l
+ j
* b_dim1
];
1059 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1060 257] * b
[l
+ j
* b_dim1
];
1061 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1062 257] * b
[l
+ j
* b_dim1
];
1063 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1064 257] * b
[l
+ j
* b_dim1
];
1066 c
[i
+ j
* c_dim1
] = f11
;
1067 c
[i
+ 1 + j
* c_dim1
] = f21
;
1068 c
[i
+ 2 + j
* c_dim1
] = f31
;
1069 c
[i
+ 3 + j
* c_dim1
] = f41
;
1072 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1074 f11
= c
[i
+ j
* c_dim1
];
1076 for (l
= ll
; l
<= i6
; ++l
)
1078 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1079 257] * b
[l
+ j
* b_dim1
];
1081 c
[i
+ j
* c_dim1
] = f11
;
1091 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1093 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1095 const GFC_COMPLEX_4
*restrict abase_x
;
1096 const GFC_COMPLEX_4
*restrict bbase_y
;
1097 GFC_COMPLEX_4
*restrict dest_y
;
1100 for (y
= 0; y
< ycount
; y
++)
1102 bbase_y
= &bbase
[y
*bystride
];
1103 dest_y
= &dest
[y
*rystride
];
1104 for (x
= 0; x
< xcount
; x
++)
1106 abase_x
= &abase
[x
*axstride
];
1107 s
= (GFC_COMPLEX_4
) 0;
1108 for (n
= 0; n
< count
; n
++)
1109 s
+= abase_x
[n
] * bbase_y
[n
];
1116 const GFC_COMPLEX_4
*restrict bbase_y
;
1119 for (y
= 0; y
< ycount
; y
++)
1121 bbase_y
= &bbase
[y
*bystride
];
1122 s
= (GFC_COMPLEX_4
) 0;
1123 for (n
= 0; n
< count
; n
++)
1124 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1125 dest
[y
*rystride
] = s
;
1129 else if (axstride
< aystride
)
1131 for (y
= 0; y
< ycount
; y
++)
1132 for (x
= 0; x
< xcount
; x
++)
1133 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_4
)0;
1135 for (y
= 0; y
< ycount
; y
++)
1136 for (n
= 0; n
< count
; n
++)
1137 for (x
= 0; x
< xcount
; x
++)
1138 /* dest[x,y] += a[x,n] * b[n,y] */
1139 dest
[x
*rxstride
+ y
*rystride
] +=
1140 abase
[x
*axstride
+ n
*aystride
] *
1141 bbase
[n
*bxstride
+ y
*bystride
];
1143 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1145 const GFC_COMPLEX_4
*restrict bbase_y
;
1148 for (y
= 0; y
< ycount
; y
++)
1150 bbase_y
= &bbase
[y
*bystride
];
1151 s
= (GFC_COMPLEX_4
) 0;
1152 for (n
= 0; n
< count
; n
++)
1153 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1154 dest
[y
*rxstride
] = s
;
1159 const GFC_COMPLEX_4
*restrict abase_x
;
1160 const GFC_COMPLEX_4
*restrict bbase_y
;
1161 GFC_COMPLEX_4
*restrict dest_y
;
1164 for (y
= 0; y
< ycount
; y
++)
1166 bbase_y
= &bbase
[y
*bystride
];
1167 dest_y
= &dest
[y
*rystride
];
1168 for (x
= 0; x
< xcount
; x
++)
1170 abase_x
= &abase
[x
*axstride
];
1171 s
= (GFC_COMPLEX_4
) 0;
1172 for (n
= 0; n
< count
; n
++)
1173 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1174 dest_y
[x
*rxstride
] = s
;
1183 #endif /* HAVE_AVX2 */
1187 matmul_c4_avx512f (gfc_array_c4
* const restrict retarray
,
1188 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
1189 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx512f")));
1191 matmul_c4_avx512f (gfc_array_c4
* const restrict retarray
,
1192 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
1193 int blas_limit
, blas_call gemm
)
1195 const GFC_COMPLEX_4
* restrict abase
;
1196 const GFC_COMPLEX_4
* restrict bbase
;
1197 GFC_COMPLEX_4
* restrict dest
;
1199 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1200 index_type x
, y
, n
, count
, xcount
, ycount
;
1202 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1203 || GFC_DESCRIPTOR_RANK (b
) == 2);
1205 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1207 Either A or B (but not both) can be rank 1:
1209 o One-dimensional argument A is implicitly treated as a row matrix
1210 dimensioned [1,count], so xcount=1.
1212 o One-dimensional argument B is implicitly treated as a column matrix
1213 dimensioned [count, 1], so ycount=1.
1216 if (retarray
->base_addr
== NULL
)
1218 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1220 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1221 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1223 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1225 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1226 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1230 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1231 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1233 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1234 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1235 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1239 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_4
));
1240 retarray
->offset
= 0;
1242 else if (unlikely (compile_options
.bounds_check
))
1244 index_type ret_extent
, arg_extent
;
1246 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1248 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1249 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1250 if (arg_extent
!= ret_extent
)
1251 runtime_error ("Incorrect extent in return array in"
1252 " MATMUL intrinsic: is %ld, should be %ld",
1253 (long int) ret_extent
, (long int) arg_extent
);
1255 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1257 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1258 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1259 if (arg_extent
!= ret_extent
)
1260 runtime_error ("Incorrect extent in return array in"
1261 " MATMUL intrinsic: is %ld, should be %ld",
1262 (long int) ret_extent
, (long int) arg_extent
);
1266 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1267 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1268 if (arg_extent
!= ret_extent
)
1269 runtime_error ("Incorrect extent in return array in"
1270 " MATMUL intrinsic for dimension 1:"
1271 " is %ld, should be %ld",
1272 (long int) ret_extent
, (long int) arg_extent
);
1274 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1275 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1276 if (arg_extent
!= ret_extent
)
1277 runtime_error ("Incorrect extent in return array in"
1278 " MATMUL intrinsic for dimension 2:"
1279 " is %ld, should be %ld",
1280 (long int) ret_extent
, (long int) arg_extent
);
1285 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
1287 /* One-dimensional result may be addressed in the code below
1288 either as a row or a column matrix. We want both cases to
1290 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1294 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1295 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1299 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1301 /* Treat it as a row matrix A[1,count]. */
1302 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1306 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1310 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1311 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1313 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1314 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1317 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1319 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1320 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1323 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1325 /* Treat it as a column matrix B[count,1] */
1326 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1328 /* bystride should never be used for 1-dimensional b.
1329 The value is only used for calculation of the
1330 memory by the buffer. */
1336 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1337 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1338 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1341 abase
= a
->base_addr
;
1342 bbase
= b
->base_addr
;
1343 dest
= retarray
->base_addr
;
1345 /* Now that everything is set up, we perform the multiplication
1348 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1349 #define min(a,b) ((a) <= (b) ? (a) : (b))
1350 #define max(a,b) ((a) >= (b) ? (a) : (b))
1352 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1353 && (bxstride
== 1 || bystride
== 1)
1354 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1355 > POW3(blas_limit
)))
1357 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1358 const GFC_COMPLEX_4 one
= 1, zero
= 0;
1359 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1360 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1362 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1364 assert (gemm
!= NULL
);
1365 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1366 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1372 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1374 /* This block of code implements a tuned matmul, derived from
1375 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1377 Bo Kagstrom and Per Ling
1378 Department of Computing Science
1380 S-901 87 Umea, Sweden
1382 from netlib.org, translated to C, and modified for matmul.m4. */
1384 const GFC_COMPLEX_4
*a
, *b
;
1386 const index_type m
= xcount
, n
= ycount
, k
= count
;
1388 /* System generated locals */
1389 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1390 i1
, i2
, i3
, i4
, i5
, i6
;
1392 /* Local variables */
1393 GFC_COMPLEX_4 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1394 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1395 index_type i
, j
, l
, ii
, jj
, ll
;
1396 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1401 c
= retarray
->base_addr
;
1403 /* Parameter adjustments */
1405 c_offset
= 1 + c_dim1
;
1408 a_offset
= 1 + a_dim1
;
1411 b_offset
= 1 + b_dim1
;
1414 /* Empty c first. */
1415 for (j
=1; j
<=n
; j
++)
1416 for (i
=1; i
<=m
; i
++)
1417 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_4
)0;
1419 /* Early exit if possible */
1420 if (m
== 0 || n
== 0 || k
== 0)
1423 /* Adjust size of t1 to what is needed. */
1425 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
1429 t1
= malloc (t1_dim
* sizeof(GFC_COMPLEX_4
));
1431 /* Start turning the crank. */
1433 for (jj
= 1; jj
<= i1
; jj
+= 512)
1439 ujsec
= jsec
- jsec
% 4;
1441 for (ll
= 1; ll
<= i2
; ll
+= 256)
1447 ulsec
= lsec
- lsec
% 2;
1450 for (ii
= 1; ii
<= i3
; ii
+= 256)
1456 uisec
= isec
- isec
% 2;
1457 i4
= ll
+ ulsec
- 1;
1458 for (l
= ll
; l
<= i4
; l
+= 2)
1460 i5
= ii
+ uisec
- 1;
1461 for (i
= ii
; i
<= i5
; i
+= 2)
1463 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
1465 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
1466 a
[i
+ (l
+ 1) * a_dim1
];
1467 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
1468 a
[i
+ 1 + l
* a_dim1
];
1469 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
1470 a
[i
+ 1 + (l
+ 1) * a_dim1
];
1474 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
1475 a
[ii
+ isec
- 1 + l
* a_dim1
];
1476 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
1477 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
1483 for (i
= ii
; i
<= i4
; ++i
)
1485 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
1486 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
1490 uisec
= isec
- isec
% 4;
1491 i4
= jj
+ ujsec
- 1;
1492 for (j
= jj
; j
<= i4
; j
+= 4)
1494 i5
= ii
+ uisec
- 1;
1495 for (i
= ii
; i
<= i5
; i
+= 4)
1497 f11
= c
[i
+ j
* c_dim1
];
1498 f21
= c
[i
+ 1 + j
* c_dim1
];
1499 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1500 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
1501 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1502 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
1503 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1504 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
1505 f31
= c
[i
+ 2 + j
* c_dim1
];
1506 f41
= c
[i
+ 3 + j
* c_dim1
];
1507 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
1508 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
1509 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
1510 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
1511 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
1512 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
1514 for (l
= ll
; l
<= i6
; ++l
)
1516 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1517 * b
[l
+ j
* b_dim1
];
1518 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1519 * b
[l
+ j
* b_dim1
];
1520 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1521 * b
[l
+ (j
+ 1) * b_dim1
];
1522 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1523 * b
[l
+ (j
+ 1) * b_dim1
];
1524 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1525 * b
[l
+ (j
+ 2) * b_dim1
];
1526 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1527 * b
[l
+ (j
+ 2) * b_dim1
];
1528 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1529 * b
[l
+ (j
+ 3) * b_dim1
];
1530 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1531 * b
[l
+ (j
+ 3) * b_dim1
];
1532 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1533 * b
[l
+ j
* b_dim1
];
1534 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1535 * b
[l
+ j
* b_dim1
];
1536 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1537 * b
[l
+ (j
+ 1) * b_dim1
];
1538 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1539 * b
[l
+ (j
+ 1) * b_dim1
];
1540 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1541 * b
[l
+ (j
+ 2) * b_dim1
];
1542 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1543 * b
[l
+ (j
+ 2) * b_dim1
];
1544 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1545 * b
[l
+ (j
+ 3) * b_dim1
];
1546 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1547 * b
[l
+ (j
+ 3) * b_dim1
];
1549 c
[i
+ j
* c_dim1
] = f11
;
1550 c
[i
+ 1 + j
* c_dim1
] = f21
;
1551 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1552 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
1553 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1554 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
1555 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1556 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
1557 c
[i
+ 2 + j
* c_dim1
] = f31
;
1558 c
[i
+ 3 + j
* c_dim1
] = f41
;
1559 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
1560 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
1561 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1562 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1563 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1564 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1569 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1571 f11
= c
[i
+ j
* c_dim1
];
1572 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1573 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1574 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1576 for (l
= ll
; l
<= i6
; ++l
)
1578 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1579 257] * b
[l
+ j
* b_dim1
];
1580 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1581 257] * b
[l
+ (j
+ 1) * b_dim1
];
1582 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1583 257] * b
[l
+ (j
+ 2) * b_dim1
];
1584 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1585 257] * b
[l
+ (j
+ 3) * b_dim1
];
1587 c
[i
+ j
* c_dim1
] = f11
;
1588 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1589 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1590 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1597 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1599 i5
= ii
+ uisec
- 1;
1600 for (i
= ii
; i
<= i5
; i
+= 4)
1602 f11
= c
[i
+ j
* c_dim1
];
1603 f21
= c
[i
+ 1 + j
* c_dim1
];
1604 f31
= c
[i
+ 2 + j
* c_dim1
];
1605 f41
= c
[i
+ 3 + j
* c_dim1
];
1607 for (l
= ll
; l
<= i6
; ++l
)
1609 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1610 257] * b
[l
+ j
* b_dim1
];
1611 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1612 257] * b
[l
+ j
* b_dim1
];
1613 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1614 257] * b
[l
+ j
* b_dim1
];
1615 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1616 257] * b
[l
+ j
* b_dim1
];
1618 c
[i
+ j
* c_dim1
] = f11
;
1619 c
[i
+ 1 + j
* c_dim1
] = f21
;
1620 c
[i
+ 2 + j
* c_dim1
] = f31
;
1621 c
[i
+ 3 + j
* c_dim1
] = f41
;
1624 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1626 f11
= c
[i
+ j
* c_dim1
];
1628 for (l
= ll
; l
<= i6
; ++l
)
1630 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1631 257] * b
[l
+ j
* b_dim1
];
1633 c
[i
+ j
* c_dim1
] = f11
;
1643 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1645 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1647 const GFC_COMPLEX_4
*restrict abase_x
;
1648 const GFC_COMPLEX_4
*restrict bbase_y
;
1649 GFC_COMPLEX_4
*restrict dest_y
;
1652 for (y
= 0; y
< ycount
; y
++)
1654 bbase_y
= &bbase
[y
*bystride
];
1655 dest_y
= &dest
[y
*rystride
];
1656 for (x
= 0; x
< xcount
; x
++)
1658 abase_x
= &abase
[x
*axstride
];
1659 s
= (GFC_COMPLEX_4
) 0;
1660 for (n
= 0; n
< count
; n
++)
1661 s
+= abase_x
[n
] * bbase_y
[n
];
1668 const GFC_COMPLEX_4
*restrict bbase_y
;
1671 for (y
= 0; y
< ycount
; y
++)
1673 bbase_y
= &bbase
[y
*bystride
];
1674 s
= (GFC_COMPLEX_4
) 0;
1675 for (n
= 0; n
< count
; n
++)
1676 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1677 dest
[y
*rystride
] = s
;
1681 else if (axstride
< aystride
)
1683 for (y
= 0; y
< ycount
; y
++)
1684 for (x
= 0; x
< xcount
; x
++)
1685 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_4
)0;
1687 for (y
= 0; y
< ycount
; y
++)
1688 for (n
= 0; n
< count
; n
++)
1689 for (x
= 0; x
< xcount
; x
++)
1690 /* dest[x,y] += a[x,n] * b[n,y] */
1691 dest
[x
*rxstride
+ y
*rystride
] +=
1692 abase
[x
*axstride
+ n
*aystride
] *
1693 bbase
[n
*bxstride
+ y
*bystride
];
1695 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1697 const GFC_COMPLEX_4
*restrict bbase_y
;
1700 for (y
= 0; y
< ycount
; y
++)
1702 bbase_y
= &bbase
[y
*bystride
];
1703 s
= (GFC_COMPLEX_4
) 0;
1704 for (n
= 0; n
< count
; n
++)
1705 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1706 dest
[y
*rxstride
] = s
;
1711 const GFC_COMPLEX_4
*restrict abase_x
;
1712 const GFC_COMPLEX_4
*restrict bbase_y
;
1713 GFC_COMPLEX_4
*restrict dest_y
;
1716 for (y
= 0; y
< ycount
; y
++)
1718 bbase_y
= &bbase
[y
*bystride
];
1719 dest_y
= &dest
[y
*rystride
];
1720 for (x
= 0; x
< xcount
; x
++)
1722 abase_x
= &abase
[x
*axstride
];
1723 s
= (GFC_COMPLEX_4
) 0;
1724 for (n
= 0; n
< count
; n
++)
1725 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1726 dest_y
[x
*rxstride
] = s
;
1735 #endif /* HAVE_AVX512F */
1737 /* AMD-specific functions with AVX128 and FMA3/FMA4.  */
1739 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1741 matmul_c4_avx128_fma3 (gfc_array_c4
* const restrict retarray
,
1742 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
1743 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx,fma")));
1744 internal_proto(matmul_c4_avx128_fma3
);
1747 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1749 matmul_c4_avx128_fma4 (gfc_array_c4
* const restrict retarray
,
1750 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
1751 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx,fma4")));
1752 internal_proto(matmul_c4_avx128_fma4
);
1755 /* Function to fall back to if there is no special processor-specific version. */
1757 matmul_c4_vanilla (gfc_array_c4
* const restrict retarray
,
1758 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
1759 int blas_limit
, blas_call gemm
)
1761 const GFC_COMPLEX_4
* restrict abase
;
1762 const GFC_COMPLEX_4
* restrict bbase
;
1763 GFC_COMPLEX_4
* restrict dest
;
1765 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1766 index_type x
, y
, n
, count
, xcount
, ycount
;
1768 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1769 || GFC_DESCRIPTOR_RANK (b
) == 2);
1771 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1773 Either A or B (but not both) can be rank 1:
1775 o One-dimensional argument A is implicitly treated as a row matrix
1776 dimensioned [1,count], so xcount=1.
1778 o One-dimensional argument B is implicitly treated as a column matrix
1779 dimensioned [count, 1], so ycount=1.
1782 if (retarray
->base_addr
== NULL
)
1784 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1786 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1787 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1789 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1791 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1792 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1796 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1797 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1799 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1800 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1801 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1805 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_4
));
1806 retarray
->offset
= 0;
1808 else if (unlikely (compile_options
.bounds_check
))
1810 index_type ret_extent
, arg_extent
;
1812 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1814 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1815 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1816 if (arg_extent
!= ret_extent
)
1817 runtime_error ("Incorrect extent in return array in"
1818 " MATMUL intrinsic: is %ld, should be %ld",
1819 (long int) ret_extent
, (long int) arg_extent
);
1821 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1823 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1824 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1825 if (arg_extent
!= ret_extent
)
1826 runtime_error ("Incorrect extent in return array in"
1827 " MATMUL intrinsic: is %ld, should be %ld",
1828 (long int) ret_extent
, (long int) arg_extent
);
1832 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1833 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1834 if (arg_extent
!= ret_extent
)
1835 runtime_error ("Incorrect extent in return array in"
1836 " MATMUL intrinsic for dimension 1:"
1837 " is %ld, should be %ld",
1838 (long int) ret_extent
, (long int) arg_extent
);
1840 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1841 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1842 if (arg_extent
!= ret_extent
)
1843 runtime_error ("Incorrect extent in return array in"
1844 " MATMUL intrinsic for dimension 2:"
1845 " is %ld, should be %ld",
1846 (long int) ret_extent
, (long int) arg_extent
);
1851 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
1853 /* One-dimensional result may be addressed in the code below
1854 either as a row or a column matrix. We want both cases to
1856 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1860 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1861 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1865 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1867 /* Treat it as a row matrix A[1,count]. */
1868 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1872 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1876 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1877 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1879 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1880 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1883 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1885 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1886 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1889 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1891 /* Treat it as a column matrix B[count,1] */
1892 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1894 /* bystride should never be used for 1-dimensional b.
1895 The value is only used for calculation of the
1896 memory by the buffer. */
1902 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1903 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1904 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1907 abase
= a
->base_addr
;
1908 bbase
= b
->base_addr
;
1909 dest
= retarray
->base_addr
;
1911 /* Now that everything is set up, we perform the multiplication
1914 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1915 #define min(a,b) ((a) <= (b) ? (a) : (b))
1916 #define max(a,b) ((a) >= (b) ? (a) : (b))
1918 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1919 && (bxstride
== 1 || bystride
== 1)
1920 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1921 > POW3(blas_limit
)))
1923 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1924 const GFC_COMPLEX_4 one
= 1, zero
= 0;
1925 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1926 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1928 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1930 assert (gemm
!= NULL
);
1931 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1932 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1938 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1940 /* This block of code implements a tuned matmul, derived from
1941 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1943 Bo Kagstrom and Per Ling
1944 Department of Computing Science
1946 S-901 87 Umea, Sweden
1948 from netlib.org, translated to C, and modified for matmul.m4. */
1950 const GFC_COMPLEX_4
*a
, *b
;
1952 const index_type m
= xcount
, n
= ycount
, k
= count
;
1954 /* System generated locals */
1955 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1956 i1
, i2
, i3
, i4
, i5
, i6
;
1958 /* Local variables */
1959 GFC_COMPLEX_4 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1960 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1961 index_type i
, j
, l
, ii
, jj
, ll
;
1962 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1967 c
= retarray
->base_addr
;
1969 /* Parameter adjustments */
1971 c_offset
= 1 + c_dim1
;
1974 a_offset
= 1 + a_dim1
;
1977 b_offset
= 1 + b_dim1
;
1980 /* Empty c first. */
1981 for (j
=1; j
<=n
; j
++)
1982 for (i
=1; i
<=m
; i
++)
1983 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_4
)0;
1985 /* Early exit if possible */
1986 if (m
== 0 || n
== 0 || k
== 0)
1989 /* Adjust size of t1 to what is needed. */
1991 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
1995 t1
= malloc (t1_dim
* sizeof(GFC_COMPLEX_4
));
1997 /* Start turning the crank. */
1999 for (jj
= 1; jj
<= i1
; jj
+= 512)
2005 ujsec
= jsec
- jsec
% 4;
2007 for (ll
= 1; ll
<= i2
; ll
+= 256)
2013 ulsec
= lsec
- lsec
% 2;
2016 for (ii
= 1; ii
<= i3
; ii
+= 256)
2022 uisec
= isec
- isec
% 2;
2023 i4
= ll
+ ulsec
- 1;
2024 for (l
= ll
; l
<= i4
; l
+= 2)
2026 i5
= ii
+ uisec
- 1;
2027 for (i
= ii
; i
<= i5
; i
+= 2)
2029 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
2031 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
2032 a
[i
+ (l
+ 1) * a_dim1
];
2033 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
2034 a
[i
+ 1 + l
* a_dim1
];
2035 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
2036 a
[i
+ 1 + (l
+ 1) * a_dim1
];
2040 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
2041 a
[ii
+ isec
- 1 + l
* a_dim1
];
2042 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
2043 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
2049 for (i
= ii
; i
<= i4
; ++i
)
2051 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2052 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2056 uisec
= isec
- isec
% 4;
2057 i4
= jj
+ ujsec
- 1;
2058 for (j
= jj
; j
<= i4
; j
+= 4)
2060 i5
= ii
+ uisec
- 1;
2061 for (i
= ii
; i
<= i5
; i
+= 4)
2063 f11
= c
[i
+ j
* c_dim1
];
2064 f21
= c
[i
+ 1 + j
* c_dim1
];
2065 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2066 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2067 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2068 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2069 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2070 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2071 f31
= c
[i
+ 2 + j
* c_dim1
];
2072 f41
= c
[i
+ 3 + j
* c_dim1
];
2073 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2074 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2075 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2076 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2077 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2078 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2080 for (l
= ll
; l
<= i6
; ++l
)
2082 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2083 * b
[l
+ j
* b_dim1
];
2084 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2085 * b
[l
+ j
* b_dim1
];
2086 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2087 * b
[l
+ (j
+ 1) * b_dim1
];
2088 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2089 * b
[l
+ (j
+ 1) * b_dim1
];
2090 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2091 * b
[l
+ (j
+ 2) * b_dim1
];
2092 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2093 * b
[l
+ (j
+ 2) * b_dim1
];
2094 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2095 * b
[l
+ (j
+ 3) * b_dim1
];
2096 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2097 * b
[l
+ (j
+ 3) * b_dim1
];
2098 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2099 * b
[l
+ j
* b_dim1
];
2100 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2101 * b
[l
+ j
* b_dim1
];
2102 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2103 * b
[l
+ (j
+ 1) * b_dim1
];
2104 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2105 * b
[l
+ (j
+ 1) * b_dim1
];
2106 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2107 * b
[l
+ (j
+ 2) * b_dim1
];
2108 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2109 * b
[l
+ (j
+ 2) * b_dim1
];
2110 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2111 * b
[l
+ (j
+ 3) * b_dim1
];
2112 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2113 * b
[l
+ (j
+ 3) * b_dim1
];
2115 c
[i
+ j
* c_dim1
] = f11
;
2116 c
[i
+ 1 + j
* c_dim1
] = f21
;
2117 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2118 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2119 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2120 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2121 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2122 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2123 c
[i
+ 2 + j
* c_dim1
] = f31
;
2124 c
[i
+ 3 + j
* c_dim1
] = f41
;
2125 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2126 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2127 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2128 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2129 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2130 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2135 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2137 f11
= c
[i
+ j
* c_dim1
];
2138 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2139 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2140 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2142 for (l
= ll
; l
<= i6
; ++l
)
2144 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2145 257] * b
[l
+ j
* b_dim1
];
2146 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2147 257] * b
[l
+ (j
+ 1) * b_dim1
];
2148 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2149 257] * b
[l
+ (j
+ 2) * b_dim1
];
2150 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2151 257] * b
[l
+ (j
+ 3) * b_dim1
];
2153 c
[i
+ j
* c_dim1
] = f11
;
2154 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2155 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2156 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2163 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2165 i5
= ii
+ uisec
- 1;
2166 for (i
= ii
; i
<= i5
; i
+= 4)
2168 f11
= c
[i
+ j
* c_dim1
];
2169 f21
= c
[i
+ 1 + j
* c_dim1
];
2170 f31
= c
[i
+ 2 + j
* c_dim1
];
2171 f41
= c
[i
+ 3 + j
* c_dim1
];
2173 for (l
= ll
; l
<= i6
; ++l
)
2175 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2176 257] * b
[l
+ j
* b_dim1
];
2177 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2178 257] * b
[l
+ j
* b_dim1
];
2179 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2180 257] * b
[l
+ j
* b_dim1
];
2181 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2182 257] * b
[l
+ j
* b_dim1
];
2184 c
[i
+ j
* c_dim1
] = f11
;
2185 c
[i
+ 1 + j
* c_dim1
] = f21
;
2186 c
[i
+ 2 + j
* c_dim1
] = f31
;
2187 c
[i
+ 3 + j
* c_dim1
] = f41
;
2190 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2192 f11
= c
[i
+ j
* c_dim1
];
2194 for (l
= ll
; l
<= i6
; ++l
)
2196 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2197 257] * b
[l
+ j
* b_dim1
];
2199 c
[i
+ j
* c_dim1
] = f11
;
2209 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
2211 if (GFC_DESCRIPTOR_RANK (a
) != 1)
2213 const GFC_COMPLEX_4
*restrict abase_x
;
2214 const GFC_COMPLEX_4
*restrict bbase_y
;
2215 GFC_COMPLEX_4
*restrict dest_y
;
2218 for (y
= 0; y
< ycount
; y
++)
2220 bbase_y
= &bbase
[y
*bystride
];
2221 dest_y
= &dest
[y
*rystride
];
2222 for (x
= 0; x
< xcount
; x
++)
2224 abase_x
= &abase
[x
*axstride
];
2225 s
= (GFC_COMPLEX_4
) 0;
2226 for (n
= 0; n
< count
; n
++)
2227 s
+= abase_x
[n
] * bbase_y
[n
];
2234 const GFC_COMPLEX_4
*restrict bbase_y
;
2237 for (y
= 0; y
< ycount
; y
++)
2239 bbase_y
= &bbase
[y
*bystride
];
2240 s
= (GFC_COMPLEX_4
) 0;
2241 for (n
= 0; n
< count
; n
++)
2242 s
+= abase
[n
*axstride
] * bbase_y
[n
];
2243 dest
[y
*rystride
] = s
;
2247 else if (axstride
< aystride
)
2249 for (y
= 0; y
< ycount
; y
++)
2250 for (x
= 0; x
< xcount
; x
++)
2251 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_4
)0;
2253 for (y
= 0; y
< ycount
; y
++)
2254 for (n
= 0; n
< count
; n
++)
2255 for (x
= 0; x
< xcount
; x
++)
2256 /* dest[x,y] += a[x,n] * b[n,y] */
2257 dest
[x
*rxstride
+ y
*rystride
] +=
2258 abase
[x
*axstride
+ n
*aystride
] *
2259 bbase
[n
*bxstride
+ y
*bystride
];
2261 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
2263 const GFC_COMPLEX_4
*restrict bbase_y
;
2266 for (y
= 0; y
< ycount
; y
++)
2268 bbase_y
= &bbase
[y
*bystride
];
2269 s
= (GFC_COMPLEX_4
) 0;
2270 for (n
= 0; n
< count
; n
++)
2271 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
2272 dest
[y
*rxstride
] = s
;
2277 const GFC_COMPLEX_4
*restrict abase_x
;
2278 const GFC_COMPLEX_4
*restrict bbase_y
;
2279 GFC_COMPLEX_4
*restrict dest_y
;
2282 for (y
= 0; y
< ycount
; y
++)
2284 bbase_y
= &bbase
[y
*bystride
];
2285 dest_y
= &dest
[y
*rystride
];
2286 for (x
= 0; x
< xcount
; x
++)
2288 abase_x
= &abase
[x
*axstride
];
2289 s
= (GFC_COMPLEX_4
) 0;
2290 for (n
= 0; n
< count
; n
++)
2291 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
2292 dest_y
[x
*rxstride
] = s
;
2302 /* Compiling main function, with selection code for the processor. */
2304 /* Currently, this is i386 only. Adjust for other architectures. */
2306 #include <config/i386/cpuinfo.h>
2307 void matmul_c4 (gfc_array_c4
* const restrict retarray
,
2308 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
2309 int blas_limit
, blas_call gemm
)
2311 static void (*matmul_p
) (gfc_array_c4
* const restrict retarray
,
2312 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
2313 int blas_limit
, blas_call gemm
);
2315 void (*matmul_fn
) (gfc_array_c4
* const restrict retarray
,
2316 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
2317 int blas_limit
, blas_call gemm
);
2319 matmul_fn
= __atomic_load_n (&matmul_p
, __ATOMIC_RELAXED
);
2320 if (matmul_fn
== NULL
)
2322 matmul_fn
= matmul_c4_vanilla
;
2323 if (__cpu_model
.__cpu_vendor
== VENDOR_INTEL
)
2325 /* Run down the available processors in order of preference. */
2327 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX512F
))
2329 matmul_fn
= matmul_c4_avx512f
;
2333 #endif /* HAVE_AVX512F */
2336 if ((__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX2
))
2337 && (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_FMA
)))
2339 matmul_fn
= matmul_c4_avx2
;
2346 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX
))
2348 matmul_fn
= matmul_c4_avx
;
2351 #endif /* HAVE_AVX */
2353 else if (__cpu_model
.__cpu_vendor
== VENDOR_AMD
)
2355 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2356 if ((__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX
))
2357 && (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_FMA
)))
2359 matmul_fn
= matmul_c4_avx128_fma3
;
2363 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2364 if ((__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX
))
2365 && (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_FMA4
)))
2367 matmul_fn
= matmul_c4_avx128_fma4
;
2374 __atomic_store_n (&matmul_p
, matmul_fn
, __ATOMIC_RELAXED
);
2377 (*matmul_fn
) (retarray
, a
, b
, try_blas
, blas_limit
, gemm
);
2380 #else /* Just the vanilla function. */
2383 matmul_c4 (gfc_array_c4
* const restrict retarray
,
2384 gfc_array_c4
* const restrict a
, gfc_array_c4
* const restrict b
, int try_blas
,
2385 int blas_limit
, blas_call gemm
)
2387 const GFC_COMPLEX_4
* restrict abase
;
2388 const GFC_COMPLEX_4
* restrict bbase
;
2389 GFC_COMPLEX_4
* restrict dest
;
2391 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
2392 index_type x
, y
, n
, count
, xcount
, ycount
;
2394 assert (GFC_DESCRIPTOR_RANK (a
) == 2
2395 || GFC_DESCRIPTOR_RANK (b
) == 2);
2397 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2399 Either A or B (but not both) can be rank 1:
2401 o One-dimensional argument A is implicitly treated as a row matrix
2402 dimensioned [1,count], so xcount=1.
2404 o One-dimensional argument B is implicitly treated as a column matrix
2405 dimensioned [count, 1], so ycount=1.
2408 if (retarray
->base_addr
== NULL
)
2410 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2412 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2413 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
2415 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
2417 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2418 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
2422 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2423 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
2425 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
2426 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
2427 GFC_DESCRIPTOR_EXTENT(retarray
,0));
2431 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_4
));
2432 retarray
->offset
= 0;
2434 else if (unlikely (compile_options
.bounds_check
))
2436 index_type ret_extent
, arg_extent
;
2438 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2440 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
2441 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2442 if (arg_extent
!= ret_extent
)
2443 runtime_error ("Incorrect extent in return array in"
2444 " MATMUL intrinsic: is %ld, should be %ld",
2445 (long int) ret_extent
, (long int) arg_extent
);
2447 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
2449 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
2450 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2451 if (arg_extent
!= ret_extent
)
2452 runtime_error ("Incorrect extent in return array in"
2453 " MATMUL intrinsic: is %ld, should be %ld",
2454 (long int) ret_extent
, (long int) arg_extent
);
2458 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
2459 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2460 if (arg_extent
!= ret_extent
)
2461 runtime_error ("Incorrect extent in return array in"
2462 " MATMUL intrinsic for dimension 1:"
2463 " is %ld, should be %ld",
2464 (long int) ret_extent
, (long int) arg_extent
);
2466 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
2467 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
2468 if (arg_extent
!= ret_extent
)
2469 runtime_error ("Incorrect extent in return array in"
2470 " MATMUL intrinsic for dimension 2:"
2471 " is %ld, should be %ld",
2472 (long int) ret_extent
, (long int) arg_extent
);
2477 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
2479 /* One-dimensional result may be addressed in the code below
2480 either as a row or a column matrix. We want both cases to
2482 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
2486 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
2487 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
2491 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2493 /* Treat it as a row matrix A[1,count]. */
2494 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
2498 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
2502 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
2503 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
2505 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
2506 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
2509 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
2511 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
2512 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2515 if (GFC_DESCRIPTOR_RANK (b
) == 1)
2517 /* Treat it as a column matrix B[count,1] */
2518 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
2520 /* bystride should never be used for 1-dimensional b.
2521 The value is only used for calculation of the
2522 memory by the buffer. */
2528 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
2529 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
2530 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
2533 abase
= a
->base_addr
;
2534 bbase
= b
->base_addr
;
2535 dest
= retarray
->base_addr
;
2537 /* Now that everything is set up, we perform the multiplication
2540 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2541 #define min(a,b) ((a) <= (b) ? (a) : (b))
2542 #define max(a,b) ((a) >= (b) ? (a) : (b))
2544 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
2545 && (bxstride
== 1 || bystride
== 1)
2546 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
2547 > POW3(blas_limit
)))
2549 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
2550 const GFC_COMPLEX_4 one
= 1, zero
= 0;
2551 const int lda
= (axstride
== 1) ? aystride
: axstride
,
2552 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
2554 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
2556 assert (gemm
!= NULL
);
2557 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
2558 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
2564 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
2566 /* This block of code implements a tuned matmul, derived from
2567 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2569 Bo Kagstrom and Per Ling
2570 Department of Computing Science
2572 S-901 87 Umea, Sweden
2574 from netlib.org, translated to C, and modified for matmul.m4. */
2576 const GFC_COMPLEX_4
*a
, *b
;
2578 const index_type m
= xcount
, n
= ycount
, k
= count
;
2580 /* System generated locals */
2581 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
2582 i1
, i2
, i3
, i4
, i5
, i6
;
2584 /* Local variables */
2585 GFC_COMPLEX_4 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
2586 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
2587 index_type i
, j
, l
, ii
, jj
, ll
;
2588 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
2593 c
= retarray
->base_addr
;
2595 /* Parameter adjustments */
2597 c_offset
= 1 + c_dim1
;
2600 a_offset
= 1 + a_dim1
;
2603 b_offset
= 1 + b_dim1
;
2606 /* Empty c first. */
2607 for (j
=1; j
<=n
; j
++)
2608 for (i
=1; i
<=m
; i
++)
2609 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_4
)0;
2611 /* Early exit if possible */
2612 if (m
== 0 || n
== 0 || k
== 0)
2615 /* Adjust size of t1 to what is needed. */
2617 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
2621 t1
= malloc (t1_dim
* sizeof(GFC_COMPLEX_4
));
2623 /* Start turning the crank. */
2625 for (jj
= 1; jj
<= i1
; jj
+= 512)
2631 ujsec
= jsec
- jsec
% 4;
2633 for (ll
= 1; ll
<= i2
; ll
+= 256)
2639 ulsec
= lsec
- lsec
% 2;
2642 for (ii
= 1; ii
<= i3
; ii
+= 256)
2648 uisec
= isec
- isec
% 2;
2649 i4
= ll
+ ulsec
- 1;
2650 for (l
= ll
; l
<= i4
; l
+= 2)
2652 i5
= ii
+ uisec
- 1;
2653 for (i
= ii
; i
<= i5
; i
+= 2)
2655 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
2657 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
2658 a
[i
+ (l
+ 1) * a_dim1
];
2659 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
2660 a
[i
+ 1 + l
* a_dim1
];
2661 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
2662 a
[i
+ 1 + (l
+ 1) * a_dim1
];
2666 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
2667 a
[ii
+ isec
- 1 + l
* a_dim1
];
2668 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
2669 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
2675 for (i
= ii
; i
<= i4
; ++i
)
2677 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2678 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2682 uisec
= isec
- isec
% 4;
2683 i4
= jj
+ ujsec
- 1;
2684 for (j
= jj
; j
<= i4
; j
+= 4)
2686 i5
= ii
+ uisec
- 1;
2687 for (i
= ii
; i
<= i5
; i
+= 4)
2689 f11
= c
[i
+ j
* c_dim1
];
2690 f21
= c
[i
+ 1 + j
* c_dim1
];
2691 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2692 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2693 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2694 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2695 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2696 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2697 f31
= c
[i
+ 2 + j
* c_dim1
];
2698 f41
= c
[i
+ 3 + j
* c_dim1
];
2699 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2700 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2701 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2702 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2703 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2704 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2706 for (l
= ll
; l
<= i6
; ++l
)
2708 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2709 * b
[l
+ j
* b_dim1
];
2710 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2711 * b
[l
+ j
* b_dim1
];
2712 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2713 * b
[l
+ (j
+ 1) * b_dim1
];
2714 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2715 * b
[l
+ (j
+ 1) * b_dim1
];
2716 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2717 * b
[l
+ (j
+ 2) * b_dim1
];
2718 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2719 * b
[l
+ (j
+ 2) * b_dim1
];
2720 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2721 * b
[l
+ (j
+ 3) * b_dim1
];
2722 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2723 * b
[l
+ (j
+ 3) * b_dim1
];
2724 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2725 * b
[l
+ j
* b_dim1
];
2726 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2727 * b
[l
+ j
* b_dim1
];
2728 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2729 * b
[l
+ (j
+ 1) * b_dim1
];
2730 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2731 * b
[l
+ (j
+ 1) * b_dim1
];
2732 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2733 * b
[l
+ (j
+ 2) * b_dim1
];
2734 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2735 * b
[l
+ (j
+ 2) * b_dim1
];
2736 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2737 * b
[l
+ (j
+ 3) * b_dim1
];
2738 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2739 * b
[l
+ (j
+ 3) * b_dim1
];
2741 c
[i
+ j
* c_dim1
] = f11
;
2742 c
[i
+ 1 + j
* c_dim1
] = f21
;
2743 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2744 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2745 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2746 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2747 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2748 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2749 c
[i
+ 2 + j
* c_dim1
] = f31
;
2750 c
[i
+ 3 + j
* c_dim1
] = f41
;
2751 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2752 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2753 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2754 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2755 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2756 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2761 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2763 f11
= c
[i
+ j
* c_dim1
];
2764 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2765 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2766 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2768 for (l
= ll
; l
<= i6
; ++l
)
2770 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2771 257] * b
[l
+ j
* b_dim1
];
2772 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2773 257] * b
[l
+ (j
+ 1) * b_dim1
];
2774 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2775 257] * b
[l
+ (j
+ 2) * b_dim1
];
2776 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2777 257] * b
[l
+ (j
+ 3) * b_dim1
];
2779 c
[i
+ j
* c_dim1
] = f11
;
2780 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2781 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2782 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2789 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2791 i5
= ii
+ uisec
- 1;
2792 for (i
= ii
; i
<= i5
; i
+= 4)
2794 f11
= c
[i
+ j
* c_dim1
];
2795 f21
= c
[i
+ 1 + j
* c_dim1
];
2796 f31
= c
[i
+ 2 + j
* c_dim1
];
2797 f41
= c
[i
+ 3 + j
* c_dim1
];
2799 for (l
= ll
; l
<= i6
; ++l
)
2801 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2802 257] * b
[l
+ j
* b_dim1
];
2803 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2804 257] * b
[l
+ j
* b_dim1
];
2805 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2806 257] * b
[l
+ j
* b_dim1
];
2807 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2808 257] * b
[l
+ j
* b_dim1
];
2810 c
[i
+ j
* c_dim1
] = f11
;
2811 c
[i
+ 1 + j
* c_dim1
] = f21
;
2812 c
[i
+ 2 + j
* c_dim1
] = f31
;
2813 c
[i
+ 3 + j
* c_dim1
] = f41
;
2816 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2818 f11
= c
[i
+ j
* c_dim1
];
2820 for (l
= ll
; l
<= i6
; ++l
)
2822 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2823 257] * b
[l
+ j
* b_dim1
];
2825 c
[i
+ j
* c_dim1
] = f11
;
2835 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
2837 if (GFC_DESCRIPTOR_RANK (a
) != 1)
2839 const GFC_COMPLEX_4
*restrict abase_x
;
2840 const GFC_COMPLEX_4
*restrict bbase_y
;
2841 GFC_COMPLEX_4
*restrict dest_y
;
2844 for (y
= 0; y
< ycount
; y
++)
2846 bbase_y
= &bbase
[y
*bystride
];
2847 dest_y
= &dest
[y
*rystride
];
2848 for (x
= 0; x
< xcount
; x
++)
2850 abase_x
= &abase
[x
*axstride
];
2851 s
= (GFC_COMPLEX_4
) 0;
2852 for (n
= 0; n
< count
; n
++)
2853 s
+= abase_x
[n
] * bbase_y
[n
];
2860 const GFC_COMPLEX_4
*restrict bbase_y
;
2863 for (y
= 0; y
< ycount
; y
++)
2865 bbase_y
= &bbase
[y
*bystride
];
2866 s
= (GFC_COMPLEX_4
) 0;
2867 for (n
= 0; n
< count
; n
++)
2868 s
+= abase
[n
*axstride
] * bbase_y
[n
];
2869 dest
[y
*rystride
] = s
;
2873 else if (axstride
< aystride
)
2875 for (y
= 0; y
< ycount
; y
++)
2876 for (x
= 0; x
< xcount
; x
++)
2877 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_4
)0;
2879 for (y
= 0; y
< ycount
; y
++)
2880 for (n
= 0; n
< count
; n
++)
2881 for (x
= 0; x
< xcount
; x
++)
2882 /* dest[x,y] += a[x,n] * b[n,y] */
2883 dest
[x
*rxstride
+ y
*rystride
] +=
2884 abase
[x
*axstride
+ n
*aystride
] *
2885 bbase
[n
*bxstride
+ y
*bystride
];
2887 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
2889 const GFC_COMPLEX_4
*restrict bbase_y
;
2892 for (y
= 0; y
< ycount
; y
++)
2894 bbase_y
= &bbase
[y
*bystride
];
2895 s
= (GFC_COMPLEX_4
) 0;
2896 for (n
= 0; n
< count
; n
++)
2897 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
2898 dest
[y
*rxstride
] = s
;
2903 const GFC_COMPLEX_4
*restrict abase_x
;
2904 const GFC_COMPLEX_4
*restrict bbase_y
;
2905 GFC_COMPLEX_4
*restrict dest_y
;
2908 for (y
= 0; y
< ycount
; y
++)
2910 bbase_y
= &bbase
[y
*bystride
];
2911 dest_y
= &dest
[y
*rystride
];
2912 for (x
= 0; x
< xcount
; x
++)
2914 abase_x
= &abase
[x
*axstride
];
2915 s
= (GFC_COMPLEX_4
) 0;
2916 for (n
= 0; n
< count
; n
++)
2917 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
2918 dest_y
[x
*rxstride
] = s
;