2 ! { dg-skip-if "AArch64 tiny code model does not support programs larger than 1MiB" {aarch64_tiny} }
4 program strassen_matmul
6 integer, parameter :: N
= 1024
7 double precision, save :: A(N
,N
), B(N
,N
), C(N
,N
), D(N
,N
)
8 double precision :: start
, end
11 call random_number (A
)
12 call random_number (B
)
13 start
= omp_get_wtime ()
15 end = omp_get_wtime ()
16 write(*,'(a, f10.6)') ' Time for matmul = ', end - start
18 start
= omp_get_wtime ()
19 call strassen (A
, B
, D
, N
)
20 end = omp_get_wtime ()
21 write(*,'(a, f10.6)') ' Time for Strassen = ', end - start
22 if (sqrt (sum ((C
- D
) ** 2)) / N
.gt
. 0.1) STOP 1
24 start
= omp_get_wtime ()
27 call strassen (A
, B
, D
, N
)
28 !$omp end single nowait
30 end = omp_get_wtime ()
31 write(*,'(a, f10.6)') ' Time for Strassen MP = ', end - start
32 if (sqrt (sum ((C
- D
) ** 2)) / N
.gt
. 0.1) STOP 2
36 recursive subroutine strassen (A
, B
, C
, N
)
37 integer, intent(in
) :: N
38 double precision, intent(in
) :: A(N
,N
), B(N
,N
)
39 double precision, intent(out
) :: C(N
,N
)
40 double precision :: T(N
/2,N
/2,7)
43 if (iand (N
,1) .ne
. 0 .or
. N
< 64) then
49 !$omp task shared (A, B, T)
50 call strassen (A(:K
,:K
) + A(L
:,L
:), B(:K
,:K
) + B(L
:,L
:), T(:,:,1), K
)
52 !$omp task shared (A, B, T)
53 call strassen (A(L
:,:K
) + A(L
:,L
:), B(:K
,:K
), T(:,:,2), K
)
55 !$omp task shared (A, B, T)
56 call strassen (A(:K
,:K
), B(:K
,L
:) - B(L
:,L
:), T(:,:,3), K
)
58 !$omp task shared (A, B, T)
59 call strassen (A(L
:,L
:), B(L
:,:K
) - B(:K
,:K
), T(:,:,4), K
)
61 !$omp task shared (A, B, T)
62 call strassen (A(:K
,:K
) + A(:K
,L
:), B(L
:,L
:), T(:,:,5), K
)
64 !$omp task shared (A, B, T)
65 call strassen (A(L
:,:K
) - A(:K
,:K
), B(:K
,:K
) + B(:K
,L
:), T(:,:,6), K
)
67 !$omp task shared (A, B, T)
68 call strassen (A(:K
,L
:) - A(L
:,L
:), B(L
:,:K
) + B(L
:,L
:), T(:,:,7), K
)
71 C(:K
,:K
) = T(:,:,1) + T(:,:,4) - T(:,:,5) + T(:,:,7)
72 C(L
:,:K
) = T(:,:,2) + T(:,:,4)
73 C(:K
,L
:) = T(:,:,3) + T(:,:,5)
74 C(L
:,L
:) = T(:,:,1) - T(:,:,2) + T(:,:,3) + T(:,:,6)
75 end subroutine strassen