1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
6 #if defined(LIB_JPEGLI_TRANSPOSE_INL_H_) == defined(HWY_TARGET_TOGGLE)
7 #ifdef LIB_JPEGLI_TRANSPOSE_INL_H_
8 #undef LIB_JPEGLI_TRANSPOSE_INL_H_
10 #define LIB_JPEGLI_TRANSPOSE_INL_H_
13 #include "lib/jxl/base/compiler_specific.h"
15 HWY_BEFORE_NAMESPACE();
17 namespace HWY_NAMESPACE
{
21 static JXL_INLINE
void Transpose8x8Block(const float* JXL_RESTRICT from
,
22 float* JXL_RESTRICT to
) {
23 const HWY_CAPPED(float, 8) d
;
24 auto i0
= Load(d
, from
);
25 auto i1
= Load(d
, from
+ 1 * 8);
26 auto i2
= Load(d
, from
+ 2 * 8);
27 auto i3
= Load(d
, from
+ 3 * 8);
28 auto i4
= Load(d
, from
+ 4 * 8);
29 auto i5
= Load(d
, from
+ 5 * 8);
30 auto i6
= Load(d
, from
+ 6 * 8);
31 auto i7
= Load(d
, from
+ 7 * 8);
33 const auto q0
= InterleaveLower(d
, i0
, i2
);
34 const auto q1
= InterleaveLower(d
, i1
, i3
);
35 const auto q2
= InterleaveUpper(d
, i0
, i2
);
36 const auto q3
= InterleaveUpper(d
, i1
, i3
);
37 const auto q4
= InterleaveLower(d
, i4
, i6
);
38 const auto q5
= InterleaveLower(d
, i5
, i7
);
39 const auto q6
= InterleaveUpper(d
, i4
, i6
);
40 const auto q7
= InterleaveUpper(d
, i5
, i7
);
42 const auto r0
= InterleaveLower(d
, q0
, q1
);
43 const auto r1
= InterleaveUpper(d
, q0
, q1
);
44 const auto r2
= InterleaveLower(d
, q2
, q3
);
45 const auto r3
= InterleaveUpper(d
, q2
, q3
);
46 const auto r4
= InterleaveLower(d
, q4
, q5
);
47 const auto r5
= InterleaveUpper(d
, q4
, q5
);
48 const auto r6
= InterleaveLower(d
, q6
, q7
);
49 const auto r7
= InterleaveUpper(d
, q6
, q7
);
51 i0
= ConcatLowerLower(d
, r4
, r0
);
52 i1
= ConcatLowerLower(d
, r5
, r1
);
53 i2
= ConcatLowerLower(d
, r6
, r2
);
54 i3
= ConcatLowerLower(d
, r7
, r3
);
55 i4
= ConcatUpperUpper(d
, r4
, r0
);
56 i5
= ConcatUpperUpper(d
, r5
, r1
);
57 i6
= ConcatUpperUpper(d
, r6
, r2
);
58 i7
= ConcatUpperUpper(d
, r7
, r3
);
61 Store(i1
, d
, to
+ 1 * 8);
62 Store(i2
, d
, to
+ 2 * 8);
63 Store(i3
, d
, to
+ 3 * 8);
64 Store(i4
, d
, to
+ 4 * 8);
65 Store(i5
, d
, to
+ 5 * 8);
66 Store(i6
, d
, to
+ 6 * 8);
67 Store(i7
, d
, to
+ 7 * 8);
69 #elif HWY_TARGET != HWY_SCALAR
70 static JXL_INLINE
void Transpose8x8Block(const float* JXL_RESTRICT from
,
71 float* JXL_RESTRICT to
) {
72 const HWY_CAPPED(float, 4) d
;
73 for (size_t n
= 0; n
< 8; n
+= 4) {
74 for (size_t m
= 0; m
< 8; m
+= 4) {
75 auto p0
= Load(d
, from
+ n
* 8 + m
);
76 auto p1
= Load(d
, from
+ (n
+ 1) * 8 + m
);
77 auto p2
= Load(d
, from
+ (n
+ 2) * 8 + m
);
78 auto p3
= Load(d
, from
+ (n
+ 3) * 8 + m
);
79 const auto q0
= InterleaveLower(d
, p0
, p2
);
80 const auto q1
= InterleaveLower(d
, p1
, p3
);
81 const auto q2
= InterleaveUpper(d
, p0
, p2
);
82 const auto q3
= InterleaveUpper(d
, p1
, p3
);
84 const auto r0
= InterleaveLower(d
, q0
, q1
);
85 const auto r1
= InterleaveUpper(d
, q0
, q1
);
86 const auto r2
= InterleaveLower(d
, q2
, q3
);
87 const auto r3
= InterleaveUpper(d
, q2
, q3
);
88 Store(r0
, d
, to
+ m
* 8 + n
);
89 Store(r1
, d
, to
+ (1 + m
) * 8 + n
);
90 Store(r2
, d
, to
+ (2 + m
) * 8 + n
);
91 Store(r3
, d
, to
+ (3 + m
) * 8 + n
);
96 static JXL_INLINE
void Transpose8x8Block(const float* JXL_RESTRICT from
,
97 float* JXL_RESTRICT to
) {
98 for (size_t n
= 0; n
< 8; ++n
) {
99 for (size_t m
= 0; m
< 8; ++m
) {
100 to
[8 * n
+ m
] = from
[8 * m
+ n
];
106 // NOLINTNEXTLINE(google-readability-namespace-comments)
108 } // namespace HWY_NAMESPACE
109 } // namespace jpegli
110 HWY_AFTER_NAMESPACE();
111 #endif // LIB_JPEGLI_TRANSPOSE_INL_H_