1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
6 #include "lib/jpegli/upsample.h"
10 #undef HWY_TARGET_INCLUDE
11 #define HWY_TARGET_INCLUDE "lib/jpegli/upsample.cc"
12 #include <hwy/foreach_target.h>
13 #include <hwy/highway.h>
15 HWY_BEFORE_NAMESPACE();
17 namespace HWY_NAMESPACE
{
19 // These templates are not found via ADL.
20 using hwy::HWY_NAMESPACE::Mul
;
21 using hwy::HWY_NAMESPACE::MulAdd
;
22 using hwy::HWY_NAMESPACE::Vec
;
25 using hwy::HWY_NAMESPACE::Half
;
26 using hwy::HWY_NAMESPACE::Vec
;
// Returns one quarter of the vector `v`, typed as Vec<Half<Half<DF>>>.
// The quarter is chosen by the compile-time index `i`:
//   i >= 2 selects UpperHalf of `v`, otherwise LowerHalf;
//   i & 1  then selects UpperHalf of that half, otherwise LowerHalf.
// So i = 0..3 walks the quarters from the lowest lanes to the highest.
// The `df` argument is not referenced in the visible body.
// NOTE(review): the tag types HF and HHF used below are not declared in the
// visible text (presumably aliases for Half<DF> and Half<HF>), and the
// closing brace of this function is also not visible -- confirm against the
// upstream file.
27 template <size_t i
, class DF
, class V
>
28 HWY_INLINE Vec
<Half
<Half
<DF
>>> Quarter(const DF df
, V v
) {
// First narrow to the half that contains the requested quarter.
31 auto half
= i
>= 2 ? UpperHalf(HF(), v
) : LowerHalf(HF(), v
);
// Then pick the upper or lower part of that half.
32 return i
& 1 ? UpperHalf(HHF(), half
) : LowerHalf(HHF(), half
);
// Builds one Vec<DF> out of four smaller vectors v0..v3 by combining
// (v3, v2) and (v1, v0) pairwise and then combining the two results.
// NOTE(review): with Highway's Combine(d, hi, lo) argument order this should
// place v0 in the lowest lanes and v3 in the highest -- confirm against the
// Highway reference.
// The `df` argument is not referenced in the visible body (DF() is used
// instead).
// NOTE(review): the tag type HF is not declared in the visible text
// (presumably Half<DF>), and the closing brace is not visible -- confirm.
35 template <class DF
, class V
>
36 HWY_INLINE Vec
<DF
> Concat4(const DF df
, V v0
, V v1
, V v2
, V v3
) {
38 return Combine(DF(), Combine(HF(), v3
, v2
), Combine(HF(), v1
, v0
));
43 // Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be
// NOTE(review): the sentence above is cut off in this copy; it presumably
// ends with "aligned", since every write below goes through Store rather than
// an unaligned store -- confirm.
//
// Parameters:
//   df     - Highway tag describing the vector type used for the stores.
//   v0, v1 - the two vectors whose lanes are interleaved.
//   mem    - destination; T must be a 4-byte type (enforced by static_assert).
// Each visible code path issues two full-vector stores, at mem and at
// mem + Lanes(df) (the scalar path stores at mem + 1).
// NOTE(review): the preprocessor structure is incomplete in the visible text:
// only the opening "#if HWY_TARGET == HWY_SCALAR" is present, with no
// #elif / #else / #endif, and several statements are missing. Which of the
// code paths below is compiled for which target cannot be determined from
// here.
45 template <class DF
, class V
, typename T
>
46 void StoreInterleaved(const DF df
, V v0
, V v1
, T
* mem
) {
47 static_assert(sizeof(T
) == 4, "only use StoreInterleaved for 4-byte types");
48 #if HWY_TARGET == HWY_SCALAR
// Scalar path: v1 goes to the second element.
// NOTE(review): the matching store of v0 to mem is not present in the
// visible text.
50 Store(v1
, df
, mem
+ 1);
// Path that stores the InterleaveLower / InterleaveUpper results directly,
// one after the other.
52 Store(InterleaveLower(df
, v0
, v1
), df
, mem
);
53 Store(InterleaveUpper(df
, v0
, v1
), df
, mem
+ Lanes(df
));
// Path taken when HWY_CAP_GE512 is false or the vector has exactly 8 lanes:
// the two interleave results are recombined with ConcatLowerLower /
// ConcatUpperUpper before being stored.
// NOTE(review): presumably needed because Highway's Interleave* operate
// within 128-bit blocks -- confirm against the Highway reference.
55 if (!HWY_CAP_GE512
|| Lanes(df
) == 8) {
56 auto t0
= InterleaveLower(df
, v0
, v1
);
57 auto t1
= InterleaveUpper(df
, v0
, v1
);
58 Store(ConcatLowerLower(df
, t1
, t0
), df
, mem
);
59 Store(ConcatUpperUpper(df
, t1
, t0
), df
, mem
+ Lanes(df
));
// Remaining path: the output vectors are assembled from quarters of t0 and
// t1. The first Concat4 uses quarters 0 and 1 of each, the second uses
// quarters 2 and 3.
// NOTE(review): the trailing arguments of both Store calls below (tag and
// destination pointer) and the closing braces are not present in the visible
// text.
62 auto t0
= InterleaveLower(df
, v0
, v1
);
63 auto t1
= InterleaveUpper(df
, v0
, v1
);
64 Store(Concat4(df
, Quarter
<0>(df
, t0
), Quarter
<0>(df
, t1
),
65 Quarter
<1>(df
, t0
), Quarter
<1>(df
, t1
)),
67 Store(Concat4(df
, Quarter
<2>(df
, t0
), Quarter
<2>(df
, t1
),
68 Quarter
<3>(df
, t0
), Quarter
<3>(df
, t1
)),
// Per-target implementation of 2x horizontal upsampling, done in place on
// `row`.
//
// Parameters:
//   row           - on entry holds len_in = (len_out + 1) >> 1 input samples;
//                   on exit holds the interleaved upsampled samples.
//   scratch_space - work buffer that receives a copy of the input. The code
//                   writes scratch_space[-1] and scratch_space[len_in], so
//                   both of those elements must be writable.
//   len_out       - number of output samples requested.
//
// For every input sample s[x] two outputs are produced:
//   row[2x]     = 0.75 * s[x] + 0.25 * s[x - 1]
//   row[2x + 1] = 0.75 * s[x] + 0.25 * s[x + 1]
// with the first and last input samples replicated as the out-of-range
// neighbours.
//
// The loop advances by Lanes(df) and always handles whole vectors, so loads
// and stores can extend past len_in (and past 2 * len_in in `row`) up to the
// next multiple of the vector size; callers presumably provide padded
// buffers -- confirm. `scratch_space + x` is read with the aligned Load and
// `row + 2x` is written via StoreInterleaved.
// NOTE(review): the declaration of the tag `df` and the closing braces are
// not present in the visible text.
75 void Upsample2Horizontal(float* JXL_RESTRICT row
,
76 float* JXL_RESTRICT scratch_space
, size_t len_out
) {
// Filter weights: 3/4 for the current sample, 1/4 for the neighbour.
78 auto threefour
= Set(df
, 0.75f
);
79 auto onefour
= Set(df
, 0.25f
);
80 const size_t len_in
= (len_out
+ 1) >> 1;
// Work from a copy of the input, because `row` is overwritten by the output.
81 memcpy(scratch_space
, row
, len_in
* sizeof(row
[0]));
// Replicate the edge samples so the x - 1 and x + 1 loads are defined at the
// borders.
82 scratch_space
[-1] = scratch_space
[0];
83 scratch_space
[len_in
] = scratch_space
[len_in
- 1];
84 for (size_t x
= 0; x
< len_in
; x
+= Lanes(df
)) {
85 auto current
= Mul(Load(df
, scratch_space
+ x
), threefour
);
// Neighbours are offset by one element, hence the unaligned loads.
86 auto prev
= LoadU(df
, scratch_space
+ x
- 1);
87 auto next
= LoadU(df
, scratch_space
+ x
+ 1);
88 auto left
= MulAdd(onefour
, prev
, current
);
89 auto right
= MulAdd(onefour
, next
, current
);
// Emit left/right alternately: each input lane yields two adjacent outputs.
90 StoreInterleaved(df
, left
, right
, row
+ x
* 2);
// Per-target implementation of 2x vertical upsampling for one input row.
//
// Parameters:
//   row_top, row_mid, row_bot - the rows above, at and below the current
//                               position (read only).
//   row_out0, row_out1        - the two output rows.
//   len                       - number of samples to process per row.
//
// For each x:
//   row_out0[x] = 0.75 * row_mid[x] + 0.25 * row_top[x]
//   row_out1[x] = 0.75 * row_mid[x] + 0.25 * row_bot[x]
//
// The loop advances by Lanes(df) and uses the aligned Load / Store on whole
// vectors, so accesses can extend past `len` up to the next multiple of the
// vector size; callers presumably provide aligned, padded rows -- confirm.
// NOTE(review): the declaration of the tag `df` and the closing braces are
// not present in the visible text.
94 void Upsample2Vertical(const float* JXL_RESTRICT row_top
,
95 const float* JXL_RESTRICT row_mid
,
96 const float* JXL_RESTRICT row_bot
,
97 float* JXL_RESTRICT row_out0
,
98 float* JXL_RESTRICT row_out1
, size_t len
) {
// Filter weights: 3/4 for the middle row, 1/4 for the neighbouring row.
100 auto threefour
= Set(df
, 0.75f
);
101 auto onefour
= Set(df
, 0.25f
);
102 for (size_t x
= 0; x
< len
; x
+= Lanes(df
)) {
103 auto it
= Load(df
, row_top
+ x
);
104 auto im
= Load(df
, row_mid
+ x
);
105 auto ib
= Load(df
, row_bot
+ x
);
// The scaled middle row is shared by both outputs.
106 auto im_scaled
= Mul(im
, threefour
);
107 Store(MulAdd(it
, onefour
, im_scaled
), df
, row_out0
+ x
);
108 Store(MulAdd(ib
, onefour
, im_scaled
), df
, row_out1
+ x
);
112 // NOLINTNEXTLINE(google-readability-namespace-comments)
113 } // namespace HWY_NAMESPACE
114 } // namespace jpegli
115 HWY_AFTER_NAMESPACE();
// Export the per-target functions so the HWY_DYNAMIC_DISPATCH calls in the
// wrappers that follow can refer to them by name.
// NOTE(review): per Highway's dispatch documentation HWY_EXPORT builds the
// per-target function table -- confirm against the Highway reference.
120 HWY_EXPORT(Upsample2Horizontal
);
121 HWY_EXPORT(Upsample2Vertical
);
// Public entry point for 2x horizontal upsampling. It forwards `row`,
// `scratch_space` and `len_out` unchanged to the Upsample2Horizontal
// implementation chosen by HWY_DYNAMIC_DISPATCH.
// NOTE(review): the closing brace of this function is not present in the
// visible text.
123 void Upsample2Horizontal(float* JXL_RESTRICT row
,
124 float* JXL_RESTRICT scratch_space
, size_t len_out
) {
125 return HWY_DYNAMIC_DISPATCH(Upsample2Horizontal
)(row
, scratch_space
, len_out
);
// Public entry point for 2x vertical upsampling. It forwards the three input
// rows, the two output rows and `len` unchanged to the Upsample2Vertical
// implementation chosen by HWY_DYNAMIC_DISPATCH.
// NOTE(review): the closing brace of this function is not present in the
// visible text.
128 void Upsample2Vertical(const float* JXL_RESTRICT row_top
,
129 const float* JXL_RESTRICT row_mid
,
130 const float* JXL_RESTRICT row_bot
,
131 float* JXL_RESTRICT row_out0
,
132 float* JXL_RESTRICT row_out1
, size_t len
) {
133 return HWY_DYNAMIC_DISPATCH(Upsample2Vertical
)(row_top
, row_mid
, row_bot
,
134 row_out0
, row_out1
, len
);
136 } // namespace jpegli