1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
6 #include "lib/jpegli/downsample.h"
8 #undef HWY_TARGET_INCLUDE
9 #define HWY_TARGET_INCLUDE "lib/jpegli/downsample.cc"
10 #include <hwy/foreach_target.h>
11 #include <hwy/highway.h>
13 #include "lib/jpegli/encode_internal.h"
14 #include "lib/jpegli/error.h"
16 HWY_BEFORE_NAMESPACE();
18 namespace HWY_NAMESPACE
{
20 // These templates are not found via ADL.
21 using hwy::HWY_NAMESPACE::Add
;
22 using hwy::HWY_NAMESPACE::Mul
;
23 using hwy::HWY_NAMESPACE::Vec
;
25 using D
= HWY_CAPPED(float, 8);
28 void DownsampleRow2x1(const float* row_in
, size_t len
, float* row_out
) {
29 const size_t N
= Lanes(d
);
30 const size_t len_out
= len
/ 2;
31 const auto mul
= Set(d
, 0.5f
);
33 for (size_t x
= 0; x
< len_out
; x
+= N
) {
34 LoadInterleaved2(d
, row_in
+ 2 * x
, v0
, v1
);
35 Store(Mul(mul
, Add(v0
, v1
)), d
, row_out
+ x
);
39 void DownsampleRow3x1(const float* row_in
, size_t len
, float* row_out
) {
40 const size_t N
= Lanes(d
);
41 const size_t len_out
= len
/ 3;
42 const auto mul
= Set(d
, 1.0f
/ 3);
44 for (size_t x
= 0; x
< len_out
; x
+= N
) {
45 LoadInterleaved3(d
, row_in
+ 3 * x
, v0
, v1
, v2
);
46 Store(Mul(mul
, Add(Add(v0
, v1
), v2
)), d
, row_out
+ x
);
50 void DownsampleRow4x1(const float* row_in
, size_t len
, float* row_out
) {
51 const size_t N
= Lanes(d
);
52 const size_t len_out
= len
/ 4;
53 const auto mul
= Set(d
, 0.25f
);
54 Vec
<D
> v0
, v1
, v2
, v3
;
55 for (size_t x
= 0; x
< len_out
; x
+= N
) {
56 LoadInterleaved4(d
, row_in
+ 4 * x
, v0
, v1
, v2
, v3
);
57 Store(Mul(mul
, Add(Add(v0
, v1
), Add(v2
, v3
))), d
, row_out
+ x
);
61 void Downsample2x1(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
63 DownsampleRow2x1(rows_in
[0], len
, row_out
);
66 void Downsample3x1(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
68 DownsampleRow3x1(rows_in
[0], len
, row_out
);
71 void Downsample4x1(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
73 DownsampleRow4x1(rows_in
[0], len
, row_out
);
76 void Downsample1x2(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
78 const size_t N
= Lanes(d
);
79 const auto mul
= Set(d
, 0.5f
);
80 float* row0
= rows_in
[0];
81 float* row1
= rows_in
[1];
82 for (size_t x
= 0; x
< len
; x
+= N
) {
83 Store(Mul(mul
, Add(Load(d
, row0
+ x
), Load(d
, row1
+ x
))), d
, row_out
+ x
);
87 void Downsample2x2(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
89 const size_t N
= Lanes(d
);
90 const size_t len_out
= len
/ 2;
91 const auto mul
= Set(d
, 0.25f
);
92 float* row0
= rows_in
[0];
93 float* row1
= rows_in
[1];
94 Vec
<D
> v0
, v1
, v2
, v3
;
95 for (size_t x
= 0; x
< len_out
; x
+= N
) {
96 LoadInterleaved2(d
, row0
+ 2 * x
, v0
, v1
);
97 LoadInterleaved2(d
, row1
+ 2 * x
, v2
, v3
);
98 Store(Mul(mul
, Add(Add(v0
, v1
), Add(v2
, v3
))), d
, row_out
+ x
);
102 void Downsample3x2(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
104 DownsampleRow3x1(rows_in
[0], len
, rows_in
[0]);
105 DownsampleRow3x1(rows_in
[1], len
, rows_in
[1]);
106 Downsample1x2(rows_in
, len
/ 3, row_out
);
109 void Downsample4x2(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
111 DownsampleRow4x1(rows_in
[0], len
, rows_in
[0]);
112 DownsampleRow4x1(rows_in
[1], len
, rows_in
[1]);
113 Downsample1x2(rows_in
, len
/ 4, row_out
);
116 void Downsample1x3(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
118 const size_t N
= Lanes(d
);
119 const auto mul
= Set(d
, 1.0f
/ 3);
120 float* row0
= rows_in
[0];
121 float* row1
= rows_in
[1];
122 float* row2
= rows_in
[2];
123 for (size_t x
= 0; x
< len
; x
+= N
) {
124 const auto in0
= Load(d
, row0
+ x
);
125 const auto in1
= Load(d
, row1
+ x
);
126 const auto in2
= Load(d
, row2
+ x
);
127 Store(Mul(mul
, Add(Add(in0
, in1
), in2
)), d
, row_out
+ x
);
131 void Downsample2x3(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
133 DownsampleRow2x1(rows_in
[0], len
, rows_in
[0]);
134 DownsampleRow2x1(rows_in
[1], len
, rows_in
[1]);
135 DownsampleRow2x1(rows_in
[2], len
, rows_in
[2]);
136 Downsample1x3(rows_in
, len
/ 2, row_out
);
139 void Downsample3x3(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
141 DownsampleRow3x1(rows_in
[0], len
, rows_in
[0]);
142 DownsampleRow3x1(rows_in
[1], len
, rows_in
[1]);
143 DownsampleRow3x1(rows_in
[2], len
, rows_in
[2]);
144 Downsample1x3(rows_in
, len
/ 3, row_out
);
147 void Downsample4x3(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
149 DownsampleRow4x1(rows_in
[0], len
, rows_in
[0]);
150 DownsampleRow4x1(rows_in
[1], len
, rows_in
[1]);
151 DownsampleRow4x1(rows_in
[2], len
, rows_in
[2]);
152 Downsample1x3(rows_in
, len
/ 4, row_out
);
155 void Downsample1x4(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
157 const size_t N
= Lanes(d
);
158 const auto mul
= Set(d
, 0.25f
);
159 float* row0
= rows_in
[0];
160 float* row1
= rows_in
[1];
161 float* row2
= rows_in
[2];
162 float* row3
= rows_in
[3];
163 for (size_t x
= 0; x
< len
; x
+= N
) {
164 const auto in0
= Load(d
, row0
+ x
);
165 const auto in1
= Load(d
, row1
+ x
);
166 const auto in2
= Load(d
, row2
+ x
);
167 const auto in3
= Load(d
, row3
+ x
);
168 Store(Mul(mul
, Add(Add(in0
, in1
), Add(in2
, in3
))), d
, row_out
+ x
);
172 void Downsample2x4(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
174 DownsampleRow2x1(rows_in
[0], len
, rows_in
[0]);
175 DownsampleRow2x1(rows_in
[1], len
, rows_in
[1]);
176 DownsampleRow2x1(rows_in
[2], len
, rows_in
[2]);
177 DownsampleRow2x1(rows_in
[3], len
, rows_in
[3]);
178 Downsample1x4(rows_in
, len
/ 2, row_out
);
181 void Downsample3x4(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
183 DownsampleRow3x1(rows_in
[0], len
, rows_in
[0]);
184 DownsampleRow3x1(rows_in
[1], len
, rows_in
[1]);
185 DownsampleRow3x1(rows_in
[2], len
, rows_in
[2]);
186 DownsampleRow3x1(rows_in
[3], len
, rows_in
[3]);
187 Downsample1x4(rows_in
, len
/ 3, row_out
);
190 void Downsample4x4(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
192 DownsampleRow4x1(rows_in
[0], len
, rows_in
[0]);
193 DownsampleRow4x1(rows_in
[1], len
, rows_in
[1]);
194 DownsampleRow4x1(rows_in
[2], len
, rows_in
[2]);
195 DownsampleRow4x1(rows_in
[3], len
, rows_in
[3]);
196 Downsample1x4(rows_in
, len
/ 4, row_out
);
199 // NOLINTNEXTLINE(google-readability-namespace-comments)
200 } // namespace HWY_NAMESPACE
201 } // namespace jpegli
202 HWY_AFTER_NAMESPACE();
207 HWY_EXPORT(Downsample1x2
);
208 HWY_EXPORT(Downsample1x3
);
209 HWY_EXPORT(Downsample1x4
);
210 HWY_EXPORT(Downsample2x1
);
211 HWY_EXPORT(Downsample2x2
);
212 HWY_EXPORT(Downsample2x3
);
213 HWY_EXPORT(Downsample2x4
);
214 HWY_EXPORT(Downsample3x1
);
215 HWY_EXPORT(Downsample3x2
);
216 HWY_EXPORT(Downsample3x3
);
217 HWY_EXPORT(Downsample3x4
);
218 HWY_EXPORT(Downsample4x1
);
219 HWY_EXPORT(Downsample4x2
);
220 HWY_EXPORT(Downsample4x3
);
221 HWY_EXPORT(Downsample4x4
);
223 void NullDownsample(float* rows_in
[MAX_SAMP_FACTOR
], size_t len
,
226 void ChooseDownsampleMethods(j_compress_ptr cinfo
) {
227 jpeg_comp_master
* m
= cinfo
->master
;
228 for (int c
= 0; c
< cinfo
->num_components
; c
++) {
229 m
->downsample_method
[c
] = nullptr;
230 jpeg_component_info
* comp
= &cinfo
->comp_info
[c
];
231 const int h_factor
= cinfo
->max_h_samp_factor
/ comp
->h_samp_factor
;
232 const int v_factor
= cinfo
->max_v_samp_factor
/ comp
->v_samp_factor
;
235 m
->downsample_method
[c
] = NullDownsample
;
236 } else if (h_factor
== 2) {
237 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample2x1
);
238 } else if (h_factor
== 3) {
239 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample3x1
);
240 } else if (h_factor
== 4) {
241 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample4x1
);
243 } else if (v_factor
== 2) {
245 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample1x2
);
246 } else if (h_factor
== 2) {
247 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample2x2
);
248 } else if (h_factor
== 3) {
249 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample3x2
);
250 } else if (h_factor
== 4) {
251 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample4x2
);
253 } else if (v_factor
== 3) {
255 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample1x2
);
256 } else if (h_factor
== 2) {
257 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample2x2
);
258 } else if (h_factor
== 3) {
259 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample3x2
);
260 } else if (h_factor
== 4) {
261 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample4x2
);
263 } else if (v_factor
== 4) {
265 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample1x4
);
266 } else if (h_factor
== 2) {
267 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample2x4
);
268 } else if (h_factor
== 3) {
269 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample3x4
);
270 } else if (h_factor
== 4) {
271 m
->downsample_method
[c
] = HWY_DYNAMIC_DISPATCH(Downsample4x4
);
274 if (m
->downsample_method
[c
] == nullptr) {
275 JPEGLI_ERROR("Unsupported downsampling ratio %dx%d", h_factor
, v_factor
);
280 void DownsampleInputBuffer(j_compress_ptr cinfo
) {
281 if (cinfo
->max_h_samp_factor
== 1 && cinfo
->max_v_samp_factor
== 1) {
284 jpeg_comp_master
* m
= cinfo
->master
;
285 const size_t iMCU_height
= DCTSIZE
* cinfo
->max_v_samp_factor
;
286 const size_t y0
= m
->next_iMCU_row
* iMCU_height
;
287 const size_t y1
= y0
+ iMCU_height
;
288 const size_t xsize_padded
= m
->xsize_blocks
* DCTSIZE
;
289 for (int c
= 0; c
< cinfo
->num_components
; c
++) {
290 jpeg_component_info
* comp
= &cinfo
->comp_info
[c
];
291 const int h_factor
= cinfo
->max_h_samp_factor
/ comp
->h_samp_factor
;
292 const int v_factor
= cinfo
->max_v_samp_factor
/ comp
->v_samp_factor
;
293 if (h_factor
== 1 && v_factor
== 1) {
296 auto& input
= *m
->smooth_input
[c
];
297 auto& output
= *m
->raw_data
[c
];
298 const size_t yout0
= y0
/ v_factor
;
299 float* rows_in
[MAX_SAMP_FACTOR
];
300 for (size_t yin
= y0
, yout
= yout0
; yin
< y1
; yin
+= v_factor
, ++yout
) {
301 for (int iy
= 0; iy
< v_factor
; ++iy
) {
302 rows_in
[iy
] = input
.Row(yin
+ iy
);
304 float* row_out
= output
.Row(yout
);
305 (*m
->downsample_method
[c
])(rows_in
, xsize_padded
, row_out
);
310 void ApplyInputSmoothing(j_compress_ptr cinfo
) {
311 if (!cinfo
->smoothing_factor
) {
314 jpeg_comp_master
* m
= cinfo
->master
;
315 const float kW1
= cinfo
->smoothing_factor
/ 1024.0;
316 const float kW0
= 1.0f
- 8.0f
* kW1
;
317 const size_t iMCU_height
= DCTSIZE
* cinfo
->max_v_samp_factor
;
318 const ssize_t y0
= m
->next_iMCU_row
* iMCU_height
;
319 const ssize_t y1
= y0
+ iMCU_height
;
320 const ssize_t xsize_padded
= m
->xsize_blocks
* DCTSIZE
;
321 for (int c
= 0; c
< cinfo
->num_components
; c
++) {
322 auto& input
= m
->input_buffer
[c
];
323 auto& output
= *m
->smooth_input
[c
];
324 if (m
->next_iMCU_row
== 0) {
325 input
.CopyRow(-1, 0, 1);
327 if (m
->next_iMCU_row
+ 1 == cinfo
->total_iMCU_rows
) {
328 size_t last_row
= m
->ysize_blocks
* DCTSIZE
- 1;
329 input
.CopyRow(last_row
+ 1, last_row
, 1);
331 // TODO(szabadka) SIMDify this.
332 for (ssize_t y
= y0
; y
< y1
; ++y
) {
333 const float* row_t
= input
.Row(y
- 1);
334 const float* row_m
= input
.Row(y
);
335 const float* row_b
= input
.Row(y
+ 1);
336 float* row_out
= output
.Row(y
);
337 for (ssize_t x
= 0; x
< xsize_padded
; ++x
) {
338 float val_tl
= row_t
[x
- 1];
339 float val_tm
= row_t
[x
];
340 float val_tr
= row_t
[x
+ 1];
341 float val_ml
= row_m
[x
- 1];
342 float val_mm
= row_m
[x
];
343 float val_mr
= row_m
[x
+ 1];
344 float val_bl
= row_b
[x
- 1];
345 float val_bm
= row_b
[x
];
346 float val_br
= row_b
[x
+ 1];
347 float val1
= (val_tl
+ val_tm
+ val_tr
+ val_ml
+ val_mr
+ val_bl
+
349 row_out
[x
] = val_mm
* kW0
+ val1
* kW1
;
355 } // namespace jpegli