Backed out 3 changesets (bug 1790375) for causing wd failures on fetch_error.py....
[gecko.git] / third_party / jpeg-xl / lib / jpegli / downsample.cc
blobdf2c156972a5b41fedeb017a0b30047313eb68e3
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
6 #include "lib/jpegli/downsample.h"
8 #undef HWY_TARGET_INCLUDE
9 #define HWY_TARGET_INCLUDE "lib/jpegli/downsample.cc"
10 #include <hwy/foreach_target.h>
11 #include <hwy/highway.h>
13 #include "lib/jpegli/encode_internal.h"
14 #include "lib/jpegli/error.h"
16 HWY_BEFORE_NAMESPACE();
17 namespace jpegli {
18 namespace HWY_NAMESPACE {
20 // These templates are not found via ADL.
21 using hwy::HWY_NAMESPACE::Add;
22 using hwy::HWY_NAMESPACE::Mul;
23 using hwy::HWY_NAMESPACE::Vec;
25 using D = HWY_CAPPED(float, 8);
26 constexpr D d;
28 void DownsampleRow2x1(const float* row_in, size_t len, float* row_out) {
29 const size_t N = Lanes(d);
30 const size_t len_out = len / 2;
31 const auto mul = Set(d, 0.5f);
32 Vec<D> v0, v1;
33 for (size_t x = 0; x < len_out; x += N) {
34 LoadInterleaved2(d, row_in + 2 * x, v0, v1);
35 Store(Mul(mul, Add(v0, v1)), d, row_out + x);
39 void DownsampleRow3x1(const float* row_in, size_t len, float* row_out) {
40 const size_t N = Lanes(d);
41 const size_t len_out = len / 3;
42 const auto mul = Set(d, 1.0f / 3);
43 Vec<D> v0, v1, v2;
44 for (size_t x = 0; x < len_out; x += N) {
45 LoadInterleaved3(d, row_in + 3 * x, v0, v1, v2);
46 Store(Mul(mul, Add(Add(v0, v1), v2)), d, row_out + x);
50 void DownsampleRow4x1(const float* row_in, size_t len, float* row_out) {
51 const size_t N = Lanes(d);
52 const size_t len_out = len / 4;
53 const auto mul = Set(d, 0.25f);
54 Vec<D> v0, v1, v2, v3;
55 for (size_t x = 0; x < len_out; x += N) {
56 LoadInterleaved4(d, row_in + 4 * x, v0, v1, v2, v3);
57 Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x);
61 void Downsample2x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
62 float* row_out) {
63 DownsampleRow2x1(rows_in[0], len, row_out);
66 void Downsample3x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
67 float* row_out) {
68 DownsampleRow3x1(rows_in[0], len, row_out);
71 void Downsample4x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
72 float* row_out) {
73 DownsampleRow4x1(rows_in[0], len, row_out);
76 void Downsample1x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
77 float* row_out) {
78 const size_t N = Lanes(d);
79 const auto mul = Set(d, 0.5f);
80 float* row0 = rows_in[0];
81 float* row1 = rows_in[1];
82 for (size_t x = 0; x < len; x += N) {
83 Store(Mul(mul, Add(Load(d, row0 + x), Load(d, row1 + x))), d, row_out + x);
87 void Downsample2x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
88 float* row_out) {
89 const size_t N = Lanes(d);
90 const size_t len_out = len / 2;
91 const auto mul = Set(d, 0.25f);
92 float* row0 = rows_in[0];
93 float* row1 = rows_in[1];
94 Vec<D> v0, v1, v2, v3;
95 for (size_t x = 0; x < len_out; x += N) {
96 LoadInterleaved2(d, row0 + 2 * x, v0, v1);
97 LoadInterleaved2(d, row1 + 2 * x, v2, v3);
98 Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x);
102 void Downsample3x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
103 float* row_out) {
104 DownsampleRow3x1(rows_in[0], len, rows_in[0]);
105 DownsampleRow3x1(rows_in[1], len, rows_in[1]);
106 Downsample1x2(rows_in, len / 3, row_out);
109 void Downsample4x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
110 float* row_out) {
111 DownsampleRow4x1(rows_in[0], len, rows_in[0]);
112 DownsampleRow4x1(rows_in[1], len, rows_in[1]);
113 Downsample1x2(rows_in, len / 4, row_out);
116 void Downsample1x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
117 float* row_out) {
118 const size_t N = Lanes(d);
119 const auto mul = Set(d, 1.0f / 3);
120 float* row0 = rows_in[0];
121 float* row1 = rows_in[1];
122 float* row2 = rows_in[2];
123 for (size_t x = 0; x < len; x += N) {
124 const auto in0 = Load(d, row0 + x);
125 const auto in1 = Load(d, row1 + x);
126 const auto in2 = Load(d, row2 + x);
127 Store(Mul(mul, Add(Add(in0, in1), in2)), d, row_out + x);
131 void Downsample2x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
132 float* row_out) {
133 DownsampleRow2x1(rows_in[0], len, rows_in[0]);
134 DownsampleRow2x1(rows_in[1], len, rows_in[1]);
135 DownsampleRow2x1(rows_in[2], len, rows_in[2]);
136 Downsample1x3(rows_in, len / 2, row_out);
139 void Downsample3x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
140 float* row_out) {
141 DownsampleRow3x1(rows_in[0], len, rows_in[0]);
142 DownsampleRow3x1(rows_in[1], len, rows_in[1]);
143 DownsampleRow3x1(rows_in[2], len, rows_in[2]);
144 Downsample1x3(rows_in, len / 3, row_out);
147 void Downsample4x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
148 float* row_out) {
149 DownsampleRow4x1(rows_in[0], len, rows_in[0]);
150 DownsampleRow4x1(rows_in[1], len, rows_in[1]);
151 DownsampleRow4x1(rows_in[2], len, rows_in[2]);
152 Downsample1x3(rows_in, len / 4, row_out);
155 void Downsample1x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
156 float* row_out) {
157 const size_t N = Lanes(d);
158 const auto mul = Set(d, 0.25f);
159 float* row0 = rows_in[0];
160 float* row1 = rows_in[1];
161 float* row2 = rows_in[2];
162 float* row3 = rows_in[3];
163 for (size_t x = 0; x < len; x += N) {
164 const auto in0 = Load(d, row0 + x);
165 const auto in1 = Load(d, row1 + x);
166 const auto in2 = Load(d, row2 + x);
167 const auto in3 = Load(d, row3 + x);
168 Store(Mul(mul, Add(Add(in0, in1), Add(in2, in3))), d, row_out + x);
172 void Downsample2x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
173 float* row_out) {
174 DownsampleRow2x1(rows_in[0], len, rows_in[0]);
175 DownsampleRow2x1(rows_in[1], len, rows_in[1]);
176 DownsampleRow2x1(rows_in[2], len, rows_in[2]);
177 DownsampleRow2x1(rows_in[3], len, rows_in[3]);
178 Downsample1x4(rows_in, len / 2, row_out);
181 void Downsample3x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
182 float* row_out) {
183 DownsampleRow3x1(rows_in[0], len, rows_in[0]);
184 DownsampleRow3x1(rows_in[1], len, rows_in[1]);
185 DownsampleRow3x1(rows_in[2], len, rows_in[2]);
186 DownsampleRow3x1(rows_in[3], len, rows_in[3]);
187 Downsample1x4(rows_in, len / 3, row_out);
190 void Downsample4x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
191 float* row_out) {
192 DownsampleRow4x1(rows_in[0], len, rows_in[0]);
193 DownsampleRow4x1(rows_in[1], len, rows_in[1]);
194 DownsampleRow4x1(rows_in[2], len, rows_in[2]);
195 DownsampleRow4x1(rows_in[3], len, rows_in[3]);
196 Downsample1x4(rows_in, len / 4, row_out);
199 // NOLINTNEXTLINE(google-readability-namespace-comments)
200 } // namespace HWY_NAMESPACE
201 } // namespace jpegli
202 HWY_AFTER_NAMESPACE();
204 #if HWY_ONCE
205 namespace jpegli {
207 HWY_EXPORT(Downsample1x2);
208 HWY_EXPORT(Downsample1x3);
209 HWY_EXPORT(Downsample1x4);
210 HWY_EXPORT(Downsample2x1);
211 HWY_EXPORT(Downsample2x2);
212 HWY_EXPORT(Downsample2x3);
213 HWY_EXPORT(Downsample2x4);
214 HWY_EXPORT(Downsample3x1);
215 HWY_EXPORT(Downsample3x2);
216 HWY_EXPORT(Downsample3x3);
217 HWY_EXPORT(Downsample3x4);
218 HWY_EXPORT(Downsample4x1);
219 HWY_EXPORT(Downsample4x2);
220 HWY_EXPORT(Downsample4x3);
221 HWY_EXPORT(Downsample4x4);
223 void NullDownsample(float* rows_in[MAX_SAMP_FACTOR], size_t len,
224 float* row_out) {}
226 void ChooseDownsampleMethods(j_compress_ptr cinfo) {
227 jpeg_comp_master* m = cinfo->master;
228 for (int c = 0; c < cinfo->num_components; c++) {
229 m->downsample_method[c] = nullptr;
230 jpeg_component_info* comp = &cinfo->comp_info[c];
231 const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor;
232 const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor;
233 if (v_factor == 1) {
234 if (h_factor == 1) {
235 m->downsample_method[c] = NullDownsample;
236 } else if (h_factor == 2) {
237 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x1);
238 } else if (h_factor == 3) {
239 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x1);
240 } else if (h_factor == 4) {
241 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x1);
243 } else if (v_factor == 2) {
244 if (h_factor == 1) {
245 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2);
246 } else if (h_factor == 2) {
247 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2);
248 } else if (h_factor == 3) {
249 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2);
250 } else if (h_factor == 4) {
251 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2);
253 } else if (v_factor == 3) {
254 if (h_factor == 1) {
255 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2);
256 } else if (h_factor == 2) {
257 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2);
258 } else if (h_factor == 3) {
259 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2);
260 } else if (h_factor == 4) {
261 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2);
263 } else if (v_factor == 4) {
264 if (h_factor == 1) {
265 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x4);
266 } else if (h_factor == 2) {
267 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x4);
268 } else if (h_factor == 3) {
269 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x4);
270 } else if (h_factor == 4) {
271 m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x4);
274 if (m->downsample_method[c] == nullptr) {
275 JPEGLI_ERROR("Unsupported downsampling ratio %dx%d", h_factor, v_factor);
280 void DownsampleInputBuffer(j_compress_ptr cinfo) {
281 if (cinfo->max_h_samp_factor == 1 && cinfo->max_v_samp_factor == 1) {
282 return;
284 jpeg_comp_master* m = cinfo->master;
285 const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
286 const size_t y0 = m->next_iMCU_row * iMCU_height;
287 const size_t y1 = y0 + iMCU_height;
288 const size_t xsize_padded = m->xsize_blocks * DCTSIZE;
289 for (int c = 0; c < cinfo->num_components; c++) {
290 jpeg_component_info* comp = &cinfo->comp_info[c];
291 const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor;
292 const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor;
293 if (h_factor == 1 && v_factor == 1) {
294 continue;
296 auto& input = *m->smooth_input[c];
297 auto& output = *m->raw_data[c];
298 const size_t yout0 = y0 / v_factor;
299 float* rows_in[MAX_SAMP_FACTOR];
300 for (size_t yin = y0, yout = yout0; yin < y1; yin += v_factor, ++yout) {
301 for (int iy = 0; iy < v_factor; ++iy) {
302 rows_in[iy] = input.Row(yin + iy);
304 float* row_out = output.Row(yout);
305 (*m->downsample_method[c])(rows_in, xsize_padded, row_out);
310 void ApplyInputSmoothing(j_compress_ptr cinfo) {
311 if (!cinfo->smoothing_factor) {
312 return;
314 jpeg_comp_master* m = cinfo->master;
315 const float kW1 = cinfo->smoothing_factor / 1024.0;
316 const float kW0 = 1.0f - 8.0f * kW1;
317 const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
318 const ssize_t y0 = m->next_iMCU_row * iMCU_height;
319 const ssize_t y1 = y0 + iMCU_height;
320 const ssize_t xsize_padded = m->xsize_blocks * DCTSIZE;
321 for (int c = 0; c < cinfo->num_components; c++) {
322 auto& input = m->input_buffer[c];
323 auto& output = *m->smooth_input[c];
324 if (m->next_iMCU_row == 0) {
325 input.CopyRow(-1, 0, 1);
327 if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
328 size_t last_row = m->ysize_blocks * DCTSIZE - 1;
329 input.CopyRow(last_row + 1, last_row, 1);
331 // TODO(szabadka) SIMDify this.
332 for (ssize_t y = y0; y < y1; ++y) {
333 const float* row_t = input.Row(y - 1);
334 const float* row_m = input.Row(y);
335 const float* row_b = input.Row(y + 1);
336 float* row_out = output.Row(y);
337 for (ssize_t x = 0; x < xsize_padded; ++x) {
338 float val_tl = row_t[x - 1];
339 float val_tm = row_t[x];
340 float val_tr = row_t[x + 1];
341 float val_ml = row_m[x - 1];
342 float val_mm = row_m[x];
343 float val_mr = row_m[x + 1];
344 float val_bl = row_b[x - 1];
345 float val_bm = row_b[x];
346 float val_br = row_b[x + 1];
347 float val1 = (val_tl + val_tm + val_tr + val_ml + val_mr + val_bl +
348 val_bm + val_br);
349 row_out[x] = val_mm * kW0 + val1 * kW1;
355 } // namespace jpegli
356 #endif // HWY_ONCE