Backed out 3 changesets (bug 1790375) for causing wd failures on fetch_error.py....
[gecko.git] / third_party / jpeg-xl / lib / jpegli / input.cc
blob765bf98946ae929020f23f4d2999eeb6293fb176
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
6 #include "lib/jpegli/input.h"
8 #undef HWY_TARGET_INCLUDE
9 #define HWY_TARGET_INCLUDE "lib/jpegli/input.cc"
10 #include <hwy/foreach_target.h>
11 #include <hwy/highway.h>
13 #include "lib/jpegli/encode_internal.h"
14 #include "lib/jpegli/error.h"
15 #include "lib/jxl/base/byte_order.h"
16 #include "lib/jxl/base/compiler_specific.h"
18 HWY_BEFORE_NAMESPACE();
19 namespace jpegli {
20 namespace HWY_NAMESPACE {
// Highway names used unqualified by the routines in this file.
22 using hwy::HWY_NAMESPACE::Mul;
23 using hwy::HWY_NAMESPACE::Rebind;
24 using hwy::HWY_NAMESPACE::Vec;
// Vector descriptor types: full-width float and uint32_t vectors, plus
// uint8_t / uint16_t descriptors rebound to the same lane count as D.
26 using D = HWY_FULL(float);
27 using DU = HWY_FULL(uint32_t);
28 using DU8 = Rebind<uint8_t, D>;
29 using DU16 = Rebind<uint16_t, D>;
// Tag objects passed to Highway ops to select the vector type.
31 constexpr D d;
32 constexpr DU du;
33 constexpr DU8 du8;
34 constexpr DU16 du16;
// Scale applied to 16-bit samples; 65535 * (1 / 257) == 255, so the full
// uint16_t range maps onto [0, 255].
36 static constexpr double kMul16 = 1.0 / 257.0;
// Scale applied to float samples.
// NOTE(review): presumably float input is nominally in [0, 1] - confirm.
37 static constexpr double kMulFloat = 255.0;
39 template <size_t C>
40 void ReadUint8Row(const uint8_t* row_in, size_t x0, size_t len,
41 float* row_out[kMaxComponents]) {
42 for (size_t x = x0; x < len; ++x) {
43 for (size_t c = 0; c < C; ++c) {
44 row_out[c][x] = row_in[C * x + c];
49 template <size_t C, bool swap_endianness = false>
50 void ReadUint16Row(const uint8_t* row_in, size_t x0, size_t len,
51 float* row_out[kMaxComponents]) {
52 const uint16_t* row16 = reinterpret_cast<const uint16_t*>(row_in);
53 for (size_t x = x0; x < len; ++x) {
54 for (size_t c = 0; c < C; ++c) {
55 uint16_t val = row16[C * x + c];
56 if (swap_endianness) val = JXL_BSWAP16(val);
57 row_out[c][x] = val * kMul16;
62 template <size_t C, bool swap_endianness = false>
63 void ReadFloatRow(const uint8_t* row_in, size_t x0, size_t len,
64 float* row_out[kMaxComponents]) {
65 const float* rowf = reinterpret_cast<const float*>(row_in);
66 for (size_t x = x0; x < len; ++x) {
67 for (size_t c = 0; c < C; ++c) {
68 float val = rowf[C * x + c];
69 if (swap_endianness) val = BSwapFloat(val);
70 row_out[c][x] = val * kMulFloat;
75 void ReadUint8RowSingle(const uint8_t* row_in, size_t len,
76 float* row_out[kMaxComponents]) {
77 const size_t N = Lanes(d);
78 const size_t simd_len = len & (~(N - 1));
79 float* JXL_RESTRICT const row0 = row_out[0];
80 for (size_t x = 0; x < simd_len; x += N) {
81 Store(ConvertTo(d, PromoteTo(du, LoadU(du8, row_in + x))), d, row0 + x);
83 ReadUint8Row<1>(row_in, simd_len, len, row_out);
86 void ReadUint8RowInterleaved2(const uint8_t* row_in, size_t len,
87 float* row_out[kMaxComponents]) {
88 const size_t N = Lanes(d);
89 const size_t simd_len = len & (~(N - 1));
90 float* JXL_RESTRICT const row0 = row_out[0];
91 float* JXL_RESTRICT const row1 = row_out[1];
92 Vec<DU8> out0, out1;
93 for (size_t x = 0; x < simd_len; x += N) {
94 LoadInterleaved2(du8, row_in + 2 * x, out0, out1);
95 Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
96 Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
98 ReadUint8Row<2>(row_in, simd_len, len, row_out);
101 void ReadUint8RowInterleaved3(const uint8_t* row_in, size_t len,
102 float* row_out[kMaxComponents]) {
103 const size_t N = Lanes(d);
104 const size_t simd_len = len & (~(N - 1));
105 float* JXL_RESTRICT const row0 = row_out[0];
106 float* JXL_RESTRICT const row1 = row_out[1];
107 float* JXL_RESTRICT const row2 = row_out[2];
108 Vec<DU8> out0, out1, out2;
109 for (size_t x = 0; x < simd_len; x += N) {
110 LoadInterleaved3(du8, row_in + 3 * x, out0, out1, out2);
111 Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
112 Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
113 Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x);
115 ReadUint8Row<3>(row_in, simd_len, len, row_out);
118 void ReadUint8RowInterleaved4(const uint8_t* row_in, size_t len,
119 float* row_out[kMaxComponents]) {
120 const size_t N = Lanes(d);
121 const size_t simd_len = len & (~(N - 1));
122 float* JXL_RESTRICT const row0 = row_out[0];
123 float* JXL_RESTRICT const row1 = row_out[1];
124 float* JXL_RESTRICT const row2 = row_out[2];
125 float* JXL_RESTRICT const row3 = row_out[3];
126 Vec<DU8> out0, out1, out2, out3;
127 for (size_t x = 0; x < simd_len; x += N) {
128 LoadInterleaved4(du8, row_in + 4 * x, out0, out1, out2, out3);
129 Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
130 Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
131 Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x);
132 Store(ConvertTo(d, PromoteTo(du, out3)), d, row3 + x);
134 ReadUint8Row<4>(row_in, simd_len, len, row_out);
137 void ReadUint16RowSingle(const uint8_t* row_in, size_t len,
138 float* row_out[kMaxComponents]) {
139 const size_t N = Lanes(d);
140 const size_t simd_len = len & (~(N - 1));
141 const auto mul = Set(d, kMul16);
142 const uint16_t* JXL_RESTRICT const row =
143 reinterpret_cast<const uint16_t*>(row_in);
144 float* JXL_RESTRICT const row0 = row_out[0];
145 for (size_t x = 0; x < simd_len; x += N) {
146 Store(Mul(mul, ConvertTo(d, PromoteTo(du, LoadU(du16, row + x)))), d,
147 row0 + x);
149 ReadUint16Row<1>(row_in, simd_len, len, row_out);
152 void ReadUint16RowInterleaved2(const uint8_t* row_in, size_t len,
153 float* row_out[kMaxComponents]) {
154 const size_t N = Lanes(d);
155 const size_t simd_len = len & (~(N - 1));
156 const auto mul = Set(d, kMul16);
157 const uint16_t* JXL_RESTRICT const row =
158 reinterpret_cast<const uint16_t*>(row_in);
159 float* JXL_RESTRICT const row0 = row_out[0];
160 float* JXL_RESTRICT const row1 = row_out[1];
161 Vec<DU16> out0, out1;
162 for (size_t x = 0; x < simd_len; x += N) {
163 LoadInterleaved2(du16, row + 2 * x, out0, out1);
164 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
165 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
167 ReadUint16Row<2>(row_in, simd_len, len, row_out);
170 void ReadUint16RowInterleaved3(const uint8_t* row_in, size_t len,
171 float* row_out[kMaxComponents]) {
172 const size_t N = Lanes(d);
173 const size_t simd_len = len & (~(N - 1));
174 const auto mul = Set(d, kMul16);
175 const uint16_t* JXL_RESTRICT const row =
176 reinterpret_cast<const uint16_t*>(row_in);
177 float* JXL_RESTRICT const row0 = row_out[0];
178 float* JXL_RESTRICT const row1 = row_out[1];
179 float* JXL_RESTRICT const row2 = row_out[2];
180 Vec<DU16> out0, out1, out2;
181 for (size_t x = 0; x < simd_len; x += N) {
182 LoadInterleaved3(du16, row + 3 * x, out0, out1, out2);
183 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
184 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
185 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x);
187 ReadUint16Row<3>(row_in, simd_len, len, row_out);
190 void ReadUint16RowInterleaved4(const uint8_t* row_in, size_t len,
191 float* row_out[kMaxComponents]) {
192 const size_t N = Lanes(d);
193 const size_t simd_len = len & (~(N - 1));
194 const auto mul = Set(d, kMul16);
195 const uint16_t* JXL_RESTRICT const row =
196 reinterpret_cast<const uint16_t*>(row_in);
197 float* JXL_RESTRICT const row0 = row_out[0];
198 float* JXL_RESTRICT const row1 = row_out[1];
199 float* JXL_RESTRICT const row2 = row_out[2];
200 float* JXL_RESTRICT const row3 = row_out[3];
201 Vec<DU16> out0, out1, out2, out3;
202 for (size_t x = 0; x < simd_len; x += N) {
203 LoadInterleaved4(du16, row + 4 * x, out0, out1, out2, out3);
204 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
205 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
206 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x);
207 Store(Mul(mul, ConvertTo(d, PromoteTo(du, out3))), d, row3 + x);
209 ReadUint16Row<4>(row_in, simd_len, len, row_out);
212 void ReadUint16RowSingleSwap(const uint8_t* row_in, size_t len,
213 float* row_out[kMaxComponents]) {
214 ReadUint16Row<1, true>(row_in, 0, len, row_out);
217 void ReadUint16RowInterleaved2Swap(const uint8_t* row_in, size_t len,
218 float* row_out[kMaxComponents]) {
219 ReadUint16Row<2, true>(row_in, 0, len, row_out);
222 void ReadUint16RowInterleaved3Swap(const uint8_t* row_in, size_t len,
223 float* row_out[kMaxComponents]) {
224 ReadUint16Row<3, true>(row_in, 0, len, row_out);
227 void ReadUint16RowInterleaved4Swap(const uint8_t* row_in, size_t len,
228 float* row_out[kMaxComponents]) {
229 ReadUint16Row<4, true>(row_in, 0, len, row_out);
232 void ReadFloatRowSingle(const uint8_t* row_in, size_t len,
233 float* row_out[kMaxComponents]) {
234 const size_t N = Lanes(d);
235 const size_t simd_len = len & (~(N - 1));
236 const auto mul = Set(d, kMulFloat);
237 const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
238 float* JXL_RESTRICT const row0 = row_out[0];
239 for (size_t x = 0; x < simd_len; x += N) {
240 Store(Mul(mul, LoadU(d, row + x)), d, row0 + x);
242 ReadFloatRow<1>(row_in, simd_len, len, row_out);
245 void ReadFloatRowInterleaved2(const uint8_t* row_in, size_t len,
246 float* row_out[kMaxComponents]) {
247 const size_t N = Lanes(d);
248 const size_t simd_len = len & (~(N - 1));
249 const auto mul = Set(d, kMulFloat);
250 const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
251 float* JXL_RESTRICT const row0 = row_out[0];
252 float* JXL_RESTRICT const row1 = row_out[1];
253 Vec<D> out0, out1;
254 for (size_t x = 0; x < simd_len; x += N) {
255 LoadInterleaved2(d, row + 2 * x, out0, out1);
256 Store(Mul(mul, out0), d, row0 + x);
257 Store(Mul(mul, out1), d, row1 + x);
259 ReadFloatRow<2>(row_in, simd_len, len, row_out);
262 void ReadFloatRowInterleaved3(const uint8_t* row_in, size_t len,
263 float* row_out[kMaxComponents]) {
264 const size_t N = Lanes(d);
265 const size_t simd_len = len & (~(N - 1));
266 const auto mul = Set(d, kMulFloat);
267 const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
268 float* JXL_RESTRICT const row0 = row_out[0];
269 float* JXL_RESTRICT const row1 = row_out[1];
270 float* JXL_RESTRICT const row2 = row_out[2];
271 Vec<D> out0, out1, out2;
272 for (size_t x = 0; x < simd_len; x += N) {
273 LoadInterleaved3(d, row + 3 * x, out0, out1, out2);
274 Store(Mul(mul, out0), d, row0 + x);
275 Store(Mul(mul, out1), d, row1 + x);
276 Store(Mul(mul, out2), d, row2 + x);
278 ReadFloatRow<3>(row_in, simd_len, len, row_out);
281 void ReadFloatRowInterleaved4(const uint8_t* row_in, size_t len,
282 float* row_out[kMaxComponents]) {
283 const size_t N = Lanes(d);
284 const size_t simd_len = len & (~(N - 1));
285 const auto mul = Set(d, kMulFloat);
286 const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
287 float* JXL_RESTRICT const row0 = row_out[0];
288 float* JXL_RESTRICT const row1 = row_out[1];
289 float* JXL_RESTRICT const row2 = row_out[2];
290 float* JXL_RESTRICT const row3 = row_out[3];
291 Vec<D> out0, out1, out2, out3;
292 for (size_t x = 0; x < simd_len; x += N) {
293 LoadInterleaved4(d, row + 4 * x, out0, out1, out2, out3);
294 Store(Mul(mul, out0), d, row0 + x);
295 Store(Mul(mul, out1), d, row1 + x);
296 Store(Mul(mul, out2), d, row2 + x);
297 Store(Mul(mul, out3), d, row3 + x);
299 ReadFloatRow<4>(row_in, simd_len, len, row_out);
302 void ReadFloatRowSingleSwap(const uint8_t* row_in, size_t len,
303 float* row_out[kMaxComponents]) {
304 ReadFloatRow<1, true>(row_in, 0, len, row_out);
307 void ReadFloatRowInterleaved2Swap(const uint8_t* row_in, size_t len,
308 float* row_out[kMaxComponents]) {
309 ReadFloatRow<2, true>(row_in, 0, len, row_out);
312 void ReadFloatRowInterleaved3Swap(const uint8_t* row_in, size_t len,
313 float* row_out[kMaxComponents]) {
314 ReadFloatRow<3, true>(row_in, 0, len, row_out);
317 void ReadFloatRowInterleaved4Swap(const uint8_t* row_in, size_t len,
318 float* row_out[kMaxComponents]) {
319 ReadFloatRow<4, true>(row_in, 0, len, row_out);
322 // NOLINTNEXTLINE(google-readability-namespace-comments)
323 } // namespace HWY_NAMESPACE
324 } // namespace jpegli
325 HWY_AFTER_NAMESPACE();
327 #if HWY_ONCE
328 namespace jpegli {
// Export every row reader defined in the per-target namespace above. Each
// name listed here is referenced through HWY_DYNAMIC_DISPATCH in
// ChooseInputMethod below.
// 8-bit readers.
330 HWY_EXPORT(ReadUint8RowSingle);
331 HWY_EXPORT(ReadUint8RowInterleaved2);
332 HWY_EXPORT(ReadUint8RowInterleaved3);
333 HWY_EXPORT(ReadUint8RowInterleaved4);
// 16-bit readers, native byte order.
334 HWY_EXPORT(ReadUint16RowSingle);
335 HWY_EXPORT(ReadUint16RowInterleaved2);
336 HWY_EXPORT(ReadUint16RowInterleaved3);
337 HWY_EXPORT(ReadUint16RowInterleaved4);
// 16-bit readers, byte-swapped input.
338 HWY_EXPORT(ReadUint16RowSingleSwap);
339 HWY_EXPORT(ReadUint16RowInterleaved2Swap);
340 HWY_EXPORT(ReadUint16RowInterleaved3Swap);
341 HWY_EXPORT(ReadUint16RowInterleaved4Swap);
// Float readers, native byte order.
342 HWY_EXPORT(ReadFloatRowSingle);
343 HWY_EXPORT(ReadFloatRowInterleaved2);
344 HWY_EXPORT(ReadFloatRowInterleaved3);
345 HWY_EXPORT(ReadFloatRowInterleaved4);
// Float readers, byte-swapped input.
346 HWY_EXPORT(ReadFloatRowSingleSwap);
347 HWY_EXPORT(ReadFloatRowInterleaved2Swap);
348 HWY_EXPORT(ReadFloatRowInterleaved3Swap);
349 HWY_EXPORT(ReadFloatRowInterleaved4Swap);
351 void ChooseInputMethod(j_compress_ptr cinfo) {
352 jpeg_comp_master* m = cinfo->master;
353 bool swap_endianness =
354 (m->endianness == JPEGLI_LITTLE_ENDIAN && !IsLittleEndian()) ||
355 (m->endianness == JPEGLI_BIG_ENDIAN && IsLittleEndian());
356 m->input_method = nullptr;
357 if (m->data_type == JPEGLI_TYPE_UINT8) {
358 if (cinfo->raw_data_in || cinfo->input_components == 1) {
359 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowSingle);
360 } else if (cinfo->input_components == 2) {
361 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved2);
362 } else if (cinfo->input_components == 3) {
363 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved3);
364 } else if (cinfo->input_components == 4) {
365 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved4);
367 } else if (m->data_type == JPEGLI_TYPE_UINT16 && !swap_endianness) {
368 if (cinfo->raw_data_in || cinfo->input_components == 1) {
369 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingle);
370 } else if (cinfo->input_components == 2) {
371 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2);
372 } else if (cinfo->input_components == 3) {
373 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3);
374 } else if (cinfo->input_components == 4) {
375 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4);
377 } else if (m->data_type == JPEGLI_TYPE_UINT16 && swap_endianness) {
378 if (cinfo->raw_data_in || cinfo->input_components == 1) {
379 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingleSwap);
380 } else if (cinfo->input_components == 2) {
381 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2Swap);
382 } else if (cinfo->input_components == 3) {
383 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3Swap);
384 } else if (cinfo->input_components == 4) {
385 m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4Swap);
387 } else if (m->data_type == JPEGLI_TYPE_FLOAT && !swap_endianness) {
388 if (cinfo->raw_data_in || cinfo->input_components == 1) {
389 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingle);
390 } else if (cinfo->input_components == 2) {
391 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2);
392 } else if (cinfo->input_components == 3) {
393 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3);
394 } else if (cinfo->input_components == 4) {
395 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4);
397 } else if (m->data_type == JPEGLI_TYPE_FLOAT && swap_endianness) {
398 if (cinfo->raw_data_in || cinfo->input_components == 1) {
399 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingleSwap);
400 } else if (cinfo->input_components == 2) {
401 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2Swap);
402 } else if (cinfo->input_components == 3) {
403 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3Swap);
404 } else if (cinfo->input_components == 4) {
405 m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4Swap);
408 if (m->input_method == nullptr) {
409 JPEGLI_ERROR("Could not find input method.");
413 } // namespace jpegli
414 #endif // HWY_ONCE