1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
6 #include "lib/jpegli/input.h"
8 #undef HWY_TARGET_INCLUDE
9 #define HWY_TARGET_INCLUDE "lib/jpegli/input.cc"
10 #include <hwy/foreach_target.h>
11 #include <hwy/highway.h>
13 #include "lib/jpegli/encode_internal.h"
14 #include "lib/jpegli/error.h"
15 #include "lib/jxl/base/byte_order.h"
16 #include "lib/jxl/base/compiler_specific.h"
18 HWY_BEFORE_NAMESPACE();
20 namespace HWY_NAMESPACE
{
22 using hwy::HWY_NAMESPACE::Mul
;
23 using hwy::HWY_NAMESPACE::Rebind
;
24 using hwy::HWY_NAMESPACE::Vec
;
26 using D
= HWY_FULL(float);
27 using DU
= HWY_FULL(uint32_t);
28 using DU8
= Rebind
<uint8_t, D
>;
29 using DU16
= Rebind
<uint16_t, D
>;
36 static constexpr double kMul16
= 1.0 / 257.0;
37 static constexpr double kMulFloat
= 255.0;
40 void ReadUint8Row(const uint8_t* row_in
, size_t x0
, size_t len
,
41 float* row_out
[kMaxComponents
]) {
42 for (size_t x
= x0
; x
< len
; ++x
) {
43 for (size_t c
= 0; c
< C
; ++c
) {
44 row_out
[c
][x
] = row_in
[C
* x
+ c
];
49 template <size_t C
, bool swap_endianness
= false>
50 void ReadUint16Row(const uint8_t* row_in
, size_t x0
, size_t len
,
51 float* row_out
[kMaxComponents
]) {
52 const uint16_t* row16
= reinterpret_cast<const uint16_t*>(row_in
);
53 for (size_t x
= x0
; x
< len
; ++x
) {
54 for (size_t c
= 0; c
< C
; ++c
) {
55 uint16_t val
= row16
[C
* x
+ c
];
56 if (swap_endianness
) val
= JXL_BSWAP16(val
);
57 row_out
[c
][x
] = val
* kMul16
;
62 template <size_t C
, bool swap_endianness
= false>
63 void ReadFloatRow(const uint8_t* row_in
, size_t x0
, size_t len
,
64 float* row_out
[kMaxComponents
]) {
65 const float* rowf
= reinterpret_cast<const float*>(row_in
);
66 for (size_t x
= x0
; x
< len
; ++x
) {
67 for (size_t c
= 0; c
< C
; ++c
) {
68 float val
= rowf
[C
* x
+ c
];
69 if (swap_endianness
) val
= BSwapFloat(val
);
70 row_out
[c
][x
] = val
* kMulFloat
;
75 void ReadUint8RowSingle(const uint8_t* row_in
, size_t len
,
76 float* row_out
[kMaxComponents
]) {
77 const size_t N
= Lanes(d
);
78 const size_t simd_len
= len
& (~(N
- 1));
79 float* JXL_RESTRICT
const row0
= row_out
[0];
80 for (size_t x
= 0; x
< simd_len
; x
+= N
) {
81 Store(ConvertTo(d
, PromoteTo(du
, LoadU(du8
, row_in
+ x
))), d
, row0
+ x
);
83 ReadUint8Row
<1>(row_in
, simd_len
, len
, row_out
);
86 void ReadUint8RowInterleaved2(const uint8_t* row_in
, size_t len
,
87 float* row_out
[kMaxComponents
]) {
88 const size_t N
= Lanes(d
);
89 const size_t simd_len
= len
& (~(N
- 1));
90 float* JXL_RESTRICT
const row0
= row_out
[0];
91 float* JXL_RESTRICT
const row1
= row_out
[1];
93 for (size_t x
= 0; x
< simd_len
; x
+= N
) {
94 LoadInterleaved2(du8
, row_in
+ 2 * x
, out0
, out1
);
95 Store(ConvertTo(d
, PromoteTo(du
, out0
)), d
, row0
+ x
);
96 Store(ConvertTo(d
, PromoteTo(du
, out1
)), d
, row1
+ x
);
98 ReadUint8Row
<2>(row_in
, simd_len
, len
, row_out
);
101 void ReadUint8RowInterleaved3(const uint8_t* row_in
, size_t len
,
102 float* row_out
[kMaxComponents
]) {
103 const size_t N
= Lanes(d
);
104 const size_t simd_len
= len
& (~(N
- 1));
105 float* JXL_RESTRICT
const row0
= row_out
[0];
106 float* JXL_RESTRICT
const row1
= row_out
[1];
107 float* JXL_RESTRICT
const row2
= row_out
[2];
108 Vec
<DU8
> out0
, out1
, out2
;
109 for (size_t x
= 0; x
< simd_len
; x
+= N
) {
110 LoadInterleaved3(du8
, row_in
+ 3 * x
, out0
, out1
, out2
);
111 Store(ConvertTo(d
, PromoteTo(du
, out0
)), d
, row0
+ x
);
112 Store(ConvertTo(d
, PromoteTo(du
, out1
)), d
, row1
+ x
);
113 Store(ConvertTo(d
, PromoteTo(du
, out2
)), d
, row2
+ x
);
115 ReadUint8Row
<3>(row_in
, simd_len
, len
, row_out
);
118 void ReadUint8RowInterleaved4(const uint8_t* row_in
, size_t len
,
119 float* row_out
[kMaxComponents
]) {
120 const size_t N
= Lanes(d
);
121 const size_t simd_len
= len
& (~(N
- 1));
122 float* JXL_RESTRICT
const row0
= row_out
[0];
123 float* JXL_RESTRICT
const row1
= row_out
[1];
124 float* JXL_RESTRICT
const row2
= row_out
[2];
125 float* JXL_RESTRICT
const row3
= row_out
[3];
126 Vec
<DU8
> out0
, out1
, out2
, out3
;
127 for (size_t x
= 0; x
< simd_len
; x
+= N
) {
128 LoadInterleaved4(du8
, row_in
+ 4 * x
, out0
, out1
, out2
, out3
);
129 Store(ConvertTo(d
, PromoteTo(du
, out0
)), d
, row0
+ x
);
130 Store(ConvertTo(d
, PromoteTo(du
, out1
)), d
, row1
+ x
);
131 Store(ConvertTo(d
, PromoteTo(du
, out2
)), d
, row2
+ x
);
132 Store(ConvertTo(d
, PromoteTo(du
, out3
)), d
, row3
+ x
);
134 ReadUint8Row
<4>(row_in
, simd_len
, len
, row_out
);
137 void ReadUint16RowSingle(const uint8_t* row_in
, size_t len
,
138 float* row_out
[kMaxComponents
]) {
139 const size_t N
= Lanes(d
);
140 const size_t simd_len
= len
& (~(N
- 1));
141 const auto mul
= Set(d
, kMul16
);
142 const uint16_t* JXL_RESTRICT
const row
=
143 reinterpret_cast<const uint16_t*>(row_in
);
144 float* JXL_RESTRICT
const row0
= row_out
[0];
145 for (size_t x
= 0; x
< simd_len
; x
+= N
) {
146 Store(Mul(mul
, ConvertTo(d
, PromoteTo(du
, LoadU(du16
, row
+ x
)))), d
,
149 ReadUint16Row
<1>(row_in
, simd_len
, len
, row_out
);
152 void ReadUint16RowInterleaved2(const uint8_t* row_in
, size_t len
,
153 float* row_out
[kMaxComponents
]) {
154 const size_t N
= Lanes(d
);
155 const size_t simd_len
= len
& (~(N
- 1));
156 const auto mul
= Set(d
, kMul16
);
157 const uint16_t* JXL_RESTRICT
const row
=
158 reinterpret_cast<const uint16_t*>(row_in
);
159 float* JXL_RESTRICT
const row0
= row_out
[0];
160 float* JXL_RESTRICT
const row1
= row_out
[1];
161 Vec
<DU16
> out0
, out1
;
162 for (size_t x
= 0; x
< simd_len
; x
+= N
) {
163 LoadInterleaved2(du16
, row
+ 2 * x
, out0
, out1
);
164 Store(Mul(mul
, ConvertTo(d
, PromoteTo(du
, out0
))), d
, row0
+ x
);
165 Store(Mul(mul
, ConvertTo(d
, PromoteTo(du
, out1
))), d
, row1
+ x
);
167 ReadUint16Row
<2>(row_in
, simd_len
, len
, row_out
);
170 void ReadUint16RowInterleaved3(const uint8_t* row_in
, size_t len
,
171 float* row_out
[kMaxComponents
]) {
172 const size_t N
= Lanes(d
);
173 const size_t simd_len
= len
& (~(N
- 1));
174 const auto mul
= Set(d
, kMul16
);
175 const uint16_t* JXL_RESTRICT
const row
=
176 reinterpret_cast<const uint16_t*>(row_in
);
177 float* JXL_RESTRICT
const row0
= row_out
[0];
178 float* JXL_RESTRICT
const row1
= row_out
[1];
179 float* JXL_RESTRICT
const row2
= row_out
[2];
180 Vec
<DU16
> out0
, out1
, out2
;
181 for (size_t x
= 0; x
< simd_len
; x
+= N
) {
182 LoadInterleaved3(du16
, row
+ 3 * x
, out0
, out1
, out2
);
183 Store(Mul(mul
, ConvertTo(d
, PromoteTo(du
, out0
))), d
, row0
+ x
);
184 Store(Mul(mul
, ConvertTo(d
, PromoteTo(du
, out1
))), d
, row1
+ x
);
185 Store(Mul(mul
, ConvertTo(d
, PromoteTo(du
, out2
))), d
, row2
+ x
);
187 ReadUint16Row
<3>(row_in
, simd_len
, len
, row_out
);
190 void ReadUint16RowInterleaved4(const uint8_t* row_in
, size_t len
,
191 float* row_out
[kMaxComponents
]) {
192 const size_t N
= Lanes(d
);
193 const size_t simd_len
= len
& (~(N
- 1));
194 const auto mul
= Set(d
, kMul16
);
195 const uint16_t* JXL_RESTRICT
const row
=
196 reinterpret_cast<const uint16_t*>(row_in
);
197 float* JXL_RESTRICT
const row0
= row_out
[0];
198 float* JXL_RESTRICT
const row1
= row_out
[1];
199 float* JXL_RESTRICT
const row2
= row_out
[2];
200 float* JXL_RESTRICT
const row3
= row_out
[3];
201 Vec
<DU16
> out0
, out1
, out2
, out3
;
202 for (size_t x
= 0; x
< simd_len
; x
+= N
) {
203 LoadInterleaved4(du16
, row
+ 4 * x
, out0
, out1
, out2
, out3
);
204 Store(Mul(mul
, ConvertTo(d
, PromoteTo(du
, out0
))), d
, row0
+ x
);
205 Store(Mul(mul
, ConvertTo(d
, PromoteTo(du
, out1
))), d
, row1
+ x
);
206 Store(Mul(mul
, ConvertTo(d
, PromoteTo(du
, out2
))), d
, row2
+ x
);
207 Store(Mul(mul
, ConvertTo(d
, PromoteTo(du
, out3
))), d
, row3
+ x
);
209 ReadUint16Row
<4>(row_in
, simd_len
, len
, row_out
);
212 void ReadUint16RowSingleSwap(const uint8_t* row_in
, size_t len
,
213 float* row_out
[kMaxComponents
]) {
214 ReadUint16Row
<1, true>(row_in
, 0, len
, row_out
);
217 void ReadUint16RowInterleaved2Swap(const uint8_t* row_in
, size_t len
,
218 float* row_out
[kMaxComponents
]) {
219 ReadUint16Row
<2, true>(row_in
, 0, len
, row_out
);
222 void ReadUint16RowInterleaved3Swap(const uint8_t* row_in
, size_t len
,
223 float* row_out
[kMaxComponents
]) {
224 ReadUint16Row
<3, true>(row_in
, 0, len
, row_out
);
227 void ReadUint16RowInterleaved4Swap(const uint8_t* row_in
, size_t len
,
228 float* row_out
[kMaxComponents
]) {
229 ReadUint16Row
<4, true>(row_in
, 0, len
, row_out
);
232 void ReadFloatRowSingle(const uint8_t* row_in
, size_t len
,
233 float* row_out
[kMaxComponents
]) {
234 const size_t N
= Lanes(d
);
235 const size_t simd_len
= len
& (~(N
- 1));
236 const auto mul
= Set(d
, kMulFloat
);
237 const float* JXL_RESTRICT
const row
= reinterpret_cast<const float*>(row_in
);
238 float* JXL_RESTRICT
const row0
= row_out
[0];
239 for (size_t x
= 0; x
< simd_len
; x
+= N
) {
240 Store(Mul(mul
, LoadU(d
, row
+ x
)), d
, row0
+ x
);
242 ReadFloatRow
<1>(row_in
, simd_len
, len
, row_out
);
245 void ReadFloatRowInterleaved2(const uint8_t* row_in
, size_t len
,
246 float* row_out
[kMaxComponents
]) {
247 const size_t N
= Lanes(d
);
248 const size_t simd_len
= len
& (~(N
- 1));
249 const auto mul
= Set(d
, kMulFloat
);
250 const float* JXL_RESTRICT
const row
= reinterpret_cast<const float*>(row_in
);
251 float* JXL_RESTRICT
const row0
= row_out
[0];
252 float* JXL_RESTRICT
const row1
= row_out
[1];
254 for (size_t x
= 0; x
< simd_len
; x
+= N
) {
255 LoadInterleaved2(d
, row
+ 2 * x
, out0
, out1
);
256 Store(Mul(mul
, out0
), d
, row0
+ x
);
257 Store(Mul(mul
, out1
), d
, row1
+ x
);
259 ReadFloatRow
<2>(row_in
, simd_len
, len
, row_out
);
262 void ReadFloatRowInterleaved3(const uint8_t* row_in
, size_t len
,
263 float* row_out
[kMaxComponents
]) {
264 const size_t N
= Lanes(d
);
265 const size_t simd_len
= len
& (~(N
- 1));
266 const auto mul
= Set(d
, kMulFloat
);
267 const float* JXL_RESTRICT
const row
= reinterpret_cast<const float*>(row_in
);
268 float* JXL_RESTRICT
const row0
= row_out
[0];
269 float* JXL_RESTRICT
const row1
= row_out
[1];
270 float* JXL_RESTRICT
const row2
= row_out
[2];
271 Vec
<D
> out0
, out1
, out2
;
272 for (size_t x
= 0; x
< simd_len
; x
+= N
) {
273 LoadInterleaved3(d
, row
+ 3 * x
, out0
, out1
, out2
);
274 Store(Mul(mul
, out0
), d
, row0
+ x
);
275 Store(Mul(mul
, out1
), d
, row1
+ x
);
276 Store(Mul(mul
, out2
), d
, row2
+ x
);
278 ReadFloatRow
<3>(row_in
, simd_len
, len
, row_out
);
281 void ReadFloatRowInterleaved4(const uint8_t* row_in
, size_t len
,
282 float* row_out
[kMaxComponents
]) {
283 const size_t N
= Lanes(d
);
284 const size_t simd_len
= len
& (~(N
- 1));
285 const auto mul
= Set(d
, kMulFloat
);
286 const float* JXL_RESTRICT
const row
= reinterpret_cast<const float*>(row_in
);
287 float* JXL_RESTRICT
const row0
= row_out
[0];
288 float* JXL_RESTRICT
const row1
= row_out
[1];
289 float* JXL_RESTRICT
const row2
= row_out
[2];
290 float* JXL_RESTRICT
const row3
= row_out
[3];
291 Vec
<D
> out0
, out1
, out2
, out3
;
292 for (size_t x
= 0; x
< simd_len
; x
+= N
) {
293 LoadInterleaved4(d
, row
+ 4 * x
, out0
, out1
, out2
, out3
);
294 Store(Mul(mul
, out0
), d
, row0
+ x
);
295 Store(Mul(mul
, out1
), d
, row1
+ x
);
296 Store(Mul(mul
, out2
), d
, row2
+ x
);
297 Store(Mul(mul
, out3
), d
, row3
+ x
);
299 ReadFloatRow
<4>(row_in
, simd_len
, len
, row_out
);
302 void ReadFloatRowSingleSwap(const uint8_t* row_in
, size_t len
,
303 float* row_out
[kMaxComponents
]) {
304 ReadFloatRow
<1, true>(row_in
, 0, len
, row_out
);
307 void ReadFloatRowInterleaved2Swap(const uint8_t* row_in
, size_t len
,
308 float* row_out
[kMaxComponents
]) {
309 ReadFloatRow
<2, true>(row_in
, 0, len
, row_out
);
312 void ReadFloatRowInterleaved3Swap(const uint8_t* row_in
, size_t len
,
313 float* row_out
[kMaxComponents
]) {
314 ReadFloatRow
<3, true>(row_in
, 0, len
, row_out
);
317 void ReadFloatRowInterleaved4Swap(const uint8_t* row_in
, size_t len
,
318 float* row_out
[kMaxComponents
]) {
319 ReadFloatRow
<4, true>(row_in
, 0, len
, row_out
);
322 // NOLINTNEXTLINE(google-readability-namespace-comments)
323 } // namespace HWY_NAMESPACE
324 } // namespace jpegli
325 HWY_AFTER_NAMESPACE();
330 HWY_EXPORT(ReadUint8RowSingle
);
331 HWY_EXPORT(ReadUint8RowInterleaved2
);
332 HWY_EXPORT(ReadUint8RowInterleaved3
);
333 HWY_EXPORT(ReadUint8RowInterleaved4
);
334 HWY_EXPORT(ReadUint16RowSingle
);
335 HWY_EXPORT(ReadUint16RowInterleaved2
);
336 HWY_EXPORT(ReadUint16RowInterleaved3
);
337 HWY_EXPORT(ReadUint16RowInterleaved4
);
338 HWY_EXPORT(ReadUint16RowSingleSwap
);
339 HWY_EXPORT(ReadUint16RowInterleaved2Swap
);
340 HWY_EXPORT(ReadUint16RowInterleaved3Swap
);
341 HWY_EXPORT(ReadUint16RowInterleaved4Swap
);
342 HWY_EXPORT(ReadFloatRowSingle
);
343 HWY_EXPORT(ReadFloatRowInterleaved2
);
344 HWY_EXPORT(ReadFloatRowInterleaved3
);
345 HWY_EXPORT(ReadFloatRowInterleaved4
);
346 HWY_EXPORT(ReadFloatRowSingleSwap
);
347 HWY_EXPORT(ReadFloatRowInterleaved2Swap
);
348 HWY_EXPORT(ReadFloatRowInterleaved3Swap
);
349 HWY_EXPORT(ReadFloatRowInterleaved4Swap
);
351 void ChooseInputMethod(j_compress_ptr cinfo
) {
352 jpeg_comp_master
* m
= cinfo
->master
;
353 bool swap_endianness
=
354 (m
->endianness
== JPEGLI_LITTLE_ENDIAN
&& !IsLittleEndian()) ||
355 (m
->endianness
== JPEGLI_BIG_ENDIAN
&& IsLittleEndian());
356 m
->input_method
= nullptr;
357 if (m
->data_type
== JPEGLI_TYPE_UINT8
) {
358 if (cinfo
->raw_data_in
|| cinfo
->input_components
== 1) {
359 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadUint8RowSingle
);
360 } else if (cinfo
->input_components
== 2) {
361 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved2
);
362 } else if (cinfo
->input_components
== 3) {
363 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved3
);
364 } else if (cinfo
->input_components
== 4) {
365 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved4
);
367 } else if (m
->data_type
== JPEGLI_TYPE_UINT16
&& !swap_endianness
) {
368 if (cinfo
->raw_data_in
|| cinfo
->input_components
== 1) {
369 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadUint16RowSingle
);
370 } else if (cinfo
->input_components
== 2) {
371 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2
);
372 } else if (cinfo
->input_components
== 3) {
373 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3
);
374 } else if (cinfo
->input_components
== 4) {
375 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4
);
377 } else if (m
->data_type
== JPEGLI_TYPE_UINT16
&& swap_endianness
) {
378 if (cinfo
->raw_data_in
|| cinfo
->input_components
== 1) {
379 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadUint16RowSingleSwap
);
380 } else if (cinfo
->input_components
== 2) {
381 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2Swap
);
382 } else if (cinfo
->input_components
== 3) {
383 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3Swap
);
384 } else if (cinfo
->input_components
== 4) {
385 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4Swap
);
387 } else if (m
->data_type
== JPEGLI_TYPE_FLOAT
&& !swap_endianness
) {
388 if (cinfo
->raw_data_in
|| cinfo
->input_components
== 1) {
389 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadFloatRowSingle
);
390 } else if (cinfo
->input_components
== 2) {
391 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2
);
392 } else if (cinfo
->input_components
== 3) {
393 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3
);
394 } else if (cinfo
->input_components
== 4) {
395 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4
);
397 } else if (m
->data_type
== JPEGLI_TYPE_FLOAT
&& swap_endianness
) {
398 if (cinfo
->raw_data_in
|| cinfo
->input_components
== 1) {
399 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadFloatRowSingleSwap
);
400 } else if (cinfo
->input_components
== 2) {
401 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2Swap
);
402 } else if (cinfo
->input_components
== 3) {
403 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3Swap
);
404 } else if (cinfo
->input_components
== 4) {
405 m
->input_method
= HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4Swap
);
408 if (m
->input_method
== nullptr) {
409 JPEGLI_ERROR("Could not find input method.");
413 } // namespace jpegli