third_party/jpeg-xl/lib/jpegli/input.cc

   1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
   2 //
   3 // Use of this source code is governed by a BSD-style
   4 // license that can be found in the LICENSE file.
   5
   6 #include "lib/jpegli/input.h"
   7
   8 #undef HWY_TARGET_INCLUDE
   9 #define HWY_TARGET_INCLUDE "lib/jpegli/input.cc"
  10 #include <hwy/foreach_target.h>
  11 #include <hwy/highway.h>
  12
  13 #include "lib/jpegli/encode_internal.h"
  14 #include "lib/jpegli/error.h"
  15 #include "lib/jxl/base/byte_order.h"
  16 #include "lib/jxl/base/compiler_specific.h"
  17
  18 HWY_BEFORE_NAMESPACE();
  19 namespace jpegli {
  20 namespace HWY_NAMESPACE {
  21
  22 using hwy::HWY_NAMESPACE::Mul;
  23 using hwy::HWY_NAMESPACE::Rebind;
  24 using hwy::HWY_NAMESPACE::Vec;
  25
  26 using D = HWY_FULL(float);
  27 using DU = HWY_FULL(uint32_t);
  28 using DU8 = Rebind<uint8_t, D>;
  29 using DU16 = Rebind<uint16_t, D>;
  30
  31 constexpr D d;
  32 constexpr DU du;
  33 constexpr DU8 du8;
  34 constexpr DU16 du16;
  35
  36 static constexpr double kMul16 = 1.0 / 257.0;
  37 static constexpr double kMulFloat = 255.0;
  38
  39 template <size_t C>
  40 void ReadUint8Row(const uint8_t* row_in, size_t x0, size_t len,
  41                   float* row_out[kMaxComponents]) {
  42   for (size_t x = x0; x < len; ++x) {
  43     for (size_t c = 0; c < C; ++c) {
  44       row_out[c][x] = row_in[C * x + c];
  45     }
  46   }
  47 }
  48
  49 template <size_t C, bool swap_endianness = false>
  50 void ReadUint16Row(const uint8_t* row_in, size_t x0, size_t len,
  51                    float* row_out[kMaxComponents]) {
  52   const uint16_t* row16 = reinterpret_cast<const uint16_t*>(row_in);
  53   for (size_t x = x0; x < len; ++x) {
  54     for (size_t c = 0; c < C; ++c) {
  55       uint16_t val = row16[C * x + c];
  56       if (swap_endianness) val = JXL_BSWAP16(val);
  57       row_out[c][x] = val * kMul16;
  58     }
  59   }
  60 }
  61
  62 template <size_t C, bool swap_endianness = false>
  63 void ReadFloatRow(const uint8_t* row_in, size_t x0, size_t len,
  64                   float* row_out[kMaxComponents]) {
  65   const float* rowf = reinterpret_cast<const float*>(row_in);
  66   for (size_t x = x0; x < len; ++x) {
  67     for (size_t c = 0; c < C; ++c) {
  68       float val = rowf[C * x + c];
  69       if (swap_endianness) val = BSwapFloat(val);
  70       row_out[c][x] = val * kMulFloat;
  71     }
  72   }
  73 }
  74
  75 void ReadUint8RowSingle(const uint8_t* row_in, size_t len,
  76                         float* row_out[kMaxComponents]) {
  77   const size_t N = Lanes(d);
  78   const size_t simd_len = len & (~(N - 1));
  79   float* JXL_RESTRICT const row0 = row_out[0];
  80   for (size_t x = 0; x < simd_len; x += N) {
  81     Store(ConvertTo(d, PromoteTo(du, LoadU(du8, row_in + x))), d, row0 + x);
  82   }
  83   ReadUint8Row<1>(row_in, simd_len, len, row_out);
  84 }
  85
  86 void ReadUint8RowInterleaved2(const uint8_t* row_in, size_t len,
  87                               float* row_out[kMaxComponents]) {
  88   const size_t N = Lanes(d);
  89   const size_t simd_len = len & (~(N - 1));
  90   float* JXL_RESTRICT const row0 = row_out[0];
  91   float* JXL_RESTRICT const row1 = row_out[1];
  92   Vec<DU8> out0, out1;
  93   for (size_t x = 0; x < simd_len; x += N) {
  94     LoadInterleaved2(du8, row_in + 2 * x, out0, out1);
  95     Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
  96     Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
  97   }
  98   ReadUint8Row<2>(row_in, simd_len, len, row_out);
  99 }
 100
 101 void ReadUint8RowInterleaved3(const uint8_t* row_in, size_t len,
 102                               float* row_out[kMaxComponents]) {
 103   const size_t N = Lanes(d);
 104   const size_t simd_len = len & (~(N - 1));
 105   float* JXL_RESTRICT const row0 = row_out[0];
 106   float* JXL_RESTRICT const row1 = row_out[1];
 107   float* JXL_RESTRICT const row2 = row_out[2];
 108   Vec<DU8> out0, out1, out2;
 109   for (size_t x = 0; x < simd_len; x += N) {
 110     LoadInterleaved3(du8, row_in + 3 * x, out0, out1, out2);
 111     Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
 112     Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
 113     Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x);
 114   }
 115   ReadUint8Row<3>(row_in, simd_len, len, row_out);
 116 }
 117
 118 void ReadUint8RowInterleaved4(const uint8_t* row_in, size_t len,
 119                               float* row_out[kMaxComponents]) {
 120   const size_t N = Lanes(d);
 121   const size_t simd_len = len & (~(N - 1));
 122   float* JXL_RESTRICT const row0 = row_out[0];
 123   float* JXL_RESTRICT const row1 = row_out[1];
 124   float* JXL_RESTRICT const row2 = row_out[2];
 125   float* JXL_RESTRICT const row3 = row_out[3];
 126   Vec<DU8> out0, out1, out2, out3;
 127   for (size_t x = 0; x < simd_len; x += N) {
 128     LoadInterleaved4(du8, row_in + 4 * x, out0, out1, out2, out3);
 129     Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
 130     Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
 131     Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x);
 132     Store(ConvertTo(d, PromoteTo(du, out3)), d, row3 + x);
 133   }
 134   ReadUint8Row<4>(row_in, simd_len, len, row_out);
 135 }
 136
 137 void ReadUint16RowSingle(const uint8_t* row_in, size_t len,
 138                          float* row_out[kMaxComponents]) {
 139   const size_t N = Lanes(d);
 140   const size_t simd_len = len & (~(N - 1));
 141   const auto mul = Set(d, kMul16);
 142   const uint16_t* JXL_RESTRICT const row =
 143       reinterpret_cast<const uint16_t*>(row_in);
 144   float* JXL_RESTRICT const row0 = row_out[0];
 145   for (size_t x = 0; x < simd_len; x += N) {
 146     Store(Mul(mul, ConvertTo(d, PromoteTo(du, LoadU(du16, row + x)))), d,
 147           row0 + x);
 148   }
 149   ReadUint16Row<1>(row_in, simd_len, len, row_out);
 150 }
 151
 152 void ReadUint16RowInterleaved2(const uint8_t* row_in, size_t len,
 153                                float* row_out[kMaxComponents]) {
 154   const size_t N = Lanes(d);
 155   const size_t simd_len = len & (~(N - 1));
 156   const auto mul = Set(d, kMul16);
 157   const uint16_t* JXL_RESTRICT const row =
 158       reinterpret_cast<const uint16_t*>(row_in);
 159   float* JXL_RESTRICT const row0 = row_out[0];
 160   float* JXL_RESTRICT const row1 = row_out[1];
 161   Vec<DU16> out0, out1;
 162   for (size_t x = 0; x < simd_len; x += N) {
 163     LoadInterleaved2(du16, row + 2 * x, out0, out1);
 164     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
 165     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
 166   }
 167   ReadUint16Row<2>(row_in, simd_len, len, row_out);
 168 }
 169
 170 void ReadUint16RowInterleaved3(const uint8_t* row_in, size_t len,
 171                                float* row_out[kMaxComponents]) {
 172   const size_t N = Lanes(d);
 173   const size_t simd_len = len & (~(N - 1));
 174   const auto mul = Set(d, kMul16);
 175   const uint16_t* JXL_RESTRICT const row =
 176       reinterpret_cast<const uint16_t*>(row_in);
 177   float* JXL_RESTRICT const row0 = row_out[0];
 178   float* JXL_RESTRICT const row1 = row_out[1];
 179   float* JXL_RESTRICT const row2 = row_out[2];
 180   Vec<DU16> out0, out1, out2;
 181   for (size_t x = 0; x < simd_len; x += N) {
 182     LoadInterleaved3(du16, row + 3 * x, out0, out1, out2);
 183     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
 184     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
 185     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x);
 186   }
 187   ReadUint16Row<3>(row_in, simd_len, len, row_out);
 188 }
 189
 190 void ReadUint16RowInterleaved4(const uint8_t* row_in, size_t len,
 191                                float* row_out[kMaxComponents]) {
 192   const size_t N = Lanes(d);
 193   const size_t simd_len = len & (~(N - 1));
 194   const auto mul = Set(d, kMul16);
 195   const uint16_t* JXL_RESTRICT const row =
 196       reinterpret_cast<const uint16_t*>(row_in);
 197   float* JXL_RESTRICT const row0 = row_out[0];
 198   float* JXL_RESTRICT const row1 = row_out[1];
 199   float* JXL_RESTRICT const row2 = row_out[2];
 200   float* JXL_RESTRICT const row3 = row_out[3];
 201   Vec<DU16> out0, out1, out2, out3;
 202   for (size_t x = 0; x < simd_len; x += N) {
 203     LoadInterleaved4(du16, row + 4 * x, out0, out1, out2, out3);
 204     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
 205     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
 206     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x);
 207     Store(Mul(mul, ConvertTo(d, PromoteTo(du, out3))), d, row3 + x);
 208   }
 209   ReadUint16Row<4>(row_in, simd_len, len, row_out);
 210 }
 211
 212 void ReadUint16RowSingleSwap(const uint8_t* row_in, size_t len,
 213                              float* row_out[kMaxComponents]) {
 214   ReadUint16Row<1, true>(row_in, 0, len, row_out);
 215 }
 216
 217 void ReadUint16RowInterleaved2Swap(const uint8_t* row_in, size_t len,
 218                                    float* row_out[kMaxComponents]) {
 219   ReadUint16Row<2, true>(row_in, 0, len, row_out);
 220 }
 221
 222 void ReadUint16RowInterleaved3Swap(const uint8_t* row_in, size_t len,
 223                                    float* row_out[kMaxComponents]) {
 224   ReadUint16Row<3, true>(row_in, 0, len, row_out);
 225 }
 226
 227 void ReadUint16RowInterleaved4Swap(const uint8_t* row_in, size_t len,
 228                                    float* row_out[kMaxComponents]) {
 229   ReadUint16Row<4, true>(row_in, 0, len, row_out);
 230 }
 231
 232 void ReadFloatRowSingle(const uint8_t* row_in, size_t len,
 233                         float* row_out[kMaxComponents]) {
 234   const size_t N = Lanes(d);
 235   const size_t simd_len = len & (~(N - 1));
 236   const auto mul = Set(d, kMulFloat);
 237   const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
 238   float* JXL_RESTRICT const row0 = row_out[0];
 239   for (size_t x = 0; x < simd_len; x += N) {
 240     Store(Mul(mul, LoadU(d, row + x)), d, row0 + x);
 241   }
 242   ReadFloatRow<1>(row_in, simd_len, len, row_out);
 243 }
 244
 245 void ReadFloatRowInterleaved2(const uint8_t* row_in, size_t len,
 246                               float* row_out[kMaxComponents]) {
 247   const size_t N = Lanes(d);
 248   const size_t simd_len = len & (~(N - 1));
 249   const auto mul = Set(d, kMulFloat);
 250   const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
 251   float* JXL_RESTRICT const row0 = row_out[0];
 252   float* JXL_RESTRICT const row1 = row_out[1];
 253   Vec<D> out0, out1;
 254   for (size_t x = 0; x < simd_len; x += N) {
 255     LoadInterleaved2(d, row + 2 * x, out0, out1);
 256     Store(Mul(mul, out0), d, row0 + x);
 257     Store(Mul(mul, out1), d, row1 + x);
 258   }
 259   ReadFloatRow<2>(row_in, simd_len, len, row_out);
 260 }
 261
 262 void ReadFloatRowInterleaved3(const uint8_t* row_in, size_t len,
 263                               float* row_out[kMaxComponents]) {
 264   const size_t N = Lanes(d);
 265   const size_t simd_len = len & (~(N - 1));
 266   const auto mul = Set(d, kMulFloat);
 267   const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
 268   float* JXL_RESTRICT const row0 = row_out[0];
 269   float* JXL_RESTRICT const row1 = row_out[1];
 270   float* JXL_RESTRICT const row2 = row_out[2];
 271   Vec<D> out0, out1, out2;
 272   for (size_t x = 0; x < simd_len; x += N) {
 273     LoadInterleaved3(d, row + 3 * x, out0, out1, out2);
 274     Store(Mul(mul, out0), d, row0 + x);
 275     Store(Mul(mul, out1), d, row1 + x);
 276     Store(Mul(mul, out2), d, row2 + x);
 277   }
 278   ReadFloatRow<3>(row_in, simd_len, len, row_out);
 279 }
 280
 281 void ReadFloatRowInterleaved4(const uint8_t* row_in, size_t len,
 282                               float* row_out[kMaxComponents]) {
 283   const size_t N = Lanes(d);
 284   const size_t simd_len = len & (~(N - 1));
 285   const auto mul = Set(d, kMulFloat);
 286   const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
 287   float* JXL_RESTRICT const row0 = row_out[0];
 288   float* JXL_RESTRICT const row1 = row_out[1];
 289   float* JXL_RESTRICT const row2 = row_out[2];
 290   float* JXL_RESTRICT const row3 = row_out[3];
 291   Vec<D> out0, out1, out2, out3;
 292   for (size_t x = 0; x < simd_len; x += N) {
 293     LoadInterleaved4(d, row + 4 * x, out0, out1, out2, out3);
 294     Store(Mul(mul, out0), d, row0 + x);
 295     Store(Mul(mul, out1), d, row1 + x);
 296     Store(Mul(mul, out2), d, row2 + x);
 297     Store(Mul(mul, out3), d, row3 + x);
 298   }
 299   ReadFloatRow<4>(row_in, simd_len, len, row_out);
 300 }
 301
 302 void ReadFloatRowSingleSwap(const uint8_t* row_in, size_t len,
 303                             float* row_out[kMaxComponents]) {
 304   ReadFloatRow<1, true>(row_in, 0, len, row_out);
 305 }
 306
 307 void ReadFloatRowInterleaved2Swap(const uint8_t* row_in, size_t len,
 308                                   float* row_out[kMaxComponents]) {
 309   ReadFloatRow<2, true>(row_in, 0, len, row_out);
 310 }
 311
 312 void ReadFloatRowInterleaved3Swap(const uint8_t* row_in, size_t len,
 313                                   float* row_out[kMaxComponents]) {
 314   ReadFloatRow<3, true>(row_in, 0, len, row_out);
 315 }
 316
 317 void ReadFloatRowInterleaved4Swap(const uint8_t* row_in, size_t len,
 318                                   float* row_out[kMaxComponents]) {
 319   ReadFloatRow<4, true>(row_in, 0, len, row_out);
 320 }
 321
 322 // NOLINTNEXTLINE(google-readability-namespace-comments)
 323 }  // namespace HWY_NAMESPACE
 324 }  // namespace jpegli
 325 HWY_AFTER_NAMESPACE();
 326
 327 #if HWY_ONCE
 328 namespace jpegli {
 329
 330 HWY_EXPORT(ReadUint8RowSingle);
 331 HWY_EXPORT(ReadUint8RowInterleaved2);
 332 HWY_EXPORT(ReadUint8RowInterleaved3);
 333 HWY_EXPORT(ReadUint8RowInterleaved4);
 334 HWY_EXPORT(ReadUint16RowSingle);
 335 HWY_EXPORT(ReadUint16RowInterleaved2);
 336 HWY_EXPORT(ReadUint16RowInterleaved3);
 337 HWY_EXPORT(ReadUint16RowInterleaved4);
 338 HWY_EXPORT(ReadUint16RowSingleSwap);
 339 HWY_EXPORT(ReadUint16RowInterleaved2Swap);
 340 HWY_EXPORT(ReadUint16RowInterleaved3Swap);
 341 HWY_EXPORT(ReadUint16RowInterleaved4Swap);
 342 HWY_EXPORT(ReadFloatRowSingle);
 343 HWY_EXPORT(ReadFloatRowInterleaved2);
 344 HWY_EXPORT(ReadFloatRowInterleaved3);
 345 HWY_EXPORT(ReadFloatRowInterleaved4);
 346 HWY_EXPORT(ReadFloatRowSingleSwap);
 347 HWY_EXPORT(ReadFloatRowInterleaved2Swap);
 348 HWY_EXPORT(ReadFloatRowInterleaved3Swap);
 349 HWY_EXPORT(ReadFloatRowInterleaved4Swap);
 350
 351 void ChooseInputMethod(j_compress_ptr cinfo) {
 352   jpeg_comp_master* m = cinfo->master;
 353   bool swap_endianness =
 354       (m->endianness == JPEGLI_LITTLE_ENDIAN && !IsLittleEndian()) ||
 355       (m->endianness == JPEGLI_BIG_ENDIAN && IsLittleEndian());
 356   m->input_method = nullptr;
 357   if (m->data_type == JPEGLI_TYPE_UINT8) {
 358     if (cinfo->raw_data_in || cinfo->input_components == 1) {
 359       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowSingle);
 360     } else if (cinfo->input_components == 2) {
 361       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved2);
 362     } else if (cinfo->input_components == 3) {
 363       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved3);
 364     } else if (cinfo->input_components == 4) {
 365       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved4);
 366     }
 367   } else if (m->data_type == JPEGLI_TYPE_UINT16 && !swap_endianness) {
 368     if (cinfo->raw_data_in || cinfo->input_components == 1) {
 369       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingle);
 370     } else if (cinfo->input_components == 2) {
 371       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2);
 372     } else if (cinfo->input_components == 3) {
 373       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3);
 374     } else if (cinfo->input_components == 4) {
 375       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4);
 376     }
 377   } else if (m->data_type == JPEGLI_TYPE_UINT16 && swap_endianness) {
 378     if (cinfo->raw_data_in || cinfo->input_components == 1) {
 379       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingleSwap);
 380     } else if (cinfo->input_components == 2) {
 381       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2Swap);
 382     } else if (cinfo->input_components == 3) {
 383       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3Swap);
 384     } else if (cinfo->input_components == 4) {
 385       m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4Swap);
 386     }
 387   } else if (m->data_type == JPEGLI_TYPE_FLOAT && !swap_endianness) {
 388     if (cinfo->raw_data_in || cinfo->input_components == 1) {
 389       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingle);
 390     } else if (cinfo->input_components == 2) {
 391       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2);
 392     } else if (cinfo->input_components == 3) {
 393       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3);
 394     } else if (cinfo->input_components == 4) {
 395       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4);
 396     }
 397   } else if (m->data_type == JPEGLI_TYPE_FLOAT && swap_endianness) {
 398     if (cinfo->raw_data_in || cinfo->input_components == 1) {
 399       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingleSwap);
 400     } else if (cinfo->input_components == 2) {
 401       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2Swap);
 402     } else if (cinfo->input_components == 3) {
 403       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3Swap);
 404     } else if (cinfo->input_components == 4) {
 405       m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4Swap);
 406     }
 407   }
 408   if (m->input_method == nullptr) {
 409     JPEGLI_ERROR("Could not find input method.");
 410   }
 411 }
 412
 413 }  // namespace jpegli
 414 #endif  // HWY_ONCE