// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
6 #if defined(LIB_JPEGLI_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
7 #ifdef LIB_JPEGLI_DCT_INL_H_
8 #undef LIB_JPEGLI_DCT_INL_H_
10 #define LIB_JPEGLI_DCT_INL_H_
13 #include "lib/jpegli/transpose-inl.h"
14 #include "lib/jxl/base/compiler_specific.h"
16 HWY_BEFORE_NAMESPACE();
18 namespace HWY_NAMESPACE
{
21 // These templates are not found via ADL.
22 using hwy::HWY_NAMESPACE::Abs
;
23 using hwy::HWY_NAMESPACE::Add
;
24 using hwy::HWY_NAMESPACE::DemoteTo
;
25 using hwy::HWY_NAMESPACE::Ge
;
26 using hwy::HWY_NAMESPACE::IfThenElseZero
;
27 using hwy::HWY_NAMESPACE::Mul
;
28 using hwy::HWY_NAMESPACE::MulAdd
;
29 using hwy::HWY_NAMESPACE::Rebind
;
30 using hwy::HWY_NAMESPACE::Round
;
31 using hwy::HWY_NAMESPACE::Sub
;
32 using hwy::HWY_NAMESPACE::Vec
;
34 using D
= HWY_FULL(float);
35 using DI
= HWY_FULL(int32_t);
38 void AddReverse(const float* JXL_RESTRICT a_in1
,
39 const float* JXL_RESTRICT a_in2
, float* JXL_RESTRICT a_out
) {
40 HWY_CAPPED(float, 8) d8
;
41 for (size_t i
= 0; i
< N
; i
++) {
42 auto in1
= Load(d8
, a_in1
+ i
* 8);
43 auto in2
= Load(d8
, a_in2
+ (N
- i
- 1) * 8);
44 Store(Add(in1
, in2
), d8
, a_out
+ i
* 8);
49 void SubReverse(const float* JXL_RESTRICT a_in1
,
50 const float* JXL_RESTRICT a_in2
, float* JXL_RESTRICT a_out
) {
51 HWY_CAPPED(float, 8) d8
;
52 for (size_t i
= 0; i
< N
; i
++) {
53 auto in1
= Load(d8
, a_in1
+ i
* 8);
54 auto in2
= Load(d8
, a_in2
+ (N
- i
- 1) * 8);
55 Store(Sub(in1
, in2
), d8
, a_out
+ i
* 8);
60 void B(float* JXL_RESTRICT coeff
) {
61 HWY_CAPPED(float, 8) d8
;
62 constexpr float kSqrt2
= 1.41421356237f
;
63 auto sqrt2
= Set(d8
, kSqrt2
);
64 auto in1
= Load(d8
, coeff
);
65 auto in2
= Load(d8
, coeff
+ 8);
66 Store(MulAdd(in1
, sqrt2
, in2
), d8
, coeff
);
67 for (size_t i
= 1; i
+ 1 < N
; i
++) {
68 auto in1
= Load(d8
, coeff
+ i
* 8);
69 auto in2
= Load(d8
, coeff
+ (i
+ 1) * 8);
70 Store(Add(in1
, in2
), d8
, coeff
+ i
* 8);
74 // Ideally optimized away by compiler (except the multiply).
76 void InverseEvenOdd(const float* JXL_RESTRICT a_in
, float* JXL_RESTRICT a_out
) {
77 HWY_CAPPED(float, 8) d8
;
78 for (size_t i
= 0; i
< N
/ 2; i
++) {
79 auto in1
= Load(d8
, a_in
+ i
* 8);
80 Store(in1
, d8
, a_out
+ 2 * i
* 8);
82 for (size_t i
= N
/ 2; i
< N
; i
++) {
83 auto in1
= Load(d8
, a_in
+ i
* 8);
84 Store(in1
, d8
, a_out
+ (2 * (i
- N
/ 2) + 1) * 8);
// Constants for DCT implementation. Generated by the following snippet:
// for i in range(N // 2):
//    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
template <size_t N>
struct WcMultipliers;

template <>
struct WcMultipliers<4> {
  static constexpr float kMultipliers[] = {
      0.541196100146197f,
      1.3065629648763764f,
  };
};

template <>
struct WcMultipliers<8> {
  static constexpr float kMultipliers[] = {
      0.5097955791041592f,
      0.6013448869350453f,
      0.8999762231364156f,
      2.5629154477415055f,
  };
};

// Before C++17 (no inline variables), ODR-used static constexpr members need
// an out-of-class definition.
#if JXL_CXX_LANG < JXL_CXX_17
constexpr float WcMultipliers<4>::kMultipliers[];
constexpr float WcMultipliers<8>::kMultipliers[];
#endif
117 // Invoked on full vector.
119 void Multiply(float* JXL_RESTRICT coeff
) {
120 HWY_CAPPED(float, 8) d8
;
121 for (size_t i
= 0; i
< N
/ 2; i
++) {
122 auto in1
= Load(d8
, coeff
+ (N
/ 2 + i
) * 8);
123 auto mul
= Set(d8
, WcMultipliers
<N
>::kMultipliers
[i
]);
124 Store(Mul(in1
, mul
), d8
, coeff
+ (N
/ 2 + i
) * 8);
128 void LoadFromBlock(const float* JXL_RESTRICT pixels
, size_t pixels_stride
,
129 size_t off
, float* JXL_RESTRICT coeff
) {
130 HWY_CAPPED(float, 8) d8
;
131 for (size_t i
= 0; i
< 8; i
++) {
132 Store(LoadU(d8
, pixels
+ i
* pixels_stride
+ off
), d8
, coeff
+ i
* 8);
136 void StoreToBlockAndScale(const float* JXL_RESTRICT coeff
, float* output
,
138 HWY_CAPPED(float, 8) d8
;
139 auto mul
= Set(d8
, 1.0f
/ 8);
140 for (size_t i
= 0; i
< 8; i
++) {
141 StoreU(Mul(mul
, Load(d8
, coeff
+ i
* 8)), d8
, output
+ i
* 8 + off
);
149 struct DCT1DImpl
<1> {
150 JXL_INLINE
void operator()(float* JXL_RESTRICT mem
) {}
154 struct DCT1DImpl
<2> {
155 JXL_INLINE
void operator()(float* JXL_RESTRICT mem
) {
156 HWY_CAPPED(float, 8) d8
;
157 auto in1
= Load(d8
, mem
);
158 auto in2
= Load(d8
, mem
+ 8);
159 Store(Add(in1
, in2
), d8
, mem
);
160 Store(Sub(in1
, in2
), d8
, mem
+ 8);
166 void operator()(float* JXL_RESTRICT mem
) {
167 HWY_ALIGN
float tmp
[N
* 8];
168 AddReverse
<N
/ 2>(mem
, mem
+ N
* 4, tmp
);
169 DCT1DImpl
<N
/ 2>()(tmp
);
170 SubReverse
<N
/ 2>(mem
, mem
+ N
* 4, tmp
+ N
* 4);
172 DCT1DImpl
<N
/ 2>()(tmp
+ N
* 4);
173 B
<N
/ 2>(tmp
+ N
* 4);
174 InverseEvenOdd
<N
>(tmp
, mem
);
178 void DCT1D(const float* JXL_RESTRICT pixels
, size_t pixels_stride
,
179 float* JXL_RESTRICT output
) {
180 HWY_CAPPED(float, 8) d8
;
181 HWY_ALIGN
float tmp
[64];
182 for (size_t i
= 0; i
< 8; i
+= Lanes(d8
)) {
183 // TODO(veluca): consider removing the temporary memory here (as is done in
184 // IDCT), if it turns out that some compilers don't optimize away the loads
185 // and this is performance-critical.
186 LoadFromBlock(pixels
, pixels_stride
, i
, tmp
);
188 StoreToBlockAndScale(tmp
, output
, i
);
192 JXL_INLINE JXL_MAYBE_UNUSED
void TransformFromPixels(
193 const float* JXL_RESTRICT pixels
, size_t pixels_stride
,
194 float* JXL_RESTRICT coefficients
, float* JXL_RESTRICT scratch_space
) {
195 DCT1D(pixels
, pixels_stride
, scratch_space
);
196 Transpose8x8Block(scratch_space
, coefficients
);
197 DCT1D(coefficients
, 8, scratch_space
);
198 Transpose8x8Block(scratch_space
, coefficients
);
201 JXL_INLINE JXL_MAYBE_UNUSED
void StoreQuantizedValue(const Vec
<DI
>& ival
,
203 Rebind
<int16_t, DI
> di16
;
204 Store(DemoteTo(di16
, ival
), di16
, out
);
207 JXL_INLINE JXL_MAYBE_UNUSED
void StoreQuantizedValue(const Vec
<DI
>& ival
,
210 Store(ival
, di
, out
);
213 template <typename T
>
214 void QuantizeBlock(const float* dct
, const float* qmc
, float aq_strength
,
215 const float* zero_bias_offset
, const float* zero_bias_mul
,
219 const auto aq_mul
= Set(d
, aq_strength
);
220 for (size_t k
= 0; k
< DCTSIZE2
; k
+= Lanes(d
)) {
221 const auto val
= Load(d
, dct
+ k
);
222 const auto q
= Load(d
, qmc
+ k
);
223 const auto qval
= Mul(val
, q
);
224 const auto zb_offset
= Load(d
, zero_bias_offset
+ k
);
225 const auto zb_mul
= Load(d
, zero_bias_mul
+ k
);
226 const auto threshold
= Add(zb_offset
, Mul(zb_mul
, aq_mul
));
227 const auto nzero_mask
= Ge(Abs(qval
), threshold
);
228 const auto ival
= ConvertTo(di
, IfThenElseZero(nzero_mask
, Round(qval
)));
229 StoreQuantizedValue(ival
, block
+ k
);
233 template <typename T
>
234 void ComputeCoefficientBlock(const float* JXL_RESTRICT pixels
, size_t stride
,
235 const float* JXL_RESTRICT qmc
,
236 int16_t last_dc_coeff
, float aq_strength
,
237 const float* zero_bias_offset
,
238 const float* zero_bias_mul
,
239 float* JXL_RESTRICT tmp
, T
* block
) {
240 float* JXL_RESTRICT dct
= tmp
;
241 float* JXL_RESTRICT scratch_space
= tmp
+ DCTSIZE2
;
242 TransformFromPixels(pixels
, stride
, dct
, scratch_space
);
243 QuantizeBlock(dct
, qmc
, aq_strength
, zero_bias_offset
, zero_bias_mul
, block
);
244 // Center DC values around zero.
245 static constexpr float kDCBias
= 128.0f
;
246 const float dc
= (dct
[0] - kDCBias
) * qmc
[0];
247 float dc_threshold
= zero_bias_offset
[0] + aq_strength
* zero_bias_mul
[0];
248 if (std::abs(dc
- last_dc_coeff
) < dc_threshold
) {
249 block
[0] = last_dc_coeff
;
251 block
[0] = std::round(dc
);
255 // NOLINTNEXTLINE(google-readability-namespace-comments)
257 } // namespace HWY_NAMESPACE
258 } // namespace jpegli
259 HWY_AFTER_NAMESPACE();
260 #endif // LIB_JPEGLI_DCT_INL_H_