// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#if defined(LIB_JPEGLI_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef LIB_JPEGLI_DCT_INL_H_
#undef LIB_JPEGLI_DCT_INL_H_
#else
#define LIB_JPEGLI_DCT_INL_H_
#endif

#include <cmath>  // std::abs, std::round (assumed; may be transitive upstream)

#include "lib/jpegli/transpose-inl.h"
#include "lib/jxl/base/compiler_specific.h"

HWY_BEFORE_NAMESPACE();
namespace jpegli {
namespace HWY_NAMESPACE {
namespace {

// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::Abs;
using hwy::HWY_NAMESPACE::Add;
using hwy::HWY_NAMESPACE::DemoteTo;
using hwy::HWY_NAMESPACE::Ge;
using hwy::HWY_NAMESPACE::IfThenElseZero;
using hwy::HWY_NAMESPACE::Mul;
using hwy::HWY_NAMESPACE::MulAdd;
using hwy::HWY_NAMESPACE::Rebind;
using hwy::HWY_NAMESPACE::Round;
using hwy::HWY_NAMESPACE::Sub;
using hwy::HWY_NAMESPACE::Vec;

using D = HWY_FULL(float);
using DI = HWY_FULL(int32_t);

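// The helpers below build an 8-point DCT out of recursive halving steps.
// Throughout, one "element" is a vector of (up to) 8 floats, so a length-N
// 1-D DCT acts on an N x 8 scratch array, transforming eight image columns
// at once.
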
template <size_t N>
void AddReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2,
                float* JXL_RESTRICT aout) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < N; i++) {
    auto in1 = Load(d8, ain1 + i * 8);
    auto in2 = Load(d8, ain2 + (N - i - 1) * 8);
    Store(Add(in1, in2), d8, aout + i * 8);
  }
}

template <size_t N>
void SubReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2,
                float* JXL_RESTRICT aout) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < N; i++) {
    auto in1 = Load(d8, ain1 + i * 8);
    auto in2 = Load(d8, ain2 + (N - i - 1) * 8);
    Store(Sub(in1, in2), d8, aout + i * 8);
  }
}

template <size_t N>
void B(float* JXL_RESTRICT coeff) {
  HWY_CAPPED(float, 8) d8;
  constexpr float kSqrt2 = 1.41421356237f;
  auto sqrt2 = Set(d8, kSqrt2);
  auto in1 = Load(d8, coeff);
  auto in2 = Load(d8, coeff + 8);
  Store(MulAdd(in1, sqrt2, in2), d8, coeff);
  for (size_t i = 1; i + 1 < N; i++) {
    auto in1 = Load(d8, coeff + i * 8);
    auto in2 = Load(d8, coeff + (i + 1) * 8);
    Store(Add(in1, in2), d8, coeff + i * 8);
  }
}

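// Per column, B above computes coeff[0] = sqrt(2) * coeff[0] + coeff[1] and
// coeff[i] += coeff[i + 1] for 0 < i < N - 1; the last element is unchanged.
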
// Ideally optimized away by compiler (except the multiply).
template <size_t N>
void InverseEvenOdd(const float* JXL_RESTRICT ain, float* JXL_RESTRICT aout) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < N / 2; i++) {
    auto in1 = Load(d8, ain + i * 8);
    Store(in1, d8, aout + 2 * i * 8);
  }
  for (size_t i = N / 2; i < N; i++) {
    auto in1 = Load(d8, ain + i * 8);
    Store(in1, d8, aout + (2 * (i - N / 2) + 1) * 8);
  }
}

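// Per column this is aout[2 * i] = ain[i] and aout[2 * i + 1] = ain[N / 2 + i],
// i.e. it re-interleaves the even/odd halves produced by the split above.
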
// Constants for DCT implementation. Generated by the following snippet:
//   for i in range(N // 2):
//     print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
template <size_t N>
struct WcMultipliers;

template <>
struct WcMultipliers<4> {
  static constexpr float kMultipliers[] = {
      0.541196100146197,
      1.3065629648763764,
  };
};

template <>
struct WcMultipliers<8> {
  static constexpr float kMultipliers[] = {
      0.5097955791041592,
      0.6013448869350453,
      0.8999762231364156,
      2.5629154477415055,
  };
};

// Out-of-line definitions for the constexpr arrays (required pre-C++17).
constexpr float WcMultipliers<4>::kMultipliers[];
constexpr float WcMultipliers<8>::kMultipliers[];

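// Sanity check of the snippet above for N = 4:
//   1 / (2 * cos(0.5 * pi / 4)) = 1 / (2 * cos(pi / 8))     ~= 0.5412
//   1 / (2 * cos(1.5 * pi / 4)) = 1 / (2 * cos(3 * pi / 8)) ~= 1.3066
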
// Invoked on full vector.
template <size_t N>
void Multiply(float* JXL_RESTRICT coeff) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < N / 2; i++) {
    auto in1 = Load(d8, coeff + (N / 2 + i) * 8);
    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
    Store(Mul(in1, mul), d8, coeff + (N / 2 + i) * 8);
  }
}

void LoadFromBlock(const float* JXL_RESTRICT pixels, size_t pixels_stride,
                   size_t off, float* JXL_RESTRICT coeff) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < 8; i++) {
    Store(LoadU(d8, pixels + i * pixels_stride + off), d8, coeff + i * 8);
  }
}

void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, float* output,
                          size_t off) {
  HWY_CAPPED(float, 8) d8;
  auto mul = Set(d8, 1.0f / 8);
  for (size_t i = 0; i < 8; i++) {
    StoreU(Mul(mul, Load(d8, coeff + i * 8)), d8, output + i * 8 + off);
  }
}

template <size_t N>
struct DCT1DImpl;

template <>
struct DCT1DImpl<1> {
  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
};

template <>
struct DCT1DImpl<2> {
  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
    HWY_CAPPED(float, 8) d8;
    auto in1 = Load(d8, mem);
    auto in2 = Load(d8, mem + 8);
    Store(Add(in1, in2), d8, mem);
    Store(Sub(in1, in2), d8, mem + 8);
  }
};

template <size_t N>
struct DCT1DImpl {
  void operator()(float* JXL_RESTRICT mem) {
    HWY_ALIGN float tmp[N * 8];
    AddReverse<N / 2>(mem, mem + N * 4, tmp);
    DCT1DImpl<N / 2>()(tmp);
    SubReverse<N / 2>(mem, mem + N * 4, tmp + N * 4);
    Multiply<N>(tmp);
    DCT1DImpl<N / 2>()(tmp + N * 4);
    B<N / 2>(tmp + N * 4);
    InverseEvenOdd<N>(tmp, mem);
  }
};

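// This follows the classic even/odd DCT-II factorization: the sum of
// mirrored inputs (AddReverse) yields the even-index outputs via a half-size
// DCT, while the mirrored difference (SubReverse), scaled by the
// WcMultipliers cosines, transformed by a half-size DCT, and recombined by
// B, yields the odd-index outputs; InverseEvenOdd re-interleaves the halves.
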
void DCT1D(const float* JXL_RESTRICT pixels, size_t pixels_stride,
           float* JXL_RESTRICT output) {
  HWY_CAPPED(float, 8) d8;
  HWY_ALIGN float tmp[64];
  for (size_t i = 0; i < 8; i += Lanes(d8)) {
    // TODO(veluca): consider removing the temporary memory here (as is done in
    // IDCT), if it turns out that some compilers don't optimize away the loads
    // and this is performance-critical.
    LoadFromBlock(pixels, pixels_stride, i, tmp);
    DCT1DImpl<8>()(tmp);
    StoreToBlockAndScale(tmp, output, i);
  }
}

static JXL_INLINE JXL_MAYBE_UNUSED void TransformFromPixels(
    const float* JXL_RESTRICT pixels, size_t pixels_stride,
    float* JXL_RESTRICT coefficients, float* JXL_RESTRICT scratch_space) {
  DCT1D(pixels, pixels_stride, scratch_space);
  Transpose8x8Block(scratch_space, coefficients);
  DCT1D(coefficients, 8, scratch_space);
  Transpose8x8Block(scratch_space, coefficients);
}

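// Standard separable 2-D DCT: transform along one axis, transpose, transform
// again, transpose back. Each DCT1D pass scales by 1/8 (in
// StoreToBlockAndScale), so the final coefficients carry a 1/64 factor.
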
static JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
                                                            int16_t* out) {
  Rebind<int16_t, DI> di16;
  Store(DemoteTo(di16, ival), di16, out);
}

static JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
                                                            int32_t* out) {
  DI di;
  Store(ival, di, out);
}

template <typename T>
void QuantizeBlock(const float* dct, const float* qmc, float aq_strength,
                   const float* zero_bias_offset, const float* zero_bias_mul,
                   T* block) {
  D d;
  DI di;
  const auto aq_mul = Set(d, aq_strength);
  for (size_t k = 0; k < DCTSIZE2; k += Lanes(d)) {
    const auto val = Load(d, dct + k);
    const auto q = Load(d, qmc + k);
    const auto qval = Mul(val, q);
    const auto zb_offset = Load(d, zero_bias_offset + k);
    const auto zb_mul = Load(d, zero_bias_mul + k);
    const auto threshold = Add(zb_offset, Mul(zb_mul, aq_mul));
    const auto nzero_mask = Ge(Abs(qval), threshold);
    const auto ival = ConvertTo(di, IfThenElseZero(nzero_mask, Round(qval)));
    StoreQuantizedValue(ival, block + k);
  }
}

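// Scalar equivalent, per coefficient k:
//   qval = dct[k] * qmc[k];
//   block[k] = std::abs(qval) >= zero_bias_offset[k] + aq_strength * zero_bias_mul[k]
//                  ? std::round(qval) : 0;
// i.e. values inside the (adaptive) zero-bias dead zone are quantized to zero.
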
template <typename T>
void ComputeCoefficientBlock(const float* JXL_RESTRICT pixels, size_t stride,
                             const float* JXL_RESTRICT qmc,
                             int16_t last_dc_coeff, float aq_strength,
                             const float* zero_bias_offset,
                             const float* zero_bias_mul,
                             float* JXL_RESTRICT tmp, T* block) {
  float* JXL_RESTRICT dct = tmp;
  float* JXL_RESTRICT scratch_space = tmp + DCTSIZE2;
  TransformFromPixels(pixels, stride, dct, scratch_space);
  QuantizeBlock(dct, qmc, aq_strength, zero_bias_offset, zero_bias_mul, block);
  // Center DC values around zero.
  static constexpr float kDCBias = 128.0f;
  const float dc = (dct[0] - kDCBias) * qmc[0];
  float dc_threshold = zero_bias_offset[0] + aq_strength * zero_bias_mul[0];
  if (std::abs(dc - last_dc_coeff) < dc_threshold) {
    block[0] = last_dc_coeff;
  } else {
    block[0] = std::round(dc);
  }
}

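// Note on the DC path above: JPEG codes each block's DC coefficient as a
// difference from the previous block's DC, so snapping dc to last_dc_coeff
// (when within the zero-bias threshold) turns that difference into an exact
// zero, at the cost of a small rounding error in this block.
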
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace
}  // namespace HWY_NAMESPACE
}  // namespace jpegli
HWY_AFTER_NAMESPACE();
#endif  // LIB_JPEGLI_DCT_INL_H_