// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/jpegli/adaptive_quantization.h"

#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/jpegli/adaptive_quantization.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>
#include "lib/jpegli/encode_internal.h"
#include "lib/jxl/base/compiler_specific.h"
#include "lib/jxl/base/status.h"

HWY_BEFORE_NAMESPACE();
namespace jpegli {
namespace HWY_NAMESPACE {
// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::AbsDiff;
using hwy::HWY_NAMESPACE::Add;
using hwy::HWY_NAMESPACE::And;
using hwy::HWY_NAMESPACE::Div;
using hwy::HWY_NAMESPACE::Floor;
using hwy::HWY_NAMESPACE::GetLane;
using hwy::HWY_NAMESPACE::Max;
using hwy::HWY_NAMESPACE::Min;
using hwy::HWY_NAMESPACE::Mul;
using hwy::HWY_NAMESPACE::MulAdd;
using hwy::HWY_NAMESPACE::NegMulAdd;
using hwy::HWY_NAMESPACE::Rebind;
using hwy::HWY_NAMESPACE::ShiftLeft;
using hwy::HWY_NAMESPACE::ShiftRight;
using hwy::HWY_NAMESPACE::Sqrt;
using hwy::HWY_NAMESPACE::Sub;
using hwy::HWY_NAMESPACE::ZeroIfNegative;
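// jpegli's input rows hold samples on a [0, 255] scale, while the tuning
// constants below come from jxl, where intensities live in [0, 1];
// kInputScaling bridges the two conventions wherever they meet.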
constexpr float kInputScaling = 1.0f / 255.0f;

// Replicates a scalar constant across the four lanes of a 128-bit block.
#define HWY_REP4(literal) literal, literal, literal, literal
// Primary template: default to actual division.
template <typename T, class V>
struct FastDivision {
  HWY_INLINE V operator()(const V n, const V d) const { return n / d; }
};
// Partial specialization for float vectors.
template <class V>
struct FastDivision<float, V> {
  // One Newton-Raphson iteration.
  static HWY_INLINE V ReciprocalNR(const V x) {
    const auto rcp = ApproximateReciprocal(x);
    const auto sum = Add(rcp, rcp);
    const auto x_rcp = Mul(x, rcp);
    return NegMulAdd(x_rcp, rcp, sum);
  }
  V operator()(const V n, const V d) const {
#if JXL_TRUE  // Faster on SKX
    return Div(n, d);
#else
    return n * ReciprocalNR(d);
#endif
  }
};
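// Note on ReciprocalNR above: starting from r0 = ApproximateReciprocal(x),
// one Newton-Raphson step computes r1 = 2 * r0 - x * r0 * r0, which roughly
// doubles the number of correct bits; NegMulAdd(x * r0, r0, r0 + r0) is
// exactly that expression.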
// Approximates smooth functions via rational polynomials (i.e. dividing two
// polynomials). Evaluates polynomials via Horner's scheme, which is faster
// than Clenshaw recurrence for Chebyshev polynomials. LoadDup128 allows us to
// specify constants (replicated 4x) independently of the lane count.
template <size_t NP, size_t NQ, class D, class V, typename T>
HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x,
                                                     const T (&p)[NP],
                                                     const T (&q)[NQ]) {
  constexpr size_t kDegP = NP / 4 - 1;
  constexpr size_t kDegQ = NQ / 4 - 1;
  auto yp = LoadDup128(d, &p[kDegP * 4]);
  auto yq = LoadDup128(d, &q[kDegQ * 4]);
  // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a
  // compiler warning that the index is out of bounds: the (kDegP >= n) guard
  // already ensures the access is in bounds, and when it is not taken the
  // access is optimized away. Similarly with q and kDegQ.
  if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4)));
  if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4)));
  if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4)));
  if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4)));
  if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4)));
  if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4)));
  if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4)));
  if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4)));
  if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4)));
  if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4)));
  if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4)));
  if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4)));
  if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4)));
  if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4)));
  return FastDivision<T, V>()(yp, yq);
}
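// Example: FastLog2f below passes 12-entry tables (NP == NQ == 4 * (2 + 1)),
// i.e. a degree-2 / degree-2 rational approximation, so only the first two
// MulAdd steps of each chain survive; the remaining branches are
// compile-time false and disappear.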
// Computes base-2 logarithm like std::log2. Undefined if negative / NaN.
template <class DF, class V>
V FastLog2f(const DF df, V x) {
  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
                                          HWY_REP4(1.4287160470083755E+00f),
                                          HWY_REP4(7.4245873327820566E-01f)};
  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
                                          HWY_REP4(1.0096718572241148E+00f),
                                          HWY_REP4(1.7409343003366853E-01f)};
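  // What follows splits x into 2^e * m with m in [2/3, 4/3): 0x3f2aaaab is
  // the bit pattern of approximately 2/3, so subtracting it and shifting
  // right by 23 yields the integer exponent e, and clearing those exponent
  // bits leaves the mantissa m. log2(x) is then e + log2(m), with log2(m)
  // evaluated by the rational polynomial above via log1p(m - 1) / log(2).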
  const Rebind<int32_t, DF> di;
  const auto x_bits = BitCast(di, x);
  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
  // Shifted exponent = log2; also used to clear mantissa.
  const auto exp_shifted = ShiftRight<23>(exp_bits);
  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
  const auto exp_val = ConvertTo(df, exp_shifted);
  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
             exp_val);
}
// max relative error ~3e-7
template <class DF, class V>
V FastPow2f(const DF df, V x) {
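  // 2^x = 2^floor(x) * 2^frac(x): the integer part is built directly in the
  // float exponent field (floor(x) + 127, shifted into bits 23..30), and the
  // fractional part is approximated by a rational polynomial whose value is
  // folded into the numerator before the final division.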
  const Rebind<int32_t, DF> di;
  auto floorx = Floor(x);
  auto exp =
      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
  auto frac = Sub(x, floorx);
  auto num = Add(frac, Set(df, 1.01749063e+01));
  num = MulAdd(num, frac, Set(df, 4.88687798e+01));
  num = MulAdd(num, frac, Set(df, 9.85506591e+01));
  num = Mul(num, exp);
  auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
  den = MulAdd(den, frac, Set(df, -1.94414990e+01));
  den = MulAdd(den, frac, Set(df, 9.85506633e+01));
  return Div(num, den);
}
inline float FastPow2f(float f) {
  HWY_CAPPED(float, 1) D;
  return GetLane(FastPow2f(D, Set(D, f)));
}
// The following functions modulate an exponent (out_val) and return the
// updated value. Their descriptor is limited to 8 lanes for 8x8 blocks.
template <class D, class V>
V ComputeMask(const D d, const V out_val) {
  const auto kBase = Set(d, -0.74174993f);
  const auto kMul4 = Set(d, 3.2353257320940401f);
  const auto kMul2 = Set(d, 12.906028311180409f);
  const auto kOffset2 = Set(d, 305.04035728311436f);
  const auto kMul3 = Set(d, 5.0220313103171232f);
  const auto kOffset3 = Set(d, 2.1925739705298404f);
  const auto kOffset4 = Mul(Set(d, 0.25f), kOffset3);
  const auto kMul0 = Set(d, 0.74760422233706747f);
  const auto k1 = Set(d, 1.0f);
  // Avoid division by zero.
  const auto v1 = Max(Mul(out_val, kMul0), Set(d, 1e-3f));
  const auto v2 = Div(k1, Add(v1, kOffset2));
  const auto v3 = Div(k1, MulAdd(v1, v1, kOffset3));
  const auto v4 = Div(k1, MulAdd(v1, v1, kOffset4));
  // A log or two here could make sense. In butteraugli we effectively have
  // log(log(x + C)) for this kind of use: a single log is used in saturating
  // visual masking, and since the modulation values here are exponential,
  // another log would counter that.
  return Add(kBase, MulAdd(kMul4, v4, MulAdd(kMul2, v2, Mul(kMul3, v3))));
}
// mul and mul2 represent a scaling difference between jxl and butteraugli.
const float kSGmul = 226.0480446705883f;
const float kSGmul2 = 1.0f / 73.377132366608819f;
const float kLog2 = 0.693147181f;
// Includes correction factor for std::log -> log2.
const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2;
const float kSGVOffset = 7.14672470003f;
template <bool invert, typename D, typename V>
V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
  // The opsin space in jxl is the cubic root of photons, i.e., v * v * v
  // is related to the number of photons.
  //
  // SimpleGamma(v * v * v) is the psychovisual space in butteraugli.
  // This ratio allows quantization to move from jxl's opsin space to
  // butteraugli's log-gamma space.
  static const float kEpsilon = 1e-2;
  static const float kNumOffset = kEpsilon / kInputScaling / kInputScaling;
  static const float kNumMul = kSGRetMul * 3 * kSGmul;
  static const float kVOffset = (kSGVOffset * kLog2 + kEpsilon) / kInputScaling;
  static const float kDenMul = kLog2 * kSGmul * kInputScaling * kInputScaling;
  v = ZeroIfNegative(v);
  const auto num_mul = Set(d, kNumMul);
  const auto num_offset = Set(d, kNumOffset);
  const auto den_offset = Set(d, kVOffset);
  const auto den_mul = Set(d, kDenMul);

  const auto v2 = Mul(v, v);

  const auto num = MulAdd(num_mul, v2, num_offset);
  const auto den = MulAdd(Mul(den_mul, v), v2, den_offset);
  return invert ? Div(num, den) : Div(den, num);
}
template <bool invert = false>
float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) {
  using DScalar = HWY_CAPPED(float, 1);
  auto vscalar = Load(DScalar(), &v);
  return GetLane(
      RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(DScalar(), vscalar));
}
// TODO(veluca): this function computes an approximation of the derivative of
// SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or
// exact derivatives. For reference, SimpleGamma was:
/*
template <typename D, typename V>
V SimpleGamma(const D d, V v) {
  // A simple HDR compatible gamma function.
  const auto mul = Set(d, kSGmul);
  const auto kRetMul = Set(d, kSGRetMul);
  const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f);
  const auto kVOffset = Set(d, kSGVOffset);

  v *= mul;

  // This should happen rarely, but may lead to a NaN, which is rather
  // undesirable. Since negative photons don't exist we solve the NaNs by
  // clamping here.
  // TODO(veluca): with FastLog2f, this no longer leads to NaNs.
  v = ZeroIfNegative(v);
  return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd;
}
*/
template <class D, class V>
V GammaModulation(const D d, const size_t x, const size_t y,
                  const RowBuffer<float>& input, const V out_val) {
  static const float kBias = 0.16f / kInputScaling;
  static const float kScale = kInputScaling / 64.0f;
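  // kScale folds two factors: 1/64 averages the per-pixel ratios over the 8x8
  // block, and the kInputScaling factor cancels the 1/kInputScaling that each
  // ratio carries from the [0, 255]-scaled input.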
  auto overall_ratio = Zero(d);
  const auto bias = Set(d, kBias);
  const auto scale = Set(d, kScale);
  const float* const JXL_RESTRICT block_start = input.Row(y) + x;
  for (size_t dy = 0; dy < 8; ++dy) {
    const float* const JXL_RESTRICT row_in = block_start + dy * input.stride();
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
      const auto iny = Add(Load(d, row_in + dx), bias);
      const auto ratio_g =
          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, iny);
      overall_ratio = Add(overall_ratio, ratio_g);
    }
  }
  overall_ratio = Mul(SumOfLanes(d, overall_ratio), scale);
  // ideally -1.0, but likely optimal correction adds some entropy, so slightly
  // less than that.
  // ln(2) constant folded in because we want std::log but have FastLog2f.
  const auto kGamma = Set(d, -0.15526878023684174f * 0.693147180559945f);
  return MulAdd(kGamma, FastLog2f(d, overall_ratio), out_val);
}
// Change precision in 8x8 blocks that have high frequency content.
template <class D, class V>
V HfModulation(const D d, const size_t x, const size_t y,
               const RowBuffer<float>& input, const V out_val) {
  // Zero out the invalid differences for the rightmost value per row.
  const Rebind<uint32_t, D> du;
  HWY_ALIGN constexpr uint32_t kMaskRight[8] = {~0u, ~0u, ~0u, ~0u,
                                                ~0u, ~0u, ~0u, 0};
  auto sum = Zero(d);  // sum of absolute differences with right and below
  static const float kSumCoeff = -2.0052193233688884f * kInputScaling / 112.0;
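  // 112 = 2 * 8 * 7: each 8x8 block contributes 56 valid horizontal
  // differences (the rightmost column is masked out) and 56 valid vertical
  // differences (the last row is compared against itself), so dividing by 112
  // turns the sum into a mean absolute difference.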
  auto sumcoeff = Set(d, kSumCoeff);
  const float* const JXL_RESTRICT block_start = input.Row(y) + x;
  for (size_t dy = 0; dy < 8; ++dy) {
    const float* JXL_RESTRICT row_in = block_start + dy * input.stride();
    const float* JXL_RESTRICT row_in_next =
        dy == 7 ? row_in : row_in + input.stride();

    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
      const auto p = Load(d, row_in + dx);
      const auto pr = LoadU(d, row_in + dx + 1);
      const auto mask = BitCast(d, Load(du, kMaskRight + dx));
      sum = Add(sum, And(mask, AbsDiff(p, pr)));
      const auto pd = Load(d, row_in_next + dx);
      sum = Add(sum, AbsDiff(p, pd));
    }
  }
  sum = SumOfLanes(d, sum);
  return MulAdd(sum, sumcoeff, out_val);
}
void PerBlockModulations(const float y_quant_01, const RowBuffer<float>& input,
                         const size_t yb0, const size_t yblen,
                         RowBuffer<float>* aq_map) {
  static const float kAcQuant = 0.841f;
  float base_level = 0.48f * kAcQuant;
  float kDampenRampStart = 9.0f;
  float kDampenRampEnd = 65.0f;
  float dampen = 1.0f;
  if (y_quant_01 >= kDampenRampStart) {
    dampen = 1.0f - ((y_quant_01 - kDampenRampStart) /
                     (kDampenRampEnd - kDampenRampStart));
    if (dampen < 0) {
      dampen = 0;
    }
  }
  const float mul = kAcQuant * dampen;
  const float add = (1.0f - dampen) * base_level;
  for (size_t iy = 0; iy < yblen; iy++) {
    const size_t yb = yb0 + iy;
    const size_t y = yb * 8;
    float* const JXL_RESTRICT row_out = aq_map->Row(yb);
    const HWY_CAPPED(float, 8) df;
    for (size_t ix = 0; ix < aq_map->xsize(); ix++) {
      size_t x = ix * 8;
      auto out_val = Set(df, row_out[ix]);
      out_val = ComputeMask(df, out_val);
      out_val = HfModulation(df, x, y, input, out_val);
      out_val = GammaModulation(df, x, y, input, out_val);
      // We want multiplicative quantization field, so everything
      // until this point has been modulating the exponent.
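      // 1.442695041 = 1 / ln(2): out_val holds a natural-log exponent, and
      // FastPow2f expects a base-2 one, so exp(out_val) == 2^(out_val / ln 2).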
      row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add;
    }
  }
}
template <typename D, typename V>
V MaskingSqrt(const D d, V v) {
  static const float kLogOffset = 28;
  static const float kMul = 211.50759899638012f;
  const auto mul_v = Set(d, kMul * 1e8);
  const auto offset_v = Set(d, kLogOffset);
  return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, Sqrt(mul_v), offset_v)));
}
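// Sorts the four per-lane values so that min0 <= min1 <= min2 <= min3
// afterwards (a small sorting network applied independently in each lane).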
template <typename V>
void Sort4(V& min0, V& min1, V& min2, V& min3) {
  const auto tmp0 = Min(min0, min1);
  const auto tmp1 = Max(min0, min1);
  const auto tmp2 = Min(min2, min3);
  const auto tmp3 = Max(min2, min3);
  const auto tmp4 = Max(tmp0, tmp2);
  const auto tmp5 = Min(tmp1, tmp3);
  min0 = Min(tmp0, tmp2);
  min1 = Min(tmp4, tmp5);
  min2 = Max(tmp4, tmp5);
  min3 = Max(tmp1, tmp3);
}
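// Inserts v into the sorted quadruple (min0 <= ... <= min3), keeping the four
// smallest values seen so far in sorted order and discarding the largest.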
template <typename V>
void UpdateMin4(const V v, V& min0, V& min1, V& min2, V& min3) {
  const auto tmp0 = Max(min0, v);
  const auto tmp1 = Max(min1, tmp0);
  const auto tmp2 = Max(min2, tmp1);
  min0 = Min(min0, v);
  min1 = Min(min1, tmp0);
  min2 = Min(min2, tmp1);
  min3 = Min(min3, tmp2);
}
// Computes a linear combination of the 4 lowest values of the 3x3 neighborhood
// of each pixel. Output is downsampled 2x.
void FuzzyErosion(const RowBuffer<float>& pre_erosion, const size_t yb0,
                  const size_t yblen, RowBuffer<float>* tmp,
                  RowBuffer<float>* aq_map) {
  int xsize_blocks = aq_map->xsize();
  int xsize = pre_erosion.xsize();
  // Vector descriptor (assumed capped at 8 lanes, as in the other row loops
  // in this file).
  const HWY_CAPPED(float, 8) d;
  const auto mul0 = Set(d, 0.125f);
  const auto mul1 = Set(d, 0.075f);
  const auto mul2 = Set(d, 0.06f);
  const auto mul3 = Set(d, 0.05f);
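  // The weights above (0.125 + 0.075 + 0.06 + 0.05 = 0.31) put most mass on
  // the smallest of the four kept values; the 2x2 aggregation at the end of
  // the loop then sums four such outputs per output block.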
  for (size_t iy = 0; iy < 2 * yblen; ++iy) {
    size_t y = 2 * yb0 + iy;
    const float* JXL_RESTRICT rowt = pre_erosion.Row(y - 1);
    const float* JXL_RESTRICT rowm = pre_erosion.Row(y);
    const float* JXL_RESTRICT rowb = pre_erosion.Row(y + 1);
    float* row_out = tmp->Row(y);
    for (int x = 0; x < xsize; x += Lanes(d)) {
      int xm1 = x - 1;
      int xp1 = x + 1;
      auto min0 = LoadU(d, rowm + x);
      auto min1 = LoadU(d, rowm + xm1);
      auto min2 = LoadU(d, rowm + xp1);
      auto min3 = LoadU(d, rowt + xm1);
      Sort4(min0, min1, min2, min3);
      UpdateMin4(LoadU(d, rowt + x), min0, min1, min2, min3);
      UpdateMin4(LoadU(d, rowt + xp1), min0, min1, min2, min3);
      UpdateMin4(LoadU(d, rowb + xm1), min0, min1, min2, min3);
      UpdateMin4(LoadU(d, rowb + x), min0, min1, min2, min3);
      UpdateMin4(LoadU(d, rowb + xp1), min0, min1, min2, min3);
      const auto v = Add(Add(Mul(mul0, min0), Mul(mul1, min1)),
                         Add(Mul(mul2, min2), Mul(mul3, min3)));
      Store(v, d, row_out + x);
    }
    if (iy % 2 == 1) {
      const float* JXL_RESTRICT row_out0 = tmp->Row(y - 1);
      float* JXL_RESTRICT aq_out = aq_map->Row(yb0 + iy / 2);
      for (int bx = 0, x = 0; bx < xsize_blocks; ++bx, x += 2) {
        aq_out[bx] =
            (row_out[x] + row_out[x + 1] + row_out0[x] + row_out0[x + 1]);
      }
    }
  }
}
void ComputePreErosion(const RowBuffer<float>& input, const size_t xsize,
                       const size_t y0, const size_t ylen, int border,
                       float* diff_buffer, RowBuffer<float>* pre_erosion) {
  const size_t xsize_out = xsize / 4;
  const size_t y0_out = y0 / 4;
  // The XYB gamma is 3.0 to be able to decode faster with two muls.
  // Butteraugli's gamma matches the gamma of the human eye, around 2.6.
  // We approximate the gamma difference by adding one cubic root into
  // the adaptive quantization. This gives us a total gamma of 2.6666
  // for quantization uses.
  static const float match_gamma_offset = 0.019 / kInputScaling;
  const HWY_CAPPED(float, 8) df;

  static const float limit = 0.2f;
  // Computes image (padded to multiple of 8x8) of local pixel differences.
  // Subsample both directions by 4.
  for (size_t iy = 0; iy < ylen; ++iy) {
    size_t y = y0 + iy;
    const float* row_in = input.Row(y);
    const float* row_in1 = input.Row(y + 1);
    const float* row_in2 = input.Row(y - 1);
    float* JXL_RESTRICT row_out = diff_buffer;
    const auto match_gamma_offset_v = Set(df, match_gamma_offset);
    const auto quarter = Set(df, 0.25f);
    for (size_t x = 0; x < xsize; x += Lanes(df)) {
      const auto in = LoadU(df, row_in + x);
      const auto in_r = LoadU(df, row_in + x + 1);
      const auto in_l = LoadU(df, row_in + x - 1);
      const auto in_t = LoadU(df, row_in2 + x);
      const auto in_b = LoadU(df, row_in1 + x);
      const auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b)));
      const auto gammacv =
          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/false>(
              df, Add(in, match_gamma_offset_v));
      auto diff = Mul(gammacv, Sub(in, base));
      diff = Mul(diff, diff);
      diff = Min(diff, Set(df, limit));
      diff = MaskingSqrt(df, diff);
      if (iy % 4 != 0) {
        diff = Add(diff, LoadU(df, row_out + x));
      }
      StoreU(diff, df, row_out + x);
    }
    if (iy % 4 == 3) {
      size_t y_out = y0_out + iy / 4;
      float* row_d_out = pre_erosion->Row(y_out);
      for (size_t x = 0; x < xsize_out; x++) {
        row_d_out[x] = (row_out[x * 4] + row_out[x * 4 + 1] +
                        row_out[x * 4 + 2] + row_out[x * 4 + 3]) *
                       0.25f;
      }
      pre_erosion->PadRow(y_out, xsize_out, border);
    }
  }
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace jpegli
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace jpegli {

HWY_EXPORT(ComputePreErosion);
HWY_EXPORT(FuzzyErosion);
HWY_EXPORT(PerBlockModulations);

constexpr int kPreErosionBorder = 1;
void ComputeAdaptiveQuantField(j_compress_ptr cinfo) {
  jpeg_comp_master* m = cinfo->master;
  if (!m->use_adaptive_quantization) {
    return;
  }
  int y_channel = cinfo->jpeg_color_space == JCS_RGB ? 1 : 0;
  jpeg_component_info* y_comp = &cinfo->comp_info[y_channel];
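  // quantval[1] is the quantization step of the first AC coefficient in the
  // luma table; it serves as a rough proxy for the requested quality and
  // drives the dampening ramp in PerBlockModulations.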
  int y_quant_01 = cinfo->quant_tbl_ptrs[y_comp->quant_tbl_no]->quantval[1];
  if (m->next_iMCU_row == 0) {
    m->input_buffer[y_channel].CopyRow(-1, 0, 1);
  }
  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
    size_t last_row = m->ysize_blocks * DCTSIZE - 1;
    m->input_buffer[y_channel].CopyRow(last_row + 1, last_row, 1);
  }
  const RowBuffer<float>& input = m->input_buffer[y_channel];
  const size_t xsize_blocks = y_comp->width_in_blocks;
  const size_t xsize = xsize_blocks * DCTSIZE;
  const size_t yb0 = m->next_iMCU_row * cinfo->max_v_samp_factor;
  const size_t yblen = cinfo->max_v_samp_factor;
  size_t y0 = yb0 * DCTSIZE;
  size_t ylen = cinfo->max_v_samp_factor * DCTSIZE;
  if (y0 == 0) {
    ylen += 4;
  } else {
    y0 += 4;
  }
  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
    ylen -= 4;
  }
  HWY_DYNAMIC_DISPATCH(ComputePreErosion)
  (input, xsize, y0, ylen, kPreErosionBorder, m->diff_buffer, &m->pre_erosion);
  if (y0 == 0) {
    m->pre_erosion.CopyRow(-1, 0, kPreErosionBorder);
  }
  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
    size_t last_row = m->ysize_blocks * 2 - 1;
    m->pre_erosion.CopyRow(last_row + 1, last_row, kPreErosionBorder);
  }
  HWY_DYNAMIC_DISPATCH(FuzzyErosion)
  (m->pre_erosion, yb0, yblen, &m->fuzzy_erosion_tmp, &m->quant_field);
  HWY_DYNAMIC_DISPATCH(PerBlockModulations)
  (y_quant_01, input, yb0, yblen, &m->quant_field);
  for (int y = 0; y < cinfo->max_v_samp_factor; ++y) {
    float* row = m->quant_field.Row(yb0 + y);
    for (size_t x = 0; x < xsize_blocks; ++x) {
      row[x] = std::max(0.0f, (0.6f / row[x]) - 1.0f);
    }
  }
}
}  // namespace jpegli
#endif  // HWY_ONCE