// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/jpegli/adaptive_quantization.h"

#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/jpegli/adaptive_quantization.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>
#include "lib/jpegli/encode_internal.h"
#include "lib/jxl/base/compiler_specific.h"
#include "lib/jxl/base/status.h"

HWY_BEFORE_NAMESPACE();
namespace jpegli {
namespace HWY_NAMESPACE {
// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::AbsDiff;
using hwy::HWY_NAMESPACE::Add;
using hwy::HWY_NAMESPACE::And;
using hwy::HWY_NAMESPACE::Div;
using hwy::HWY_NAMESPACE::Floor;
using hwy::HWY_NAMESPACE::GetLane;
using hwy::HWY_NAMESPACE::Max;
using hwy::HWY_NAMESPACE::Min;
using hwy::HWY_NAMESPACE::Mul;
using hwy::HWY_NAMESPACE::MulAdd;
using hwy::HWY_NAMESPACE::NegMulAdd;
using hwy::HWY_NAMESPACE::Rebind;
using hwy::HWY_NAMESPACE::ShiftLeft;
using hwy::HWY_NAMESPACE::ShiftRight;
using hwy::HWY_NAMESPACE::Sqrt;
using hwy::HWY_NAMESPACE::Sub;
using hwy::HWY_NAMESPACE::ZeroIfNegative;
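// jpegli's input rows hold samples on a [0, 255] scale, while the tuning
// constants below come from jxl, where intensities live in [0, 1];
// kInputScaling bridges the two conventions wherever they meet.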
constexpr float kInputScaling = 1.0f / 255.0f;

// Replicates a scalar constant across the four lanes of a 128-bit block.
#define HWY_REP4(literal) literal, literal, literal, literal
// Primary template: default to actual division.
template <typename T, class V>
struct FastDivision {
  HWY_INLINE V operator()(const V n, const V d) const { return n / d; }
};
// Partial specialization for float vectors.
template <class V>
struct FastDivision<float, V> {
  // One Newton-Raphson iteration.
  static HWY_INLINE V ReciprocalNR(const V x) {
    const auto rcp = ApproximateReciprocal(x);
    const auto sum = Add(rcp, rcp);
    const auto x_rcp = Mul(x, rcp);
    return NegMulAdd(x_rcp, rcp, sum);
  }
  V operator()(const V n, const V d) const {
#if JXL_TRUE  // Faster on SKX
    return Div(n, d);
#else
    return n * ReciprocalNR(d);
#endif
  }
};
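// Note on ReciprocalNR above: starting from r0 = ApproximateReciprocal(x),
// one Newton-Raphson step computes r1 = 2 * r0 - x * r0 * r0, which roughly
// doubles the number of correct bits; NegMulAdd(x * r0, r0, r0 + r0) is
// exactly that expression.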
// Approximates smooth functions via rational polynomials (i.e. dividing two
// polynomials). Evaluates polynomials via Horner's scheme, which is faster
// than Clenshaw recurrence for Chebyshev polynomials. LoadDup128 allows us to
// specify constants (replicated 4x) independently of the lane count.
template <size_t NP, size_t NQ, class D, class V, typename T>
HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x,
                                                     const T (&p)[NP],
                                                     const T (&q)[NQ]) {
  constexpr size_t kDegP = NP / 4 - 1;
  constexpr size_t kDegQ = NQ / 4 - 1;
  auto yp = LoadDup128(d, &p[kDegP * 4]);
  auto yq = LoadDup128(d, &q[kDegQ * 4]);
  // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a
  // compiler warning that the index is out of bounds: the (kDegP >= n) guard
  // already ensures the access is in bounds, and when it is not taken the
  // access is optimized away. Similarly with q and kDegQ.
  if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4)));
  if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4)));
  if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4)));
  if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4)));
  if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4)));
  if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4)));
  if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4)));
  if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4)));
  if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4)));
  if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4)));
  if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4)));
  if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4)));
  if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4)));
  if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4)));
  return FastDivision<T, V>()(yp, yq);
}
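// Example: FastLog2f below passes 12-entry tables (NP == NQ == 4 * (2 + 1)),
// i.e. a degree-2 / degree-2 rational approximation, so only the first two
// MulAdd steps of each chain survive; the remaining branches are
// compile-time false and disappear.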
// Computes base-2 logarithm like std::log2. Undefined if negative / NaN.
template <class DF, class V>
V FastLog2f(const DF df, V x) {
  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
                                          HWY_REP4(1.4287160470083755E+00f),
                                          HWY_REP4(7.4245873327820566E-01f)};
  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
                                          HWY_REP4(1.0096718572241148E+00f),
                                          HWY_REP4(1.7409343003366853E-01f)};
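  // What follows splits x into 2^e * m with m in [2/3, 4/3): 0x3f2aaaab is
  // the bit pattern of approximately 2/3, so subtracting it and shifting
  // right by 23 yields the integer exponent e, and clearing those exponent
  // bits leaves the mantissa m. log2(x) is then e + log2(m), with log2(m)
  // evaluated by the rational polynomial above via log1p(m - 1) / log(2).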
  const Rebind<int32_t, DF> di;
  const auto x_bits = BitCast(di, x);
  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
  // Shifted exponent = log2; also used to clear mantissa.
  const auto exp_shifted = ShiftRight<23>(exp_bits);
  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
  const auto exp_val = ConvertTo(df, exp_shifted);
  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
             exp_val);
}
// max relative error ~3e-7
template <class DF, class V>
V FastPow2f(const DF df, V x) {
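  // 2^x = 2^floor(x) * 2^frac(x): the integer part is built directly in the
  // float exponent field (floor(x) + 127, shifted into bits 23..30), and the
  // fractional part is approximated by a rational polynomial whose value is
  // folded into the numerator before the final division.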
  const Rebind<int32_t, DF> di;
  auto floorx = Floor(x);
  auto exp =
      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
  auto frac = Sub(x, floorx);
  auto num = Add(frac, Set(df, 1.01749063e+01));
  num = MulAdd(num, frac, Set(df, 4.88687798e+01));
  num = MulAdd(num, frac, Set(df, 9.85506591e+01));
  num = Mul(num, exp);
  auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
  den = MulAdd(den, frac, Set(df, -1.94414990e+01));
  den = MulAdd(den, frac, Set(df, 9.85506633e+01));
  return Div(num, den);
}
inline float FastPow2f(float f) {
  HWY_CAPPED(float, 1) D;
  return GetLane(FastPow2f(D, Set(D, f)));
}
// The following functions modulate an exponent (out_val) and return the
// updated value. Their descriptor is limited to 8 lanes for 8x8 blocks.
template <class D, class V>
V ComputeMask(const D d, const V out_val) {
  const auto kBase = Set(d, -0.74174993f);
  const auto kMul4 = Set(d, 3.2353257320940401f);
  const auto kMul2 = Set(d, 12.906028311180409f);
  const auto kOffset2 = Set(d, 305.04035728311436f);
  const auto kMul3 = Set(d, 5.0220313103171232f);
  const auto kOffset3 = Set(d, 2.1925739705298404f);
  const auto kOffset4 = Mul(Set(d, 0.25f), kOffset3);
  const auto kMul0 = Set(d, 0.74760422233706747f);
  const auto k1 = Set(d, 1.0f);
  // Avoid division by zero.
  const auto v1 = Max(Mul(out_val, kMul0), Set(d, 1e-3f));
  const auto v2 = Div(k1, Add(v1, kOffset2));
  const auto v3 = Div(k1, MulAdd(v1, v1, kOffset3));
  const auto v4 = Div(k1, MulAdd(v1, v1, kOffset4));
  // A log or two here could make sense. In butteraugli we effectively have
  // log(log(x + C)) for this kind of use: a single log is used in saturating
  // visual masking, and since the modulation values here are exponential,
  // another log would counter that.
  return Add(kBase, MulAdd(kMul4, v4, MulAdd(kMul2, v2, Mul(kMul3, v3))));
}
// mul and mul2 represent a scaling difference between jxl and butteraugli.
const float kSGmul = 226.0480446705883f;
const float kSGmul2 = 1.0f / 73.377132366608819f;
const float kLog2 = 0.693147181f;
// Includes correction factor for std::log -> log2.
const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2;
const float kSGVOffset = 7.14672470003f;
template <bool invert, typename D, typename V>
V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
  // The opsin space in jxl is the cubic root of photons, i.e., v * v * v
  // is related to the number of photons.
  //
  // SimpleGamma(v * v * v) is the psychovisual space in butteraugli.
  // This ratio allows quantization to move from jxl's opsin space to
  // butteraugli's log-gamma space.
  static const float kEpsilon = 1e-2;
  static const float kNumOffset = kEpsilon / kInputScaling / kInputScaling;
  static const float kNumMul = kSGRetMul * 3 * kSGmul;
  static const float kVOffset = (kSGVOffset * kLog2 + kEpsilon) / kInputScaling;
  static const float kDenMul = kLog2 * kSGmul * kInputScaling * kInputScaling;
  v = ZeroIfNegative(v);
  const auto num_mul = Set(d, kNumMul);
  const auto num_offset = Set(d, kNumOffset);
  const auto den_offset = Set(d, kVOffset);
  const auto den_mul = Set(d, kDenMul);

  const auto v2 = Mul(v, v);

  const auto num = MulAdd(num_mul, v2, num_offset);
  const auto den = MulAdd(Mul(den_mul, v), v2, den_offset);
  return invert ? Div(num, den) : Div(den, num);
}
template <bool invert = false>
float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) {
  using DScalar = HWY_CAPPED(float, 1);
  auto vscalar = Load(DScalar(), &v);
  return GetLane(
      RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(DScalar(), vscalar));
}
// TODO(veluca): this function computes an approximation of the derivative of
// SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or
// exact derivatives. For reference, SimpleGamma was:
/*
template <typename D, typename V>
V SimpleGamma(const D d, V v) {
  // A simple HDR compatible gamma function.
  const auto mul = Set(d, kSGmul);
  const auto kRetMul = Set(d, kSGRetMul);
  const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f);
  const auto kVOffset = Set(d, kSGVOffset);

  v *= mul;

  // This should happen rarely, but may lead to a NaN, which is rather
  // undesirable. Since negative photons don't exist we solve the NaNs by
  // clamping here.
  // TODO(veluca): with FastLog2f, this no longer leads to NaNs.
  v = ZeroIfNegative(v);
  return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd;
}
*/
template <class D, class V>
V GammaModulation(const D d, const size_t x, const size_t y,
                  const RowBuffer<float>& input, const V out_val) {
  static const float kBias = 0.16f / kInputScaling;
  static const float kScale = kInputScaling / 64.0f;
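  // kScale folds two factors: 1/64 averages the per-pixel ratios over the 8x8
  // block, and the kInputScaling factor cancels the 1/kInputScaling that each
  // ratio carries from the [0, 255]-scaled input.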
  auto overall_ratio = Zero(d);
  const auto bias = Set(d, kBias);
  const auto scale = Set(d, kScale);
  const float* const JXL_RESTRICT block_start = input.Row(y) + x;
  for (size_t dy = 0; dy < 8; ++dy) {
    const float* const JXL_RESTRICT row_in = block_start + dy * input.stride();
    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
      const auto iny = Add(Load(d, row_in + dx), bias);
      const auto ratio_g =
          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, iny);
      overall_ratio = Add(overall_ratio, ratio_g);
    }
  }
  overall_ratio = Mul(SumOfLanes(d, overall_ratio), scale);
  // ideally -1.0, but likely optimal correction adds some entropy, so slightly
  // less than that.
  // ln(2) constant folded in because we want std::log but have FastLog2f.
  const auto kGamma = Set(d, -0.15526878023684174f * 0.693147180559945f);
  return MulAdd(kGamma, FastLog2f(d, overall_ratio), out_val);
}
// Change precision in 8x8 blocks that have high frequency content.
template <class D, class V>
V HfModulation(const D d, const size_t x, const size_t y,
               const RowBuffer<float>& input, const V out_val) {
  // Zero out the invalid differences for the rightmost value per row.
  const Rebind<uint32_t, D> du;
  HWY_ALIGN constexpr uint32_t kMaskRight[8] = {~0u, ~0u, ~0u, ~0u,
                                                ~0u, ~0u, ~0u, 0};
  auto sum = Zero(d);  // sum of absolute differences with right and below
  static const float kSumCoeff = -2.0052193233688884f * kInputScaling / 112.0;
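  // 112 = 2 * 8 * 7: each 8x8 block contributes 56 valid horizontal
  // differences (the rightmost column is masked out) and 56 valid vertical
  // differences (the last row is compared against itself), so dividing by 112
  // turns the sum into a mean absolute difference.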
  auto sumcoeff = Set(d, kSumCoeff);
  const float* const JXL_RESTRICT block_start = input.Row(y) + x;
  for (size_t dy = 0; dy < 8; ++dy) {
    const float* JXL_RESTRICT row_in = block_start + dy * input.stride();
    const float* JXL_RESTRICT row_in_next =
        dy == 7 ? row_in : row_in + input.stride();

    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
      const auto p = Load(d, row_in + dx);
      const auto pr = LoadU(d, row_in + dx + 1);
      const auto mask = BitCast(d, Load(du, kMaskRight + dx));
      sum = Add(sum, And(mask, AbsDiff(p, pr)));
      const auto pd = Load(d, row_in_next + dx);
      sum = Add(sum, AbsDiff(p, pd));
    }
  }
  sum = SumOfLanes(d, sum);
  return MulAdd(sum, sumcoeff, out_val);
}
void PerBlockModulations(const float y_quant_01, const RowBuffer<float>& input,
                         const size_t yb0, const size_t yblen,
                         RowBuffer<float>* aq_map) {
  static const float kAcQuant = 0.841f;
  float base_level = 0.48f * kAcQuant;
  float kDampenRampStart = 9.0f;
  float kDampenRampEnd = 65.0f;
  float dampen = 1.0f;
  if (y_quant_01 >= kDampenRampStart) {
    dampen = 1.0f - ((y_quant_01 - kDampenRampStart) /
                     (kDampenRampEnd - kDampenRampStart));
    if (dampen < 0) {
      dampen = 0;
    }
  }
  const float mul = kAcQuant * dampen;
  const float add = (1.0f - dampen) * base_level;
  for (size_t iy = 0; iy < yblen; iy++) {
    const size_t yb = yb0 + iy;
    const size_t y = yb * 8;
    float* const JXL_RESTRICT row_out = aq_map->Row(yb);
    const HWY_CAPPED(float, 8) df;
    for (size_t ix = 0; ix < aq_map->xsize(); ix++) {
      size_t x = ix * 8;
      auto out_val = Set(df, row_out[ix]);
      out_val = ComputeMask(df, out_val);
      out_val = HfModulation(df, x, y, input, out_val);
      out_val = GammaModulation(df, x, y, input, out_val);
      // We want multiplicative quantization field, so everything
      // until this point has been modulating the exponent.
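      // 1.442695041 = 1 / ln(2): out_val holds a natural-log exponent, and
      // FastPow2f expects a base-2 one, so exp(out_val) == 2^(out_val / ln 2).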
      row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add;
    }
  }
}
template <typename D, typename V>
V MaskingSqrt(const D d, V v) {
  static const float kLogOffset = 28;
  static const float kMul = 211.50759899638012f;
  const auto mul_v = Set(d, kMul * 1e8);
  const auto offset_v = Set(d, kLogOffset);
  return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, Sqrt(mul_v), offset_v)));
}
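// Sorts the four per-lane values so that min0 <= min1 <= min2 <= min3
// afterwards (a small sorting network applied independently in each lane).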
template <typename V>
void Sort4(V& min0, V& min1, V& min2, V& min3) {
  const auto tmp0 = Min(min0, min1);
  const auto tmp1 = Max(min0, min1);
  const auto tmp2 = Min(min2, min3);
  const auto tmp3 = Max(min2, min3);
  const auto tmp4 = Max(tmp0, tmp2);
  const auto tmp5 = Min(tmp1, tmp3);
  min0 = Min(tmp0, tmp2);
  min1 = Min(tmp4, tmp5);
  min2 = Max(tmp4, tmp5);
  min3 = Max(tmp1, tmp3);
}
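// Inserts v into the sorted quadruple (min0 <= ... <= min3), keeping the four
// smallest values seen so far in sorted order and discarding the largest.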
template <typename V>
void UpdateMin4(const V v, V& min0, V& min1, V& min2, V& min3) {
  const auto tmp0 = Max(min0, v);
  const auto tmp1 = Max(min1, tmp0);
  const auto tmp2 = Max(min2, tmp1);
  min0 = Min(min0, v);
  min1 = Min(min1, tmp0);
  min2 = Min(min2, tmp1);
  min3 = Min(min3, tmp2);
}
// Computes a linear combination of the 4 lowest values of the 3x3 neighborhood
// of each pixel. Output is downsampled 2x.
void FuzzyErosion(const RowBuffer<float>& pre_erosion, const size_t yb0,
                  const size_t yblen, RowBuffer<float>* tmp,
                  RowBuffer<float>* aq_map) {
  int xsize_blocks = aq_map->xsize();
  int xsize = pre_erosion.xsize();
  // Vector descriptor (assumed capped at 8 lanes, as in the other row loops
  // in this file).
  const HWY_CAPPED(float, 8) d;
  const auto mul0 = Set(d, 0.125f);
  const auto mul1 = Set(d, 0.075f);
  const auto mul2 = Set(d, 0.06f);
  const auto mul3 = Set(d, 0.05f);
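  // The weights above (0.125 + 0.075 + 0.06 + 0.05 = 0.31) put most mass on
  // the smallest of the four kept values; the 2x2 aggregation at the end of
  // the loop then sums four such outputs per output block.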
  for (size_t iy = 0; iy < 2 * yblen; ++iy) {
    size_t y = 2 * yb0 + iy;
    const float* JXL_RESTRICT rowt = pre_erosion.Row(y - 1);
    const float* JXL_RESTRICT rowm = pre_erosion.Row(y);
    const float* JXL_RESTRICT rowb = pre_erosion.Row(y + 1);
    float* row_out = tmp->Row(y);
    for (int x = 0; x < xsize; x += Lanes(d)) {
      int xm1 = x - 1;
      int xp1 = x + 1;
      auto min0 = LoadU(d, rowm + x);
      auto min1 = LoadU(d, rowm + xm1);
      auto min2 = LoadU(d, rowm + xp1);
      auto min3 = LoadU(d, rowt + xm1);
      Sort4(min0, min1, min2, min3);
      UpdateMin4(LoadU(d, rowt + x), min0, min1, min2, min3);
      UpdateMin4(LoadU(d, rowt + xp1), min0, min1, min2, min3);
      UpdateMin4(LoadU(d, rowb + xm1), min0, min1, min2, min3);
      UpdateMin4(LoadU(d, rowb + x), min0, min1, min2, min3);
      UpdateMin4(LoadU(d, rowb + xp1), min0, min1, min2, min3);
      const auto v = Add(Add(Mul(mul0, min0), Mul(mul1, min1)),
                         Add(Mul(mul2, min2), Mul(mul3, min3)));
      Store(v, d, row_out + x);
    }
    if (iy % 2 == 1) {
      const float* JXL_RESTRICT row_out0 = tmp->Row(y - 1);
      float* JXL_RESTRICT aq_out = aq_map->Row(yb0 + iy / 2);
      for (int bx = 0, x = 0; bx < xsize_blocks; ++bx, x += 2) {
        aq_out[bx] =
            (row_out[x] + row_out[x + 1] + row_out0[x] + row_out0[x + 1]);
      }
    }
  }
}
void ComputePreErosion(const RowBuffer<float>& input, const size_t xsize,
                       const size_t y0, const size_t ylen, int border,
                       float* diff_buffer, RowBuffer<float>* pre_erosion) {
  const size_t xsize_out = xsize / 4;
  const size_t y0_out = y0 / 4;
  // The XYB gamma is 3.0 to be able to decode faster with two muls.
  // Butteraugli's gamma matches the gamma of the human eye, around 2.6.
  // We approximate the gamma difference by adding one cubic root into
  // the adaptive quantization. This gives us a total gamma of 2.6666
  // for quantization uses.
  static const float match_gamma_offset = 0.019 / kInputScaling;
  const HWY_CAPPED(float, 8) df;

  static const float limit = 0.2f;
  // Computes image (padded to multiple of 8x8) of local pixel differences.
  // Subsample both directions by 4.
  for (size_t iy = 0; iy < ylen; ++iy) {
    size_t y = y0 + iy;
    const float* row_in = input.Row(y);
    const float* row_in1 = input.Row(y + 1);
    const float* row_in2 = input.Row(y - 1);
    float* JXL_RESTRICT row_out = diff_buffer;
    const auto match_gamma_offset_v = Set(df, match_gamma_offset);
    const auto quarter = Set(df, 0.25f);
    for (size_t x = 0; x < xsize; x += Lanes(df)) {
      const auto in = LoadU(df, row_in + x);
      const auto in_r = LoadU(df, row_in + x + 1);
      const auto in_l = LoadU(df, row_in + x - 1);
      const auto in_t = LoadU(df, row_in2 + x);
      const auto in_b = LoadU(df, row_in1 + x);
      const auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b)));
      const auto gammacv =
          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/false>(
              df, Add(in, match_gamma_offset_v));
      auto diff = Mul(gammacv, Sub(in, base));
      diff = Mul(diff, diff);
      diff = Min(diff, Set(df, limit));
      diff = MaskingSqrt(df, diff);
      if (iy % 4 != 0) {
        diff = Add(diff, LoadU(df, row_out + x));
      }
      StoreU(diff, df, row_out + x);
    }
    if (iy % 4 == 3) {
      size_t y_out = y0_out + iy / 4;
      float* row_d_out = pre_erosion->Row(y_out);
      for (size_t x = 0; x < xsize_out; x++) {
        row_d_out[x] = (row_out[x * 4] + row_out[x * 4 + 1] +
                        row_out[x * 4 + 2] + row_out[x * 4 + 3]) *
                       0.25f;
      }
      pre_erosion->PadRow(y_out, xsize_out, border);
    }
  }
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace jpegli
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace jpegli {

HWY_EXPORT(ComputePreErosion);
HWY_EXPORT(FuzzyErosion);
HWY_EXPORT(PerBlockModulations);

constexpr int kPreErosionBorder = 1;
void ComputeAdaptiveQuantField(j_compress_ptr cinfo) {
  jpeg_comp_master* m = cinfo->master;
  if (!m->use_adaptive_quantization) {
    return;
  }
  int y_channel = cinfo->jpeg_color_space == JCS_RGB ? 1 : 0;
  jpeg_component_info* y_comp = &cinfo->comp_info[y_channel];
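  // quantval[1] is the quantization step of the first AC coefficient in the
  // luma table; it serves as a rough proxy for the requested quality and
  // drives the dampening ramp in PerBlockModulations.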
  int y_quant_01 = cinfo->quant_tbl_ptrs[y_comp->quant_tbl_no]->quantval[1];
  if (m->next_iMCU_row == 0) {
    m->input_buffer[y_channel].CopyRow(-1, 0, 1);
  }
  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
    size_t last_row = m->ysize_blocks * DCTSIZE - 1;
    m->input_buffer[y_channel].CopyRow(last_row + 1, last_row, 1);
  }
  const RowBuffer<float>& input = m->input_buffer[y_channel];
  const size_t xsize_blocks = y_comp->width_in_blocks;
  const size_t xsize = xsize_blocks * DCTSIZE;
  const size_t yb0 = m->next_iMCU_row * cinfo->max_v_samp_factor;
  const size_t yblen = cinfo->max_v_samp_factor;
  size_t y0 = yb0 * DCTSIZE;
  size_t ylen = cinfo->max_v_samp_factor * DCTSIZE;
  if (y0 == 0) {
    ylen += 4;
  } else {
    y0 += 4;
  }
  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
    ylen -= 4;
  }
  HWY_DYNAMIC_DISPATCH(ComputePreErosion)
  (input, xsize, y0, ylen, kPreErosionBorder, m->diff_buffer, &m->pre_erosion);
  if (y0 == 0) {
    m->pre_erosion.CopyRow(-1, 0, kPreErosionBorder);
  }
  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
    size_t last_row = m->ysize_blocks * 2 - 1;
    m->pre_erosion.CopyRow(last_row + 1, last_row, kPreErosionBorder);
  }
  HWY_DYNAMIC_DISPATCH(FuzzyErosion)
  (m->pre_erosion, yb0, yblen, &m->fuzzy_erosion_tmp, &m->quant_field);
  HWY_DYNAMIC_DISPATCH(PerBlockModulations)
  (y_quant_01, input, yb0, yblen, &m->quant_field);
  for (int y = 0; y < cinfo->max_v_samp_factor; ++y) {
    float* row = m->quant_field.Row(yb0 + y);
    for (size_t x = 0; x < xsize_blocks; ++x) {
      row[x] = std::max(0.0f, (0.6f / row[x]) - 1.0f);
    }
  }
}
}  // namespace jpegli
#endif  // HWY_ONCE