// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#if defined(LIB_JPEGLI_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef LIB_JPEGLI_DCT_INL_H_
#undef LIB_JPEGLI_DCT_INL_H_
#else
#define LIB_JPEGLI_DCT_INL_H_
#endif

#include <cmath>  // std::abs, std::round (assumed; may be transitive upstream)

#include "lib/jpegli/transpose-inl.h"
#include "lib/jxl/base/compiler_specific.h"

HWY_BEFORE_NAMESPACE();
namespace jpegli {
namespace HWY_NAMESPACE {
namespace {

// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::Abs;
using hwy::HWY_NAMESPACE::Add;
using hwy::HWY_NAMESPACE::DemoteTo;
using hwy::HWY_NAMESPACE::Ge;
using hwy::HWY_NAMESPACE::IfThenElseZero;
using hwy::HWY_NAMESPACE::Mul;
using hwy::HWY_NAMESPACE::MulAdd;
using hwy::HWY_NAMESPACE::Rebind;
using hwy::HWY_NAMESPACE::Round;
using hwy::HWY_NAMESPACE::Sub;
using hwy::HWY_NAMESPACE::Vec;

using D = HWY_FULL(float);
using DI = HWY_FULL(int32_t);

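// The helpers below build an 8-point DCT out of recursive halving steps.
// Throughout, one "element" is a vector of (up to) 8 floats, so a length-N
// 1-D DCT acts on an N x 8 scratch array, transforming eight image columns
// at once.
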
template <size_t N>
void AddReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2,
                float* JXL_RESTRICT aout) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < N; i++) {
    auto in1 = Load(d8, ain1 + i * 8);
    auto in2 = Load(d8, ain2 + (N - i - 1) * 8);
    Store(Add(in1, in2), d8, aout + i * 8);
  }
}

template <size_t N>
void SubReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2,
                float* JXL_RESTRICT aout) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < N; i++) {
    auto in1 = Load(d8, ain1 + i * 8);
    auto in2 = Load(d8, ain2 + (N - i - 1) * 8);
    Store(Sub(in1, in2), d8, aout + i * 8);
  }
}

template <size_t N>
void B(float* JXL_RESTRICT coeff) {
  HWY_CAPPED(float, 8) d8;
  constexpr float kSqrt2 = 1.41421356237f;
  auto sqrt2 = Set(d8, kSqrt2);
  auto in1 = Load(d8, coeff);
  auto in2 = Load(d8, coeff + 8);
  Store(MulAdd(in1, sqrt2, in2), d8, coeff);
  for (size_t i = 1; i + 1 < N; i++) {
    auto in1 = Load(d8, coeff + i * 8);
    auto in2 = Load(d8, coeff + (i + 1) * 8);
    Store(Add(in1, in2), d8, coeff + i * 8);
  }
}

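// Per column, B above computes coeff[0] = sqrt(2) * coeff[0] + coeff[1] and
// coeff[i] += coeff[i + 1] for 0 < i < N - 1; the last element is unchanged.
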
// Ideally optimized away by compiler (except the multiply).
template <size_t N>
void InverseEvenOdd(const float* JXL_RESTRICT ain, float* JXL_RESTRICT aout) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < N / 2; i++) {
    auto in1 = Load(d8, ain + i * 8);
    Store(in1, d8, aout + 2 * i * 8);
  }
  for (size_t i = N / 2; i < N; i++) {
    auto in1 = Load(d8, ain + i * 8);
    Store(in1, d8, aout + (2 * (i - N / 2) + 1) * 8);
  }
}

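// Per column this is aout[2 * i] = ain[i] and aout[2 * i + 1] = ain[N / 2 + i],
// i.e. it re-interleaves the even/odd halves produced by the split above.
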
// Constants for DCT implementation. Generated by the following snippet:
//   for i in range(N // 2):
//     print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
template <size_t N>
struct WcMultipliers;

template <>
struct WcMultipliers<4> {
  static constexpr float kMultipliers[] = {
      0.541196100146197,
      1.3065629648763764,
  };
};

template <>
struct WcMultipliers<8> {
  static constexpr float kMultipliers[] = {
      0.5097955791041592,
      0.6013448869350453,
      0.8999762231364156,
      2.5629154477415055,
  };
};

// Out-of-line definitions for the constexpr arrays (required pre-C++17).
constexpr float WcMultipliers<4>::kMultipliers[];
constexpr float WcMultipliers<8>::kMultipliers[];

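// Sanity check of the snippet above for N = 4:
//   1 / (2 * cos(0.5 * pi / 4)) = 1 / (2 * cos(pi / 8))     ~= 0.5412
//   1 / (2 * cos(1.5 * pi / 4)) = 1 / (2 * cos(3 * pi / 8)) ~= 1.3066
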
// Invoked on full vector.
template <size_t N>
void Multiply(float* JXL_RESTRICT coeff) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < N / 2; i++) {
    auto in1 = Load(d8, coeff + (N / 2 + i) * 8);
    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
    Store(Mul(in1, mul), d8, coeff + (N / 2 + i) * 8);
  }
}

void LoadFromBlock(const float* JXL_RESTRICT pixels, size_t pixels_stride,
                   size_t off, float* JXL_RESTRICT coeff) {
  HWY_CAPPED(float, 8) d8;
  for (size_t i = 0; i < 8; i++) {
    Store(LoadU(d8, pixels + i * pixels_stride + off), d8, coeff + i * 8);
  }
}

void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, float* output,
                          size_t off) {
  HWY_CAPPED(float, 8) d8;
  auto mul = Set(d8, 1.0f / 8);
  for (size_t i = 0; i < 8; i++) {
    StoreU(Mul(mul, Load(d8, coeff + i * 8)), d8, output + i * 8 + off);
  }
}

template <size_t N>
struct DCT1DImpl;

template <>
struct DCT1DImpl<1> {
  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
};

template <>
struct DCT1DImpl<2> {
  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
    HWY_CAPPED(float, 8) d8;
    auto in1 = Load(d8, mem);
    auto in2 = Load(d8, mem + 8);
    Store(Add(in1, in2), d8, mem);
    Store(Sub(in1, in2), d8, mem + 8);
  }
};

template <size_t N>
struct DCT1DImpl {
  void operator()(float* JXL_RESTRICT mem) {
    HWY_ALIGN float tmp[N * 8];
    AddReverse<N / 2>(mem, mem + N * 4, tmp);
    DCT1DImpl<N / 2>()(tmp);
    SubReverse<N / 2>(mem, mem + N * 4, tmp + N * 4);
    Multiply<N>(tmp);
    DCT1DImpl<N / 2>()(tmp + N * 4);
    B<N / 2>(tmp + N * 4);
    InverseEvenOdd<N>(tmp, mem);
  }
};

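// This follows the classic even/odd DCT-II factorization: the sum of
// mirrored inputs (AddReverse) yields the even-index outputs via a half-size
// DCT, while the mirrored difference (SubReverse), scaled by the
// WcMultipliers cosines, transformed by a half-size DCT, and recombined by
// B, yields the odd-index outputs; InverseEvenOdd re-interleaves the halves.
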
void DCT1D(const float* JXL_RESTRICT pixels, size_t pixels_stride,
           float* JXL_RESTRICT output) {
  HWY_CAPPED(float, 8) d8;
  HWY_ALIGN float tmp[64];
  for (size_t i = 0; i < 8; i += Lanes(d8)) {
    // TODO(veluca): consider removing the temporary memory here (as is done in
    // IDCT), if it turns out that some compilers don't optimize away the loads
    // and this is performance-critical.
    LoadFromBlock(pixels, pixels_stride, i, tmp);
    DCT1DImpl<8>()(tmp);
    StoreToBlockAndScale(tmp, output, i);
  }
}

static JXL_INLINE JXL_MAYBE_UNUSED void TransformFromPixels(
    const float* JXL_RESTRICT pixels, size_t pixels_stride,
    float* JXL_RESTRICT coefficients, float* JXL_RESTRICT scratch_space) {
  DCT1D(pixels, pixels_stride, scratch_space);
  Transpose8x8Block(scratch_space, coefficients);
  DCT1D(coefficients, 8, scratch_space);
  Transpose8x8Block(scratch_space, coefficients);
}

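// Standard separable 2-D DCT: transform along one axis, transpose, transform
// again, transpose back. Each DCT1D pass scales by 1/8 (in
// StoreToBlockAndScale), so the final coefficients carry a 1/64 factor.
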
static JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
                                                            int16_t* out) {
  Rebind<int16_t, DI> di16;
  Store(DemoteTo(di16, ival), di16, out);
}

static JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
                                                            int32_t* out) {
  DI di;
  Store(ival, di, out);
}

template <typename T>
void QuantizeBlock(const float* dct, const float* qmc, float aq_strength,
                   const float* zero_bias_offset, const float* zero_bias_mul,
                   T* block) {
  D d;
  DI di;
  const auto aq_mul = Set(d, aq_strength);
  for (size_t k = 0; k < DCTSIZE2; k += Lanes(d)) {
    const auto val = Load(d, dct + k);
    const auto q = Load(d, qmc + k);
    const auto qval = Mul(val, q);
    const auto zb_offset = Load(d, zero_bias_offset + k);
    const auto zb_mul = Load(d, zero_bias_mul + k);
    const auto threshold = Add(zb_offset, Mul(zb_mul, aq_mul));
    const auto nzero_mask = Ge(Abs(qval), threshold);
    const auto ival = ConvertTo(di, IfThenElseZero(nzero_mask, Round(qval)));
    StoreQuantizedValue(ival, block + k);
  }
}

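// Scalar equivalent, per coefficient k:
//   qval = dct[k] * qmc[k];
//   block[k] = std::abs(qval) >= zero_bias_offset[k] + aq_strength * zero_bias_mul[k]
//                  ? std::round(qval) : 0;
// i.e. values inside the (adaptive) zero-bias dead zone are quantized to zero.
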
template <typename T>
void ComputeCoefficientBlock(const float* JXL_RESTRICT pixels, size_t stride,
                             const float* JXL_RESTRICT qmc,
                             int16_t last_dc_coeff, float aq_strength,
                             const float* zero_bias_offset,
                             const float* zero_bias_mul,
                             float* JXL_RESTRICT tmp, T* block) {
  float* JXL_RESTRICT dct = tmp;
  float* JXL_RESTRICT scratch_space = tmp + DCTSIZE2;
  TransformFromPixels(pixels, stride, dct, scratch_space);
  QuantizeBlock(dct, qmc, aq_strength, zero_bias_offset, zero_bias_mul, block);
  // Center DC values around zero.
  static constexpr float kDCBias = 128.0f;
  const float dc = (dct[0] - kDCBias) * qmc[0];
  float dc_threshold = zero_bias_offset[0] + aq_strength * zero_bias_mul[0];
  if (std::abs(dc - last_dc_coeff) < dc_threshold) {
    block[0] = last_dc_coeff;
  } else {
    block[0] = std::round(dc);
  }
}

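// Note on the DC path above: JPEG codes each block's DC coefficient as a
// difference from the previous block's DC, so snapping dc to last_dc_coeff
// (when within the zero-bias threshold) turns that difference into an exact
// zero, at the cost of a small rounding error in this block.
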
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace
}  // namespace HWY_NAMESPACE
}  // namespace jpegli
HWY_AFTER_NAMESPACE();
#endif  // LIB_JPEGLI_DCT_INL_H_