// Merge autoland to mozilla-central. a=merge
// [gecko.git] / third_party / jpeg-xl / lib / jpegli / dct-inl.h
// blob fb54a152d59c36a60a996b48e5125f37935841b8
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
6 #if defined(LIB_JPEGLI_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
7 #ifdef LIB_JPEGLI_DCT_INL_H_
8 #undef LIB_JPEGLI_DCT_INL_H_
9 #else
10 #define LIB_JPEGLI_DCT_INL_H_
11 #endif
13 #include "lib/jpegli/transpose-inl.h"
14 #include "lib/jxl/base/compiler_specific.h"
16 HWY_BEFORE_NAMESPACE();
17 namespace jpegli {
18 namespace HWY_NAMESPACE {
19 namespace {
21 // These templates are not found via ADL.
22 using hwy::HWY_NAMESPACE::Abs;
23 using hwy::HWY_NAMESPACE::Add;
24 using hwy::HWY_NAMESPACE::DemoteTo;
25 using hwy::HWY_NAMESPACE::Ge;
26 using hwy::HWY_NAMESPACE::IfThenElseZero;
27 using hwy::HWY_NAMESPACE::Mul;
28 using hwy::HWY_NAMESPACE::MulAdd;
29 using hwy::HWY_NAMESPACE::Rebind;
30 using hwy::HWY_NAMESPACE::Round;
31 using hwy::HWY_NAMESPACE::Sub;
32 using hwy::HWY_NAMESPACE::Vec;
34 using D = HWY_FULL(float);
35 using DI = HWY_FULL(int32_t);
37 template <size_t N>
38 void AddReverse(const float* JXL_RESTRICT a_in1,
39 const float* JXL_RESTRICT a_in2, float* JXL_RESTRICT a_out) {
40 HWY_CAPPED(float, 8) d8;
41 for (size_t i = 0; i < N; i++) {
42 auto in1 = Load(d8, a_in1 + i * 8);
43 auto in2 = Load(d8, a_in2 + (N - i - 1) * 8);
44 Store(Add(in1, in2), d8, a_out + i * 8);
48 template <size_t N>
49 void SubReverse(const float* JXL_RESTRICT a_in1,
50 const float* JXL_RESTRICT a_in2, float* JXL_RESTRICT a_out) {
51 HWY_CAPPED(float, 8) d8;
52 for (size_t i = 0; i < N; i++) {
53 auto in1 = Load(d8, a_in1 + i * 8);
54 auto in2 = Load(d8, a_in2 + (N - i - 1) * 8);
55 Store(Sub(in1, in2), d8, a_out + i * 8);
59 template <size_t N>
60 void B(float* JXL_RESTRICT coeff) {
61 HWY_CAPPED(float, 8) d8;
62 constexpr float kSqrt2 = 1.41421356237f;
63 auto sqrt2 = Set(d8, kSqrt2);
64 auto in1 = Load(d8, coeff);
65 auto in2 = Load(d8, coeff + 8);
66 Store(MulAdd(in1, sqrt2, in2), d8, coeff);
67 for (size_t i = 1; i + 1 < N; i++) {
68 auto in1 = Load(d8, coeff + i * 8);
69 auto in2 = Load(d8, coeff + (i + 1) * 8);
70 Store(Add(in1, in2), d8, coeff + i * 8);
74 // Ideally optimized away by compiler (except the multiply).
75 template <size_t N>
76 void InverseEvenOdd(const float* JXL_RESTRICT a_in, float* JXL_RESTRICT a_out) {
77 HWY_CAPPED(float, 8) d8;
78 for (size_t i = 0; i < N / 2; i++) {
79 auto in1 = Load(d8, a_in + i * 8);
80 Store(in1, d8, a_out + 2 * i * 8);
82 for (size_t i = N / 2; i < N; i++) {
83 auto in1 = Load(d8, a_in + i * 8);
84 Store(in1, d8, a_out + (2 * (i - N / 2) + 1) * 8);
// Constants for DCT implementation. Generated by the following snippet:
// for i in range(N // 2):
//   print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
template <size_t N>
struct WcMultipliers;

// N = 4: two weights, 1 / (2 cos((i + 0.5) * pi / 4)).
template <>
struct WcMultipliers<4> {
  static constexpr float kMultipliers[] = {
      0.541196100146197,
      1.3065629648763764,
  };
};

// N = 8: four weights, 1 / (2 cos((i + 0.5) * pi / 8)).
template <>
struct WcMultipliers<8> {
  static constexpr float kMultipliers[] = {
      0.5097955791041592,
      0.6013448869350453,
      0.8999762231364156,
      2.5629154477415055,
  };
};

// Pre-C++17, static constexpr members still need out-of-line definitions.
#if JXL_CXX_LANG < JXL_CXX_17
constexpr float WcMultipliers<4>::kMultipliers[];
constexpr float WcMultipliers<8>::kMultipliers[];
#endif
117 // Invoked on full vector.
118 template <size_t N>
119 void Multiply(float* JXL_RESTRICT coeff) {
120 HWY_CAPPED(float, 8) d8;
121 for (size_t i = 0; i < N / 2; i++) {
122 auto in1 = Load(d8, coeff + (N / 2 + i) * 8);
123 auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
124 Store(Mul(in1, mul), d8, coeff + (N / 2 + i) * 8);
128 void LoadFromBlock(const float* JXL_RESTRICT pixels, size_t pixels_stride,
129 size_t off, float* JXL_RESTRICT coeff) {
130 HWY_CAPPED(float, 8) d8;
131 for (size_t i = 0; i < 8; i++) {
132 Store(LoadU(d8, pixels + i * pixels_stride + off), d8, coeff + i * 8);
136 void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, float* output,
137 size_t off) {
138 HWY_CAPPED(float, 8) d8;
139 auto mul = Set(d8, 1.0f / 8);
140 for (size_t i = 0; i < 8; i++) {
141 StoreU(Mul(mul, Load(d8, coeff + i * 8)), d8, output + i * 8 + off);
145 template <size_t N>
146 struct DCT1DImpl;
148 template <>
149 struct DCT1DImpl<1> {
150 JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
153 template <>
154 struct DCT1DImpl<2> {
155 JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
156 HWY_CAPPED(float, 8) d8;
157 auto in1 = Load(d8, mem);
158 auto in2 = Load(d8, mem + 8);
159 Store(Add(in1, in2), d8, mem);
160 Store(Sub(in1, in2), d8, mem + 8);
// General case: decomposes the N-point DCT into two N/2-point DCTs.
// `mem` holds N rows of 8 floats; the transform runs on 8 lanes in parallel.
template <size_t N>
struct DCT1DImpl {
  void operator()(float* JXL_RESTRICT mem) {
    HWY_ALIGN float tmp[N * 8];
    // Even half: sums of mirrored rows (mem[i] + mem[N-1-i]) go through an
    // N/2-point DCT in the first half of tmp.
    AddReverse<N / 2>(mem, mem + N * 4, tmp);
    DCT1DImpl<N / 2>()(tmp);
    // Odd half: differences of mirrored rows land in the second half of tmp,
    // get scaled by the 1/(2 cos(...)) weights (Multiply touches rows
    // [N/2, N)), then go through another N/2-point DCT and the B post-pass.
    SubReverse<N / 2>(mem, mem + N * 4, tmp + N * 4);
    Multiply<N>(tmp);
    DCT1DImpl<N / 2>()(tmp + N * 4);
    B<N / 2>(tmp + N * 4);
    // Interleave even/odd halves back into natural coefficient order in mem.
    InverseEvenOdd<N>(tmp, mem);
  }
};
178 void DCT1D(const float* JXL_RESTRICT pixels, size_t pixels_stride,
179 float* JXL_RESTRICT output) {
180 HWY_CAPPED(float, 8) d8;
181 HWY_ALIGN float tmp[64];
182 for (size_t i = 0; i < 8; i += Lanes(d8)) {
183 // TODO(veluca): consider removing the temporary memory here (as is done in
184 // IDCT), if it turns out that some compilers don't optimize away the loads
185 // and this is performance-critical.
186 LoadFromBlock(pixels, pixels_stride, i, tmp);
187 DCT1DImpl<8>()(tmp);
188 StoreToBlockAndScale(tmp, output, i);
// 2-D 8x8 forward DCT: 1-D DCT along one axis, transpose, 1-D DCT along the
// other axis, transpose back so `coefficients` ends up in natural order.
// `scratch_space` must hold at least 64 floats.
JXL_INLINE JXL_MAYBE_UNUSED void TransformFromPixels(
    const float* JXL_RESTRICT pixels, size_t pixels_stride,
    float* JXL_RESTRICT coefficients, float* JXL_RESTRICT scratch_space) {
  DCT1D(pixels, pixels_stride, scratch_space);
  Transpose8x8Block(scratch_space, coefficients);
  DCT1D(coefficients, 8, scratch_space);
  Transpose8x8Block(scratch_space, coefficients);
}
// Narrows 32-bit integer lanes to int16_t (Highway DemoteTo, which saturates
// on overflow) and stores them to `out`.
JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
                                                     int16_t* out) {
  Rebind<int16_t, DI> di16;
  Store(DemoteTo(di16, ival), di16, out);
}
// int32_t overload: lanes are already 32-bit, so store them unchanged.
JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
                                                     int32_t* out) {
  DI di;
  Store(ival, di, out);
}
213 template <typename T>
214 void QuantizeBlock(const float* dct, const float* qmc, float aq_strength,
215 const float* zero_bias_offset, const float* zero_bias_mul,
216 T* block) {
217 D d;
218 DI di;
219 const auto aq_mul = Set(d, aq_strength);
220 for (size_t k = 0; k < DCTSIZE2; k += Lanes(d)) {
221 const auto val = Load(d, dct + k);
222 const auto q = Load(d, qmc + k);
223 const auto qval = Mul(val, q);
224 const auto zb_offset = Load(d, zero_bias_offset + k);
225 const auto zb_mul = Load(d, zero_bias_mul + k);
226 const auto threshold = Add(zb_offset, Mul(zb_mul, aq_mul));
227 const auto nzero_mask = Ge(Abs(qval), threshold);
228 const auto ival = ConvertTo(di, IfThenElseZero(nzero_mask, Round(qval)));
229 StoreQuantizedValue(ival, block + k);
233 template <typename T>
234 void ComputeCoefficientBlock(const float* JXL_RESTRICT pixels, size_t stride,
235 const float* JXL_RESTRICT qmc,
236 int16_t last_dc_coeff, float aq_strength,
237 const float* zero_bias_offset,
238 const float* zero_bias_mul,
239 float* JXL_RESTRICT tmp, T* block) {
240 float* JXL_RESTRICT dct = tmp;
241 float* JXL_RESTRICT scratch_space = tmp + DCTSIZE2;
242 TransformFromPixels(pixels, stride, dct, scratch_space);
243 QuantizeBlock(dct, qmc, aq_strength, zero_bias_offset, zero_bias_mul, block);
244 // Center DC values around zero.
245 static constexpr float kDCBias = 128.0f;
246 const float dc = (dct[0] - kDCBias) * qmc[0];
247 float dc_threshold = zero_bias_offset[0] + aq_strength * zero_bias_mul[0];
248 if (std::abs(dc - last_dc_coeff) < dc_threshold) {
249 block[0] = last_dc_coeff;
250 } else {
251 block[0] = std::round(dc);
255 // NOLINTNEXTLINE(google-readability-namespace-comments)
256 } // namespace
257 } // namespace HWY_NAMESPACE
258 } // namespace jpegli
259 HWY_AFTER_NAMESPACE();
260 #endif // LIB_JPEGLI_DCT_INL_H_