// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
6 #if defined(LIB_JPEGLI_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
7 #ifdef LIB_JPEGLI_DCT_INL_H_
8 #undef LIB_JPEGLI_DCT_INL_H_
10 #define LIB_JPEGLI_DCT_INL_H_
13 #include "lib/jpegli/transpose-inl.h"
14 #include "lib/jxl/base/compiler_specific.h"
16 HWY_BEFORE_NAMESPACE();
18 namespace HWY_NAMESPACE
{
21 // These templates are not found via ADL.
22 using hwy::HWY_NAMESPACE::Abs
;
23 using hwy::HWY_NAMESPACE::Add
;
24 using hwy::HWY_NAMESPACE::DemoteTo
;
25 using hwy::HWY_NAMESPACE::Ge
;
26 using hwy::HWY_NAMESPACE::IfThenElseZero
;
27 using hwy::HWY_NAMESPACE::Mul
;
28 using hwy::HWY_NAMESPACE::MulAdd
;
29 using hwy::HWY_NAMESPACE::Rebind
;
30 using hwy::HWY_NAMESPACE::Round
;
31 using hwy::HWY_NAMESPACE::Sub
;
32 using hwy::HWY_NAMESPACE::Vec
;
34 using D
= HWY_FULL(float);
35 using DI
= HWY_FULL(int32_t);
38 void AddReverse(const float* JXL_RESTRICT a_in1
,
39 const float* JXL_RESTRICT a_in2
, float* JXL_RESTRICT a_out
) {
40 HWY_CAPPED(float, 8) d8
;
41 for (size_t i
= 0; i
< N
; i
++) {
42 auto in1
= Load(d8
, a_in1
+ i
* 8);
43 auto in2
= Load(d8
, a_in2
+ (N
- i
- 1) * 8);
44 Store(Add(in1
, in2
), d8
, a_out
+ i
* 8);
49 void SubReverse(const float* JXL_RESTRICT a_in1
,
50 const float* JXL_RESTRICT a_in2
, float* JXL_RESTRICT a_out
) {
51 HWY_CAPPED(float, 8) d8
;
52 for (size_t i
= 0; i
< N
; i
++) {
53 auto in1
= Load(d8
, a_in1
+ i
* 8);
54 auto in2
= Load(d8
, a_in2
+ (N
- i
- 1) * 8);
55 Store(Sub(in1
, in2
), d8
, a_out
+ i
* 8);
60 void B(float* JXL_RESTRICT coeff
) {
61 HWY_CAPPED(float, 8) d8
;
62 constexpr float kSqrt2
= 1.41421356237f
;
63 auto sqrt2
= Set(d8
, kSqrt2
);
64 auto in1
= Load(d8
, coeff
);
65 auto in2
= Load(d8
, coeff
+ 8);
66 Store(MulAdd(in1
, sqrt2
, in2
), d8
, coeff
);
67 for (size_t i
= 1; i
+ 1 < N
; i
++) {
68 auto in1
= Load(d8
, coeff
+ i
* 8);
69 auto in2
= Load(d8
, coeff
+ (i
+ 1) * 8);
70 Store(Add(in1
, in2
), d8
, coeff
+ i
* 8);
74 // Ideally optimized away by compiler (except the multiply).
76 void InverseEvenOdd(const float* JXL_RESTRICT a_in
, float* JXL_RESTRICT a_out
) {
77 HWY_CAPPED(float, 8) d8
;
78 for (size_t i
= 0; i
< N
/ 2; i
++) {
79 auto in1
= Load(d8
, a_in
+ i
* 8);
80 Store(in1
, d8
, a_out
+ 2 * i
* 8);
82 for (size_t i
= N
/ 2; i
< N
; i
++) {
83 auto in1
= Load(d8
, a_in
+ i
* 8);
84 Store(in1
, d8
, a_out
+ (2 * (i
- N
/ 2) + 1) * 8);
// Constants for DCT implementation. Generated by the following snippet:
// for i in range(N // 2):
//    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
template <size_t N>
struct WcMultipliers;

template <>
struct WcMultipliers<4> {
  static constexpr float kMultipliers[] = {
      0.541196100146197f,
      1.3065629648763764f,
  };
};

template <>
struct WcMultipliers<8> {
  static constexpr float kMultipliers[] = {
      0.5097955791041592f,
      0.6013448869350453f,
      0.8999762231364156f,
      2.5629154477415055f,
  };
};

// Before C++17 (no inline variables), ODR-used static constexpr members need
// an out-of-class definition.
#if JXL_CXX_LANG < JXL_CXX_17
constexpr float WcMultipliers<4>::kMultipliers[];
constexpr float WcMultipliers<8>::kMultipliers[];
#endif
117 // Invoked on full vector.
119 void Multiply(float* JXL_RESTRICT coeff
) {
120 HWY_CAPPED(float, 8) d8
;
121 for (size_t i
= 0; i
< N
/ 2; i
++) {
122 auto in1
= Load(d8
, coeff
+ (N
/ 2 + i
) * 8);
123 auto mul
= Set(d8
, WcMultipliers
<N
>::kMultipliers
[i
]);
124 Store(Mul(in1
, mul
), d8
, coeff
+ (N
/ 2 + i
) * 8);
128 void LoadFromBlock(const float* JXL_RESTRICT pixels
, size_t pixels_stride
,
129 size_t off
, float* JXL_RESTRICT coeff
) {
130 HWY_CAPPED(float, 8) d8
;
131 for (size_t i
= 0; i
< 8; i
++) {
132 Store(LoadU(d8
, pixels
+ i
* pixels_stride
+ off
), d8
, coeff
+ i
* 8);
136 void StoreToBlockAndScale(const float* JXL_RESTRICT coeff
, float* output
,
138 HWY_CAPPED(float, 8) d8
;
139 auto mul
= Set(d8
, 1.0f
/ 8);
140 for (size_t i
= 0; i
< 8; i
++) {
141 StoreU(Mul(mul
, Load(d8
, coeff
+ i
* 8)), d8
, output
+ i
* 8 + off
);
149 struct DCT1DImpl
<1> {
150 JXL_INLINE
void operator()(float* JXL_RESTRICT mem
) {}
154 struct DCT1DImpl
<2> {
155 JXL_INLINE
void operator()(float* JXL_RESTRICT mem
) {
156 HWY_CAPPED(float, 8) d8
;
157 auto in1
= Load(d8
, mem
);
158 auto in2
= Load(d8
, mem
+ 8);
159 Store(Add(in1
, in2
), d8
, mem
);
160 Store(Sub(in1
, in2
), d8
, mem
+ 8);
166 void operator()(float* JXL_RESTRICT mem
) {
167 HWY_ALIGN
float tmp
[N
* 8];
168 AddReverse
<N
/ 2>(mem
, mem
+ N
* 4, tmp
);
169 DCT1DImpl
<N
/ 2>()(tmp
);
170 SubReverse
<N
/ 2>(mem
, mem
+ N
* 4, tmp
+ N
* 4);
172 DCT1DImpl
<N
/ 2>()(tmp
+ N
* 4);
173 B
<N
/ 2>(tmp
+ N
* 4);
174 InverseEvenOdd
<N
>(tmp
, mem
);
178 void DCT1D(const float* JXL_RESTRICT pixels
, size_t pixels_stride
,
179 float* JXL_RESTRICT output
) {
180 HWY_CAPPED(float, 8) d8
;
181 HWY_ALIGN
float tmp
[64];
182 for (size_t i
= 0; i
< 8; i
+= Lanes(d8
)) {
183 // TODO(veluca): consider removing the temporary memory here (as is done in
184 // IDCT), if it turns out that some compilers don't optimize away the loads
185 // and this is performance-critical.
186 LoadFromBlock(pixels
, pixels_stride
, i
, tmp
);
188 StoreToBlockAndScale(tmp
, output
, i
);
192 JXL_INLINE JXL_MAYBE_UNUSED
void TransformFromPixels(
193 const float* JXL_RESTRICT pixels
, size_t pixels_stride
,
194 float* JXL_RESTRICT coefficients
, float* JXL_RESTRICT scratch_space
) {
195 DCT1D(pixels
, pixels_stride
, scratch_space
);
196 Transpose8x8Block(scratch_space
, coefficients
);
197 DCT1D(coefficients
, 8, scratch_space
);
198 Transpose8x8Block(scratch_space
, coefficients
);
201 JXL_INLINE JXL_MAYBE_UNUSED
void StoreQuantizedValue(const Vec
<DI
>& ival
,
203 Rebind
<int16_t, DI
> di16
;
204 Store(DemoteTo(di16
, ival
), di16
, out
);
207 JXL_INLINE JXL_MAYBE_UNUSED
void StoreQuantizedValue(const Vec
<DI
>& ival
,
210 Store(ival
, di
, out
);
213 template <typename T
>
214 void QuantizeBlock(const float* dct
, const float* qmc
, float aq_strength
,
215 const float* zero_bias_offset
, const float* zero_bias_mul
,
219 const auto aq_mul
= Set(d
, aq_strength
);
220 for (size_t k
= 0; k
< DCTSIZE2
; k
+= Lanes(d
)) {
221 const auto val
= Load(d
, dct
+ k
);
222 const auto q
= Load(d
, qmc
+ k
);
223 const auto qval
= Mul(val
, q
);
224 const auto zb_offset
= Load(d
, zero_bias_offset
+ k
);
225 const auto zb_mul
= Load(d
, zero_bias_mul
+ k
);
226 const auto threshold
= Add(zb_offset
, Mul(zb_mul
, aq_mul
));
227 const auto nzero_mask
= Ge(Abs(qval
), threshold
);
228 const auto ival
= ConvertTo(di
, IfThenElseZero(nzero_mask
, Round(qval
)));
229 StoreQuantizedValue(ival
, block
+ k
);
233 template <typename T
>
234 void ComputeCoefficientBlock(const float* JXL_RESTRICT pixels
, size_t stride
,
235 const float* JXL_RESTRICT qmc
,
236 int16_t last_dc_coeff
, float aq_strength
,
237 const float* zero_bias_offset
,
238 const float* zero_bias_mul
,
239 float* JXL_RESTRICT tmp
, T
* block
) {
240 float* JXL_RESTRICT dct
= tmp
;
241 float* JXL_RESTRICT scratch_space
= tmp
+ DCTSIZE2
;
242 TransformFromPixels(pixels
, stride
, dct
, scratch_space
);
243 QuantizeBlock(dct
, qmc
, aq_strength
, zero_bias_offset
, zero_bias_mul
, block
);
244 // Center DC values around zero.
245 static constexpr float kDCBias
= 128.0f
;
246 const float dc
= (dct
[0] - kDCBias
) * qmc
[0];
247 float dc_threshold
= zero_bias_offset
[0] + aq_strength
* zero_bias_mul
[0];
248 if (std::abs(dc
- last_dc_coeff
) < dc_threshold
) {
249 block
[0] = last_dc_coeff
;
251 block
[0] = std::round(dc
);
255 // NOLINTNEXTLINE(google-readability-namespace-comments)
257 } // namespace HWY_NAMESPACE
258 } // namespace jpegli
259 HWY_AFTER_NAMESPACE();
260 #endif // LIB_JPEGLI_DCT_INL_H_