4 * This file was part of the Independent JPEG Group's software:
5 * Copyright (C) 1994-1998, Thomas G. Lane.
6 * Modified 2010 by Guido Vollbeding.
7 * libjpeg-turbo Modifications:
8 * Copyright (C) 2014, D. R. Commander.
9 * For conditions of distribution and use, see the accompanying README.ijg
12 * This file contains a floating-point implementation of the
13 * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
14 * must also perform dequantization of the input coefficients.
16 * This implementation should be more accurate than either of the integer
17 * IDCT implementations. However, it may not give the same results on all
18 * machines because of differences in roundoff behavior. Speed will depend
19 * on the hardware's floating point capacity.
21 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
22 * on each row (or vice versa, but it's more convenient to emit a row at
23 * a time). Direct algorithms are also available, but they are much more
24 * complex and seem not to be any faster when reduced to code.
26 * This implementation is based on Arai, Agui, and Nakajima's algorithm for
27 * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
28 * Japanese, but the algorithm is described in the Pennebaker & Mitchell
29 * JPEG textbook (see REFERENCES section in file README.ijg). The following
30 * code is based directly on figure 4-8 in P&M.
31 * While an 8-point DCT cannot be done in less than 11 multiplies, it is
32 * possible to arrange the computation so that many of the multiplies are
33 * simple scalings of the final outputs. These multiplies can then be
34 * folded into the multiplications or divisions by the JPEG quantization
35 * table entries. The AA&N method leaves only 5 multiplies and 29 adds
36 * to be done in the DCT itself.
37 * The primary disadvantage of this method is that with a fixed-point
38 * implementation, accuracy is lost due to imprecise representation of the
39 * scaled quantization values. However, that problem does not arise if
40 * we use floating point arithmetic.
43 #define JPEG_INTERNALS
46 #include "jdct.h" /* Private declarations for DCT subsystem */
48 #ifdef DCT_FLOAT_SUPPORTED
52 * This module is specialized to the case DCTSIZE = 8.
56 Sorry
, this code only copes with
8x8 DCTs
. /* deliberate syntax err */
60 /* Dequantize a coefficient by multiplying it by the multiplier-table
61 * entry; produce a float result.
64 #define DEQUANTIZE(coef, quantval) (((FAST_FLOAT)(coef)) * (quantval))
68 * Perform dequantization and inverse DCT on one block of coefficients.
72 jpeg_idct_float(j_decompress_ptr cinfo
, jpeg_component_info
*compptr
,
73 JCOEFPTR coef_block
, JSAMPARRAY output_buf
,
74 JDIMENSION output_col
)
76 FAST_FLOAT tmp0
, tmp1
, tmp2
, tmp3
, tmp4
, tmp5
, tmp6
, tmp7
;
77 FAST_FLOAT tmp10
, tmp11
, tmp12
, tmp13
;
78 FAST_FLOAT z5
, z10
, z11
, z12
, z13
;
80 FLOAT_MULT_TYPE
*quantptr
;
83 JSAMPLE
*range_limit
= cinfo
->sample_range_limit
;
85 FAST_FLOAT workspace
[DCTSIZE2
]; /* buffers data between passes */
86 #define _0_125 ((FLOAT_MULT_TYPE)0.125)
88 /* Pass 1: process columns from input, store into work array. */
91 quantptr
= (FLOAT_MULT_TYPE
*)compptr
->dct_table
;
93 for (ctr
= DCTSIZE
; ctr
> 0; ctr
--) {
94 /* Due to quantization, we will usually find that many of the input
95 * coefficients are zero, especially the AC terms. We can exploit this
96 * by short-circuiting the IDCT calculation for any column in which all
97 * the AC terms are zero. In that case each output is equal to the
98 * DC coefficient (with scale factor as needed).
99 * With typical images and quantization tables, half or more of the
100 * column DCT calculations can be simplified this way.
103 if (inptr
[DCTSIZE
* 1] == 0 && inptr
[DCTSIZE
* 2] == 0 &&
104 inptr
[DCTSIZE
* 3] == 0 && inptr
[DCTSIZE
* 4] == 0 &&
105 inptr
[DCTSIZE
* 5] == 0 && inptr
[DCTSIZE
* 6] == 0 &&
106 inptr
[DCTSIZE
* 7] == 0) {
107 /* AC terms all zero */
108 FAST_FLOAT dcval
= DEQUANTIZE(inptr
[DCTSIZE
* 0],
109 quantptr
[DCTSIZE
* 0] * _0_125
);
111 wsptr
[DCTSIZE
* 0] = dcval
;
112 wsptr
[DCTSIZE
* 1] = dcval
;
113 wsptr
[DCTSIZE
* 2] = dcval
;
114 wsptr
[DCTSIZE
* 3] = dcval
;
115 wsptr
[DCTSIZE
* 4] = dcval
;
116 wsptr
[DCTSIZE
* 5] = dcval
;
117 wsptr
[DCTSIZE
* 6] = dcval
;
118 wsptr
[DCTSIZE
* 7] = dcval
;
120 inptr
++; /* advance pointers to next column */
128 tmp0
= DEQUANTIZE(inptr
[DCTSIZE
* 0], quantptr
[DCTSIZE
* 0] * _0_125
);
129 tmp1
= DEQUANTIZE(inptr
[DCTSIZE
* 2], quantptr
[DCTSIZE
* 2] * _0_125
);
130 tmp2
= DEQUANTIZE(inptr
[DCTSIZE
* 4], quantptr
[DCTSIZE
* 4] * _0_125
);
131 tmp3
= DEQUANTIZE(inptr
[DCTSIZE
* 6], quantptr
[DCTSIZE
* 6] * _0_125
);
133 tmp10
= tmp0
+ tmp2
; /* phase 3 */
136 tmp13
= tmp1
+ tmp3
; /* phases 5-3 */
137 tmp12
= (tmp1
- tmp3
) * ((FAST_FLOAT
)1.414213562) - tmp13
; /* 2*c4 */
139 tmp0
= tmp10
+ tmp13
; /* phase 2 */
140 tmp3
= tmp10
- tmp13
;
141 tmp1
= tmp11
+ tmp12
;
142 tmp2
= tmp11
- tmp12
;
146 tmp4
= DEQUANTIZE(inptr
[DCTSIZE
* 1], quantptr
[DCTSIZE
* 1] * _0_125
);
147 tmp5
= DEQUANTIZE(inptr
[DCTSIZE
* 3], quantptr
[DCTSIZE
* 3] * _0_125
);
148 tmp6
= DEQUANTIZE(inptr
[DCTSIZE
* 5], quantptr
[DCTSIZE
* 5] * _0_125
);
149 tmp7
= DEQUANTIZE(inptr
[DCTSIZE
* 7], quantptr
[DCTSIZE
* 7] * _0_125
);
151 z13
= tmp6
+ tmp5
; /* phase 6 */
156 tmp7
= z11
+ z13
; /* phase 5 */
157 tmp11
= (z11
- z13
) * ((FAST_FLOAT
)1.414213562); /* 2*c4 */
159 z5
= (z10
+ z12
) * ((FAST_FLOAT
)1.847759065); /* 2*c2 */
160 tmp10
= z5
- z12
* ((FAST_FLOAT
)1.082392200); /* 2*(c2-c6) */
161 tmp12
= z5
- z10
* ((FAST_FLOAT
)2.613125930); /* 2*(c2+c6) */
163 tmp6
= tmp12
- tmp7
; /* phase 2 */
167 wsptr
[DCTSIZE
* 0] = tmp0
+ tmp7
;
168 wsptr
[DCTSIZE
* 7] = tmp0
- tmp7
;
169 wsptr
[DCTSIZE
* 1] = tmp1
+ tmp6
;
170 wsptr
[DCTSIZE
* 6] = tmp1
- tmp6
;
171 wsptr
[DCTSIZE
* 2] = tmp2
+ tmp5
;
172 wsptr
[DCTSIZE
* 5] = tmp2
- tmp5
;
173 wsptr
[DCTSIZE
* 3] = tmp3
+ tmp4
;
174 wsptr
[DCTSIZE
* 4] = tmp3
- tmp4
;
176 inptr
++; /* advance pointers to next column */
181 /* Pass 2: process rows from work array, store into output array. */
184 for (ctr
= 0; ctr
< DCTSIZE
; ctr
++) {
185 outptr
= output_buf
[ctr
] + output_col
;
186 /* Rows of zeroes can be exploited in the same way as we did with columns.
187 * However, the column calculation has created many nonzero AC terms, so
188 * the simplification applies less often (typically 5% to 10% of the time).
189 * And testing floats for zero is relatively expensive, so we don't bother.
194 /* Apply signed->unsigned and prepare float->int conversion */
195 z5
= wsptr
[0] + ((FAST_FLOAT
)CENTERJSAMPLE
+ (FAST_FLOAT
)0.5);
196 tmp10
= z5
+ wsptr
[4];
197 tmp11
= z5
- wsptr
[4];
199 tmp13
= wsptr
[2] + wsptr
[6];
200 tmp12
= (wsptr
[2] - wsptr
[6]) * ((FAST_FLOAT
)1.414213562) - tmp13
;
202 tmp0
= tmp10
+ tmp13
;
203 tmp3
= tmp10
- tmp13
;
204 tmp1
= tmp11
+ tmp12
;
205 tmp2
= tmp11
- tmp12
;
209 z13
= wsptr
[5] + wsptr
[3];
210 z10
= wsptr
[5] - wsptr
[3];
211 z11
= wsptr
[1] + wsptr
[7];
212 z12
= wsptr
[1] - wsptr
[7];
215 tmp11
= (z11
- z13
) * ((FAST_FLOAT
)1.414213562);
217 z5
= (z10
+ z12
) * ((FAST_FLOAT
)1.847759065); /* 2*c2 */
218 tmp10
= z5
- z12
* ((FAST_FLOAT
)1.082392200); /* 2*(c2-c6) */
219 tmp12
= z5
- z10
* ((FAST_FLOAT
)2.613125930); /* 2*(c2+c6) */
225 /* Final output stage: float->int conversion and range-limit */
227 outptr
[0] = range_limit
[((int)(tmp0
+ tmp7
)) & RANGE_MASK
];
228 outptr
[7] = range_limit
[((int)(tmp0
- tmp7
)) & RANGE_MASK
];
229 outptr
[1] = range_limit
[((int)(tmp1
+ tmp6
)) & RANGE_MASK
];
230 outptr
[6] = range_limit
[((int)(tmp1
- tmp6
)) & RANGE_MASK
];
231 outptr
[2] = range_limit
[((int)(tmp2
+ tmp5
)) & RANGE_MASK
];
232 outptr
[5] = range_limit
[((int)(tmp2
- tmp5
)) & RANGE_MASK
];
233 outptr
[3] = range_limit
[((int)(tmp3
+ tmp4
)) & RANGE_MASK
];
234 outptr
[4] = range_limit
[((int)(tmp3
- tmp4
)) & RANGE_MASK
];
236 wsptr
+= DCTSIZE
; /* advance pointer to next row */
240 #endif /* DCT_FLOAT_SUPPORTED */