2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
11 #include "vpx_dsp/fwd_txfm.h"
// 4x4 forward transform (fdct, per the function name).
//   input  - 16-bit source samples; rows are `stride` elements apart.
//   output - destination for the 16 coefficients.
//   stride - element pitch between consecutive rows of `input`.
// Visible steps: samples are scaled by 16 on load, each 4-point transform is
// a butterfly followed by cospi_* multiplies rounded with fdct_round_shift(),
// and finally every coefficient in `output` is replaced by (coeff + 1) >> 2.
// NOTE(review): this excerpt is missing lines (declarations of pass/i/j, the
// load path that reads from `in`, pointer advances, the body of the `if`
// below, closing braces). Restore them from the original file before building.
13 void vpx_fdct4x4_c(const int16_t *input
, tran_low_t
*output
, int stride
) {
14 // The 2D transform is done with two passes which are actually pretty
15 // similar. In the first one, we transform the columns and transpose
16 // the results. In the second one, we transform the rows. To achieve that,
17 // as the first pass results are transposed, we transpose the columns (that
18 // is the transposed rows) and transpose the results (so that it goes back
19 // in normal/row positions).
21 // We need an intermediate buffer between passes.
22 tran_low_t intermediate
[4 * 4];
23 const int16_t *in_pass0
= input
;
24 const tran_low_t
*in
= NULL
;
25 tran_low_t
*out
= intermediate
;
26 // Do the two transform/transpose passes
27 for (pass
= 0; pass
< 2; ++pass
) {
// NOTE: this local array shadows the `input` parameter inside the loop; the
// source samples are reached through in_pass0 instead.
28 tran_high_t input
[4]; // canbe16
29 tran_high_t step
[4]; // canbe16
30 tran_high_t temp1
, temp2
; // needs32
32 for (i
= 0; i
< 4; ++i
) {
// Load four samples spaced `stride` apart through in_pass0, scaled by 16.
35 input
[0] = in_pass0
[0 * stride
] * 16;
36 input
[1] = in_pass0
[1 * stride
] * 16;
37 input
[2] = in_pass0
[2 * stride
] * 16;
38 input
[3] = in_pass0
[3 * stride
] * 16;
// Special case for the first iteration when the first sample is non-zero.
// NOTE(review): the guarded statement is not visible in this excerpt.
39 if (i
== 0 && input
[0]) {
// Butterfly: sums and differences of the mirrored inputs.
49 step
[0] = input
[0] + input
[3];
50 step
[1] = input
[1] + input
[2];
51 step
[2] = input
[1] - input
[2];
52 step
[3] = input
[0] - input
[3];
// out[0] and out[2] come from the sum terms, scaled by cospi_16_64.
53 temp1
= (step
[0] + step
[1]) * cospi_16_64
;
54 temp2
= (step
[0] - step
[1]) * cospi_16_64
;
55 out
[0] = (tran_low_t
)fdct_round_shift(temp1
);
56 out
[2] = (tran_low_t
)fdct_round_shift(temp2
);
// out[1] and out[3] come from rotating the difference terms with
// cospi_8_64 / cospi_24_64.
57 temp1
= step
[2] * cospi_24_64
+ step
[3] * cospi_8_64
;
58 temp2
= -step
[2] * cospi_8_64
+ step
[3] * cospi_24_64
;
59 out
[1] = (tran_low_t
)fdct_round_shift(temp1
);
60 out
[3] = (tran_low_t
)fdct_round_shift(temp2
);
61 // Do next column (which is a transposed row in second/horizontal pass)
66 // Setup in/out for next pass.
// Final rounding: each of the 16 coefficients becomes (coeff + 1) >> 2.
73 for (i
= 0; i
< 4; ++i
) {
74 for (j
= 0; j
< 4; ++j
)
75 output
[j
+ i
* 4] = (output
[j
+ i
* 4] + 1) >> 2;
// DC-only variant for a 4x4 block: accumulates all 16 samples of `input`
// (row pitch `stride`) into `sum`.
// NOTE(review): the declarations of r, c and sum and the statement(s) that
// store the result into `output` are not visible in this excerpt.
80 void vpx_fdct4x4_1_c(const int16_t *input
, tran_low_t
*output
, int stride
) {
83 for (r
= 0; r
< 4; ++r
)
84 for (c
= 0; c
< 4; ++c
)
85 sum
+= input
[r
* stride
+ c
];
// 8x8 forward transform (fdct, per the function name).
//   input        - 16-bit source samples; rows are `stride` elements apart.
//   final_output - destination for the 64 coefficients.
//   stride       - element pitch between consecutive rows of `input`.
// Two passes run over the data; `output` starts out pointing at the local
// `intermediate` buffer and is later redirected to `final_output`. After the
// passes every coefficient in `final_output` is divided by 2.
// NOTE(review): this excerpt is missing lines (declarations of pass/i/j, the
// branch that selects between the two load sequences, the x0..x3
// assignments, pointer advances, closing braces). Restore them from the
// original file before building.
91 void vpx_fdct8x8_c(const int16_t *input
, tran_low_t
*final_output
, int stride
) {
93 tran_low_t intermediate
[64];
95 tran_low_t
*output
= intermediate
;
96 const tran_low_t
*in
= NULL
;
99 for (pass
= 0; pass
< 2; ++pass
) {
100 tran_high_t s0
, s1
, s2
, s3
, s4
, s5
, s6
, s7
; // canbe16
101 tran_high_t t0
, t1
, t2
, t3
; // needs32
102 tran_high_t x0
, x1
, x2
, x3
; // canbe16
105 for (i
= 0; i
< 8; i
++) {
// Load variant reading `input` directly: mirrored rows (k and 7 - k) are
// added / subtracted and the result is scaled by 4.
108 s0
= (input
[0 * stride
] + input
[7 * stride
]) * 4;
109 s1
= (input
[1 * stride
] + input
[6 * stride
]) * 4;
110 s2
= (input
[2 * stride
] + input
[5 * stride
]) * 4;
111 s3
= (input
[3 * stride
] + input
[4 * stride
]) * 4;
112 s4
= (input
[3 * stride
] - input
[4 * stride
]) * 4;
113 s5
= (input
[2 * stride
] - input
[5 * stride
]) * 4;
114 s6
= (input
[1 * stride
] - input
[6 * stride
]) * 4;
115 s7
= (input
[0 * stride
] - input
[7 * stride
]) * 4;
// Load variant reading `in` with a row pitch of 8 and no scaling.
// NOTE(review): the condition choosing between the two variants is not
// visible in this excerpt.
118 s0
= in
[0 * 8] + in
[7 * 8];
119 s1
= in
[1 * 8] + in
[6 * 8];
120 s2
= in
[2 * 8] + in
[5 * 8];
121 s3
= in
[3 * 8] + in
[4 * 8];
122 s4
= in
[3 * 8] - in
[4 * 8];
123 s5
= in
[2 * 8] - in
[5 * 8];
124 s6
= in
[1 * 8] - in
[6 * 8];
125 s7
= in
[0 * 8] - in
[7 * 8];
129 // fdct4(step, step);
// Even-indexed outputs 0, 2, 4, 6 are produced from x0..x3.
// NOTE(review): the assignments to x0..x3 are not visible in this excerpt.
134 t0
= (x0
+ x1
) * cospi_16_64
;
135 t1
= (x0
- x1
) * cospi_16_64
;
136 t2
= x2
* cospi_24_64
+ x3
* cospi_8_64
;
137 t3
= -x2
* cospi_8_64
+ x3
* cospi_24_64
;
138 output
[0] = (tran_low_t
)fdct_round_shift(t0
);
139 output
[2] = (tran_low_t
)fdct_round_shift(t2
);
140 output
[4] = (tran_low_t
)fdct_round_shift(t1
);
141 output
[6] = (tran_low_t
)fdct_round_shift(t3
);
// Rotate the s5/s6 pair by cospi_16_64; the rounded results land in t2/t3.
144 t0
= (s6
- s5
) * cospi_16_64
;
145 t1
= (s6
+ s5
) * cospi_16_64
;
146 t2
= fdct_round_shift(t0
);
147 t3
= fdct_round_shift(t1
);
// Odd-indexed outputs 1, 3, 5, 7 from x0..x3 (reassigned on lines not
// visible here) using the cospi_4/28 and cospi_12/20 pairs.
156 t0
= x0
* cospi_28_64
+ x3
* cospi_4_64
;
157 t1
= x1
* cospi_12_64
+ x2
* cospi_20_64
;
158 t2
= x2
* cospi_12_64
+ x1
* -cospi_20_64
;
159 t3
= x3
* cospi_28_64
+ x0
* -cospi_4_64
;
160 output
[1] = (tran_low_t
)fdct_round_shift(t0
);
161 output
[3] = (tran_low_t
)fdct_round_shift(t2
);
162 output
[5] = (tran_low_t
)fdct_round_shift(t1
);
163 output
[7] = (tran_low_t
)fdct_round_shift(t3
);
// Redirect the write pointer to the caller's buffer.
167 output
= final_output
;
// Halve every coefficient; integer `/` truncates toward zero.
171 for (i
= 0; i
< 8; ++i
) {
172 for (j
= 0; j
< 8; ++j
)
173 final_output
[j
+ i
* 8] /= 2;
// DC-only variant for an 8x8 block: accumulates all 64 samples of `input`
// (row pitch `stride`) into `sum`.
// NOTE(review): the declarations of r, c and sum and the statement(s) that
// store the result into `output` are not visible in this excerpt.
177 void vpx_fdct8x8_1_c(const int16_t *input
, tran_low_t
*output
, int stride
) {
180 for (r
= 0; r
< 8; ++r
)
181 for (c
= 0; c
< 8; ++c
)
182 sum
+= input
[r
* stride
+ c
];
// 16x16 forward transform (fdct, per the function name).
//   input  - 16-bit source samples; rows are `stride` elements apart.
//   output - destination buffer for the coefficients.
//   stride - element pitch between consecutive rows of `input`.
// Each 16-point transform is split in two halves: the sums of mirrored
// inputs go through an 8-point transform that fills out[0], out[2], ...,
// out[14]; the differences (step1) go through the odd half that fills
// out[1], out[3], ..., out[15].
// NOTE(review): this excerpt is missing lines (declarations of pass/i, the
// branch that selects between the two load sequences, the x0..x3
// assignments, pointer advances, closing braces). Restore them from the
// original file before building.
188 void vpx_fdct16x16_c(const int16_t *input
, tran_low_t
*output
, int stride
) {
189 // The 2D transform is done with two passes which are actually pretty
190 // similar. In the first one, we transform the columns and transpose
191 // the results. In the second one, we transform the rows. To achieve that,
192 // as the first pass results are transposed, we transpose the columns (that
193 // is the transposed rows) and transpose the results (so that it goes back
194 // in normal/row positions).
196 // We need an intermediate buffer between passes.
197 tran_low_t intermediate
[256];
198 const int16_t *in_pass0
= input
;
199 const tran_low_t
*in
= NULL
;
200 tran_low_t
*out
= intermediate
;
201 // Do the two transform/transpose passes
202 for (pass
= 0; pass
< 2; ++pass
) {
203 tran_high_t step1
[8]; // canbe16
204 tran_high_t step2
[8]; // canbe16
205 tran_high_t step3
[8]; // canbe16
// NOTE: this local array shadows the `input` parameter inside the loop; the
// source samples are reached through in_pass0 instead.
206 tran_high_t input
[8]; // canbe16
207 tran_high_t temp1
, temp2
; // needs32
209 for (i
= 0; i
< 16; i
++) {
// Load variant reading in_pass0: mirrored rows (k and 15 - k) are added /
// subtracted and the result is scaled by 4.
211 // Calculate input for the first 8 results.
212 input
[0] = (in_pass0
[0 * stride
] + in_pass0
[15 * stride
]) * 4;
213 input
[1] = (in_pass0
[1 * stride
] + in_pass0
[14 * stride
]) * 4;
214 input
[2] = (in_pass0
[2 * stride
] + in_pass0
[13 * stride
]) * 4;
215 input
[3] = (in_pass0
[3 * stride
] + in_pass0
[12 * stride
]) * 4;
216 input
[4] = (in_pass0
[4 * stride
] + in_pass0
[11 * stride
]) * 4;
217 input
[5] = (in_pass0
[5 * stride
] + in_pass0
[10 * stride
]) * 4;
218 input
[6] = (in_pass0
[6 * stride
] + in_pass0
[ 9 * stride
]) * 4;
219 input
[7] = (in_pass0
[7 * stride
] + in_pass0
[ 8 * stride
]) * 4;
220 // Calculate input for the next 8 results.
221 step1
[0] = (in_pass0
[7 * stride
] - in_pass0
[ 8 * stride
]) * 4;
222 step1
[1] = (in_pass0
[6 * stride
] - in_pass0
[ 9 * stride
]) * 4;
223 step1
[2] = (in_pass0
[5 * stride
] - in_pass0
[10 * stride
]) * 4;
224 step1
[3] = (in_pass0
[4 * stride
] - in_pass0
[11 * stride
]) * 4;
225 step1
[4] = (in_pass0
[3 * stride
] - in_pass0
[12 * stride
]) * 4;
226 step1
[5] = (in_pass0
[2 * stride
] - in_pass0
[13 * stride
]) * 4;
227 step1
[6] = (in_pass0
[1 * stride
] - in_pass0
[14 * stride
]) * 4;
228 step1
[7] = (in_pass0
[0 * stride
] - in_pass0
[15 * stride
]) * 4;
// Load variant reading `in` with a row pitch of 16; every value is rounded
// with (x + 1) >> 2 before the add / subtract.
// NOTE(review): the condition choosing between the two variants is not
// visible in this excerpt.
230 // Calculate input for the first 8 results.
231 input
[0] = ((in
[0 * 16] + 1) >> 2) + ((in
[15 * 16] + 1) >> 2);
232 input
[1] = ((in
[1 * 16] + 1) >> 2) + ((in
[14 * 16] + 1) >> 2);
233 input
[2] = ((in
[2 * 16] + 1) >> 2) + ((in
[13 * 16] + 1) >> 2);
234 input
[3] = ((in
[3 * 16] + 1) >> 2) + ((in
[12 * 16] + 1) >> 2);
235 input
[4] = ((in
[4 * 16] + 1) >> 2) + ((in
[11 * 16] + 1) >> 2);
236 input
[5] = ((in
[5 * 16] + 1) >> 2) + ((in
[10 * 16] + 1) >> 2);
237 input
[6] = ((in
[6 * 16] + 1) >> 2) + ((in
[ 9 * 16] + 1) >> 2);
238 input
[7] = ((in
[7 * 16] + 1) >> 2) + ((in
[ 8 * 16] + 1) >> 2);
239 // Calculate input for the next 8 results.
240 step1
[0] = ((in
[7 * 16] + 1) >> 2) - ((in
[ 8 * 16] + 1) >> 2);
241 step1
[1] = ((in
[6 * 16] + 1) >> 2) - ((in
[ 9 * 16] + 1) >> 2);
242 step1
[2] = ((in
[5 * 16] + 1) >> 2) - ((in
[10 * 16] + 1) >> 2);
243 step1
[3] = ((in
[4 * 16] + 1) >> 2) - ((in
[11 * 16] + 1) >> 2);
244 step1
[4] = ((in
[3 * 16] + 1) >> 2) - ((in
[12 * 16] + 1) >> 2);
245 step1
[5] = ((in
[2 * 16] + 1) >> 2) - ((in
[13 * 16] + 1) >> 2);
246 step1
[6] = ((in
[1 * 16] + 1) >> 2) - ((in
[14 * 16] + 1) >> 2);
247 step1
[7] = ((in
[0 * 16] + 1) >> 2) - ((in
[15 * 16] + 1) >> 2);
249 // Work on the first eight values; fdct8(input, even_results);
251 tran_high_t s0
, s1
, s2
, s3
, s4
, s5
, s6
, s7
; // canbe16
252 tran_high_t t0
, t1
, t2
, t3
; // needs32
253 tran_high_t x0
, x1
, x2
, x3
; // canbe16
// Butterfly on the eight summed inputs.
256 s0
= input
[0] + input
[7];
257 s1
= input
[1] + input
[6];
258 s2
= input
[2] + input
[5];
259 s3
= input
[3] + input
[4];
260 s4
= input
[3] - input
[4];
261 s5
= input
[2] - input
[5];
262 s6
= input
[1] - input
[6];
263 s7
= input
[0] - input
[7];
265 // fdct4(step, step);
// out[0], out[4], out[8], out[12] are produced from x0..x3.
// NOTE(review): the assignments to x0..x3 are not visible in this excerpt.
270 t0
= (x0
+ x1
) * cospi_16_64
;
271 t1
= (x0
- x1
) * cospi_16_64
;
272 t2
= x3
* cospi_8_64
+ x2
* cospi_24_64
;
273 t3
= x3
* cospi_24_64
- x2
* cospi_8_64
;
274 out
[0] = (tran_low_t
)fdct_round_shift(t0
);
275 out
[4] = (tran_low_t
)fdct_round_shift(t2
);
276 out
[8] = (tran_low_t
)fdct_round_shift(t1
);
277 out
[12] = (tran_low_t
)fdct_round_shift(t3
);
// Rotate the s5/s6 pair by cospi_16_64; the rounded results land in t2/t3.
280 t0
= (s6
- s5
) * cospi_16_64
;
281 t1
= (s6
+ s5
) * cospi_16_64
;
282 t2
= fdct_round_shift(t0
);
283 t3
= fdct_round_shift(t1
);
// out[2], out[6], out[10], out[14] from x0..x3 (reassigned on lines not
// visible here) using the cospi_4/28 and cospi_12/20 pairs.
292 t0
= x0
* cospi_28_64
+ x3
* cospi_4_64
;
293 t1
= x1
* cospi_12_64
+ x2
* cospi_20_64
;
294 t2
= x2
* cospi_12_64
+ x1
* -cospi_20_64
;
295 t3
= x3
* cospi_28_64
+ x0
* -cospi_4_64
;
296 out
[2] = (tran_low_t
)fdct_round_shift(t0
);
297 out
[6] = (tran_low_t
)fdct_round_shift(t2
);
298 out
[10] = (tran_low_t
)fdct_round_shift(t1
);
299 out
[14] = (tran_low_t
)fdct_round_shift(t3
);
301 // Work on the next eight values; step1 -> odd_results
// Rotate the inner four differences (step1[2..5]) by cospi_16_64.
304 temp1
= (step1
[5] - step1
[2]) * cospi_16_64
;
305 temp2
= (step1
[4] - step1
[3]) * cospi_16_64
;
306 step2
[2] = fdct_round_shift(temp1
);
307 step2
[3] = fdct_round_shift(temp2
);
308 temp1
= (step1
[4] + step1
[3]) * cospi_16_64
;
309 temp2
= (step1
[5] + step1
[2]) * cospi_16_64
;
310 step2
[4] = fdct_round_shift(temp1
);
311 step2
[5] = fdct_round_shift(temp2
);
// Combine the untouched outer differences with the rotated inner ones.
313 step3
[0] = step1
[0] + step2
[3];
314 step3
[1] = step1
[1] + step2
[2];
315 step3
[2] = step1
[1] - step2
[2];
316 step3
[3] = step1
[0] - step2
[3];
317 step3
[4] = step1
[7] - step2
[4];
318 step3
[5] = step1
[6] - step2
[5];
319 step3
[6] = step1
[6] + step2
[5];
320 step3
[7] = step1
[7] + step2
[4];
// Rotate step3[1], [2], [5], [6] with the cospi_8_64 / cospi_24_64 pair.
322 temp1
= step3
[1] * -cospi_8_64
+ step3
[6] * cospi_24_64
;
323 temp2
= step3
[2] * cospi_24_64
+ step3
[5] * cospi_8_64
;
324 step2
[1] = fdct_round_shift(temp1
);
325 step2
[2] = fdct_round_shift(temp2
);
326 temp1
= step3
[2] * cospi_8_64
- step3
[5] * cospi_24_64
;
327 temp2
= step3
[1] * cospi_24_64
+ step3
[6] * cospi_8_64
;
328 step2
[5] = fdct_round_shift(temp1
);
329 step2
[6] = fdct_round_shift(temp2
);
// Last butterfly before the output rotations; step1 is reused as scratch.
331 step1
[0] = step3
[0] + step2
[1];
332 step1
[1] = step3
[0] - step2
[1];
333 step1
[2] = step3
[3] + step2
[2];
334 step1
[3] = step3
[3] - step2
[2];
335 step1
[4] = step3
[4] - step2
[5];
336 step1
[5] = step3
[4] + step2
[5];
337 step1
[6] = step3
[7] - step2
[6];
338 step1
[7] = step3
[7] + step2
[6];
// Odd-indexed outputs: each pair (step1[k], step1[7 - k]) is rotated by a
// cospi pair and rounded.
340 temp1
= step1
[0] * cospi_30_64
+ step1
[7] * cospi_2_64
;
341 temp2
= step1
[1] * cospi_14_64
+ step1
[6] * cospi_18_64
;
342 out
[1] = (tran_low_t
)fdct_round_shift(temp1
);
343 out
[9] = (tran_low_t
)fdct_round_shift(temp2
);
344 temp1
= step1
[2] * cospi_22_64
+ step1
[5] * cospi_10_64
;
345 temp2
= step1
[3] * cospi_6_64
+ step1
[4] * cospi_26_64
;
346 out
[5] = (tran_low_t
)fdct_round_shift(temp1
);
347 out
[13] = (tran_low_t
)fdct_round_shift(temp2
);
348 temp1
= step1
[3] * -cospi_26_64
+ step1
[4] * cospi_6_64
;
349 temp2
= step1
[2] * -cospi_10_64
+ step1
[5] * cospi_22_64
;
350 out
[3] = (tran_low_t
)fdct_round_shift(temp1
);
351 out
[11] = (tran_low_t
)fdct_round_shift(temp2
);
352 temp1
= step1
[1] * -cospi_18_64
+ step1
[6] * cospi_14_64
;
353 temp2
= step1
[0] * -cospi_2_64
+ step1
[7] * cospi_30_64
;
354 out
[7] = (tran_low_t
)fdct_round_shift(temp1
);
355 out
[15] = (tran_low_t
)fdct_round_shift(temp2
);
357 // Do next column (which is a transposed row in second/horizontal pass)
362 // Setup in/out for next pass.
// DC-only variant for a 16x16 block: accumulates all 256 samples of `input`
// (row pitch `stride`) into `sum` and stores sum >> 1 in output[0].
// NOTE(review): the declarations of r, c and sum, and any statements after
// the output[0] store, are not visible in this excerpt.
368 void vpx_fdct16x16_1_c(const int16_t *input
, tran_low_t
*output
, int stride
) {
371 for (r
= 0; r
< 16; ++r
)
372 for (c
= 0; c
< 16; ++c
)
373 sum
+= input
[r
* stride
+ c
];
375 output
[0] = sum
>> 1;
// Rounding helper used by the 32-point transform: computes
// ROUND_POWER_OF_TWO(input, DCT_CONST_BITS) into rv.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably rv is returned - confirm against the original file.
379 static INLINE tran_high_t
dct_32_round(tran_high_t input
) {
380 tran_high_t rv
= ROUND_POWER_OF_TWO(input
, DCT_CONST_BITS
);
381 // TODO(debargha, peter.derivaz): Find new bounds for this assert,
382 // and make the bounds consts.
383 // assert(-131072 <= rv && rv <= 131071);
// Divides `input` by 4 with rounding: rv = (input + 1 + (input < 0)) >> 2.
// The (input < 0) term adds an extra 1 to the bias for negative inputs.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably rv is returned - confirm against the original file.
387 static INLINE tran_high_t
half_round_shift(tran_high_t input
) {
388 tran_high_t rv
= (input
+ 1 + (input
< 0)) >> 2;
// 32-point forward transform (fdct, per the function name).
//   input  - 32 source values.
//   output - 32 results; also used as working storage between stages.
//   round  - NOTE(review): the code that reads `round` is not visible in this
//            excerpt; presumably it gates the half_round_shift() block
//            below - confirm against the original file.
// The computation alternates between the local `step` array and `output`:
// each stage reads one and writes the other. The final stage stores the
// results in bit-reversed index order.
// NOTE(review): this excerpt is missing lines (stage markers, several plain
// copy assignments such as those for step[8]/step[9] and output[0..3], the
// closing brace). Restore them from the original file before building.
392 void vpx_fdct32(const tran_high_t
*input
, tran_high_t
*output
, int round
) {
393 tran_high_t step
[32];
// input -> step: step[k] = input[k] + input[31 - k] for k < 16 and
// input[31 - k] - input[k] for k >= 16.
395 step
[0] = input
[0] + input
[(32 - 1)];
396 step
[1] = input
[1] + input
[(32 - 2)];
397 step
[2] = input
[2] + input
[(32 - 3)];
398 step
[3] = input
[3] + input
[(32 - 4)];
399 step
[4] = input
[4] + input
[(32 - 5)];
400 step
[5] = input
[5] + input
[(32 - 6)];
401 step
[6] = input
[6] + input
[(32 - 7)];
402 step
[7] = input
[7] + input
[(32 - 8)];
403 step
[8] = input
[8] + input
[(32 - 9)];
404 step
[9] = input
[9] + input
[(32 - 10)];
405 step
[10] = input
[10] + input
[(32 - 11)];
406 step
[11] = input
[11] + input
[(32 - 12)];
407 step
[12] = input
[12] + input
[(32 - 13)];
408 step
[13] = input
[13] + input
[(32 - 14)];
409 step
[14] = input
[14] + input
[(32 - 15)];
410 step
[15] = input
[15] + input
[(32 - 16)];
411 step
[16] = -input
[16] + input
[(32 - 17)];
412 step
[17] = -input
[17] + input
[(32 - 18)];
413 step
[18] = -input
[18] + input
[(32 - 19)];
414 step
[19] = -input
[19] + input
[(32 - 20)];
415 step
[20] = -input
[20] + input
[(32 - 21)];
416 step
[21] = -input
[21] + input
[(32 - 22)];
417 step
[22] = -input
[22] + input
[(32 - 23)];
418 step
[23] = -input
[23] + input
[(32 - 24)];
419 step
[24] = -input
[24] + input
[(32 - 25)];
420 step
[25] = -input
[25] + input
[(32 - 26)];
421 step
[26] = -input
[26] + input
[(32 - 27)];
422 step
[27] = -input
[27] + input
[(32 - 28)];
423 step
[28] = -input
[28] + input
[(32 - 29)];
424 step
[29] = -input
[29] + input
[(32 - 30)];
425 step
[30] = -input
[30] + input
[(32 - 31)];
426 step
[31] = -input
[31] + input
[(32 - 32)];
// step -> output: butterfly on step[0..15]; [16..19] and [28..31] are
// copied; [20..27] are rotated by cospi_16_64.
429 output
[0] = step
[0] + step
[16 - 1];
430 output
[1] = step
[1] + step
[16 - 2];
431 output
[2] = step
[2] + step
[16 - 3];
432 output
[3] = step
[3] + step
[16 - 4];
433 output
[4] = step
[4] + step
[16 - 5];
434 output
[5] = step
[5] + step
[16 - 6];
435 output
[6] = step
[6] + step
[16 - 7];
436 output
[7] = step
[7] + step
[16 - 8];
437 output
[8] = -step
[8] + step
[16 - 9];
438 output
[9] = -step
[9] + step
[16 - 10];
439 output
[10] = -step
[10] + step
[16 - 11];
440 output
[11] = -step
[11] + step
[16 - 12];
441 output
[12] = -step
[12] + step
[16 - 13];
442 output
[13] = -step
[13] + step
[16 - 14];
443 output
[14] = -step
[14] + step
[16 - 15];
444 output
[15] = -step
[15] + step
[16 - 16];
446 output
[16] = step
[16];
447 output
[17] = step
[17];
448 output
[18] = step
[18];
449 output
[19] = step
[19];
451 output
[20] = dct_32_round((-step
[20] + step
[27]) * cospi_16_64
);
452 output
[21] = dct_32_round((-step
[21] + step
[26]) * cospi_16_64
);
453 output
[22] = dct_32_round((-step
[22] + step
[25]) * cospi_16_64
);
454 output
[23] = dct_32_round((-step
[23] + step
[24]) * cospi_16_64
);
456 output
[24] = dct_32_round((step
[24] + step
[23]) * cospi_16_64
);
457 output
[25] = dct_32_round((step
[25] + step
[22]) * cospi_16_64
);
458 output
[26] = dct_32_round((step
[26] + step
[21]) * cospi_16_64
);
459 output
[27] = dct_32_round((step
[27] + step
[20]) * cospi_16_64
);
461 output
[28] = step
[28];
462 output
[29] = step
[29];
463 output
[30] = step
[30];
464 output
[31] = step
[31];
466 // damp the magnitude by 4, hence the intermediate values are within
467 // the range of 16 bits.
// All 32 working values go through half_round_shift() in place.
469 output
[0] = half_round_shift(output
[0]);
470 output
[1] = half_round_shift(output
[1]);
471 output
[2] = half_round_shift(output
[2]);
472 output
[3] = half_round_shift(output
[3]);
473 output
[4] = half_round_shift(output
[4]);
474 output
[5] = half_round_shift(output
[5]);
475 output
[6] = half_round_shift(output
[6]);
476 output
[7] = half_round_shift(output
[7]);
477 output
[8] = half_round_shift(output
[8]);
478 output
[9] = half_round_shift(output
[9]);
479 output
[10] = half_round_shift(output
[10]);
480 output
[11] = half_round_shift(output
[11]);
481 output
[12] = half_round_shift(output
[12]);
482 output
[13] = half_round_shift(output
[13]);
483 output
[14] = half_round_shift(output
[14]);
484 output
[15] = half_round_shift(output
[15]);
486 output
[16] = half_round_shift(output
[16]);
487 output
[17] = half_round_shift(output
[17]);
488 output
[18] = half_round_shift(output
[18]);
489 output
[19] = half_round_shift(output
[19]);
490 output
[20] = half_round_shift(output
[20]);
491 output
[21] = half_round_shift(output
[21]);
492 output
[22] = half_round_shift(output
[22]);
493 output
[23] = half_round_shift(output
[23]);
494 output
[24] = half_round_shift(output
[24]);
495 output
[25] = half_round_shift(output
[25]);
496 output
[26] = half_round_shift(output
[26]);
497 output
[27] = half_round_shift(output
[27]);
498 output
[28] = half_round_shift(output
[28]);
499 output
[29] = half_round_shift(output
[29]);
500 output
[30] = half_round_shift(output
[30]);
501 output
[31] = half_round_shift(output
[31]);
// output -> step: butterflies on [0..7], [16..23] and [24..31]; [10..13] are
// rotated by cospi_16_64; [14] and [15] are copied.
// NOTE(review): no assignment to step[8] / step[9] is visible in this stage.
505 step
[0] = output
[0] + output
[(8 - 1)];
506 step
[1] = output
[1] + output
[(8 - 2)];
507 step
[2] = output
[2] + output
[(8 - 3)];
508 step
[3] = output
[3] + output
[(8 - 4)];
509 step
[4] = -output
[4] + output
[(8 - 5)];
510 step
[5] = -output
[5] + output
[(8 - 6)];
511 step
[6] = -output
[6] + output
[(8 - 7)];
512 step
[7] = -output
[7] + output
[(8 - 8)];
515 step
[10] = dct_32_round((-output
[10] + output
[13]) * cospi_16_64
);
516 step
[11] = dct_32_round((-output
[11] + output
[12]) * cospi_16_64
);
517 step
[12] = dct_32_round((output
[12] + output
[11]) * cospi_16_64
);
518 step
[13] = dct_32_round((output
[13] + output
[10]) * cospi_16_64
);
519 step
[14] = output
[14];
520 step
[15] = output
[15];
522 step
[16] = output
[16] + output
[23];
523 step
[17] = output
[17] + output
[22];
524 step
[18] = output
[18] + output
[21];
525 step
[19] = output
[19] + output
[20];
526 step
[20] = -output
[20] + output
[19];
527 step
[21] = -output
[21] + output
[18];
528 step
[22] = -output
[22] + output
[17];
529 step
[23] = -output
[23] + output
[16];
530 step
[24] = -output
[24] + output
[31];
531 step
[25] = -output
[25] + output
[30];
532 step
[26] = -output
[26] + output
[29];
533 step
[27] = -output
[27] + output
[28];
534 step
[28] = output
[28] + output
[27];
535 step
[29] = output
[29] + output
[26];
536 step
[30] = output
[30] + output
[25];
537 step
[31] = output
[31] + output
[24];
// step -> output: butterflies on [0..3] and [8..15]; [5] and [6] are rotated
// by cospi_16_64; [18..21] and [26..29] are rotated with the cospi_8_64 /
// cospi_24_64 pair; [16], [17], [22..25], [30], [31] are copied.
// NOTE(review): no assignment to output[4] / output[7] is visible here.
540 output
[0] = step
[0] + step
[3];
541 output
[1] = step
[1] + step
[2];
542 output
[2] = -step
[2] + step
[1];
543 output
[3] = -step
[3] + step
[0];
545 output
[5] = dct_32_round((-step
[5] + step
[6]) * cospi_16_64
);
546 output
[6] = dct_32_round((step
[6] + step
[5]) * cospi_16_64
);
548 output
[8] = step
[8] + step
[11];
549 output
[9] = step
[9] + step
[10];
550 output
[10] = -step
[10] + step
[9];
551 output
[11] = -step
[11] + step
[8];
552 output
[12] = -step
[12] + step
[15];
553 output
[13] = -step
[13] + step
[14];
554 output
[14] = step
[14] + step
[13];
555 output
[15] = step
[15] + step
[12];
557 output
[16] = step
[16];
558 output
[17] = step
[17];
559 output
[18] = dct_32_round(step
[18] * -cospi_8_64
+ step
[29] * cospi_24_64
);
560 output
[19] = dct_32_round(step
[19] * -cospi_8_64
+ step
[28] * cospi_24_64
);
561 output
[20] = dct_32_round(step
[20] * -cospi_24_64
+ step
[27] * -cospi_8_64
);
562 output
[21] = dct_32_round(step
[21] * -cospi_24_64
+ step
[26] * -cospi_8_64
);
563 output
[22] = step
[22];
564 output
[23] = step
[23];
565 output
[24] = step
[24];
566 output
[25] = step
[25];
567 output
[26] = dct_32_round(step
[26] * cospi_24_64
+ step
[21] * -cospi_8_64
);
568 output
[27] = dct_32_round(step
[27] * cospi_24_64
+ step
[20] * -cospi_8_64
);
569 output
[28] = dct_32_round(step
[28] * cospi_8_64
+ step
[19] * cospi_24_64
);
570 output
[29] = dct_32_round(step
[29] * cospi_8_64
+ step
[18] * cospi_24_64
);
571 output
[30] = step
[30];
572 output
[31] = step
[31];
// output -> step: [0..3] are rotated (cospi_16_64 and the cospi_8_64 /
// cospi_24_64 pair); butterflies on [4..7] and [16..31]; [9], [10], [13],
// [14] are rotated; [11], [12], [15] are copied.
// NOTE(review): no assignment to step[8] is visible in this stage.
575 step
[0] = dct_32_round((output
[0] + output
[1]) * cospi_16_64
);
576 step
[1] = dct_32_round((-output
[1] + output
[0]) * cospi_16_64
);
577 step
[2] = dct_32_round(output
[2] * cospi_24_64
+ output
[3] * cospi_8_64
);
578 step
[3] = dct_32_round(output
[3] * cospi_24_64
- output
[2] * cospi_8_64
);
579 step
[4] = output
[4] + output
[5];
580 step
[5] = -output
[5] + output
[4];
581 step
[6] = -output
[6] + output
[7];
582 step
[7] = output
[7] + output
[6];
584 step
[9] = dct_32_round(output
[9] * -cospi_8_64
+ output
[14] * cospi_24_64
);
585 step
[10] = dct_32_round(output
[10] * -cospi_24_64
+ output
[13] * -cospi_8_64
);
586 step
[11] = output
[11];
587 step
[12] = output
[12];
588 step
[13] = dct_32_round(output
[13] * cospi_24_64
+ output
[10] * -cospi_8_64
);
589 step
[14] = dct_32_round(output
[14] * cospi_8_64
+ output
[9] * cospi_24_64
);
590 step
[15] = output
[15];
592 step
[16] = output
[16] + output
[19];
593 step
[17] = output
[17] + output
[18];
594 step
[18] = -output
[18] + output
[17];
595 step
[19] = -output
[19] + output
[16];
596 step
[20] = -output
[20] + output
[23];
597 step
[21] = -output
[21] + output
[22];
598 step
[22] = output
[22] + output
[21];
599 step
[23] = output
[23] + output
[20];
600 step
[24] = output
[24] + output
[27];
601 step
[25] = output
[25] + output
[26];
602 step
[26] = -output
[26] + output
[25];
603 step
[27] = -output
[27] + output
[24];
604 step
[28] = -output
[28] + output
[31];
605 step
[29] = -output
[29] + output
[30];
606 step
[30] = output
[30] + output
[29];
607 step
[31] = output
[31] + output
[28];
// step -> output: [4..7] are rotated with the cospi_4/28 and cospi_12/20
// pairs; butterflies on [8..15]; within [16..31] indices 17, 18, 21, 22, 25,
// 26, 29, 30 are rotated and the others copied.
// NOTE(review): no assignment to output[0..3] is visible in this stage.
614 output
[4] = dct_32_round(step
[4] * cospi_28_64
+ step
[7] * cospi_4_64
);
615 output
[5] = dct_32_round(step
[5] * cospi_12_64
+ step
[6] * cospi_20_64
);
616 output
[6] = dct_32_round(step
[6] * cospi_12_64
+ step
[5] * -cospi_20_64
);
617 output
[7] = dct_32_round(step
[7] * cospi_28_64
+ step
[4] * -cospi_4_64
);
618 output
[8] = step
[8] + step
[9];
619 output
[9] = -step
[9] + step
[8];
620 output
[10] = -step
[10] + step
[11];
621 output
[11] = step
[11] + step
[10];
622 output
[12] = step
[12] + step
[13];
623 output
[13] = -step
[13] + step
[12];
624 output
[14] = -step
[14] + step
[15];
625 output
[15] = step
[15] + step
[14];
627 output
[16] = step
[16];
628 output
[17] = dct_32_round(step
[17] * -cospi_4_64
+ step
[30] * cospi_28_64
);
629 output
[18] = dct_32_round(step
[18] * -cospi_28_64
+ step
[29] * -cospi_4_64
);
630 output
[19] = step
[19];
631 output
[20] = step
[20];
632 output
[21] = dct_32_round(step
[21] * -cospi_20_64
+ step
[26] * cospi_12_64
);
633 output
[22] = dct_32_round(step
[22] * -cospi_12_64
+ step
[25] * -cospi_20_64
);
634 output
[23] = step
[23];
635 output
[24] = step
[24];
636 output
[25] = dct_32_round(step
[25] * cospi_12_64
+ step
[22] * -cospi_20_64
);
637 output
[26] = dct_32_round(step
[26] * cospi_20_64
+ step
[21] * cospi_12_64
);
638 output
[27] = step
[27];
639 output
[28] = step
[28];
640 output
[29] = dct_32_round(step
[29] * cospi_28_64
+ step
[18] * -cospi_4_64
);
641 output
[30] = dct_32_round(step
[30] * cospi_4_64
+ step
[17] * cospi_28_64
);
642 output
[31] = step
[31];
// output -> step: [8..15] are rotated with the cospi_2/30, 14/18, 10/22 and
// 6/26 pairs; butterflies on adjacent pairs of [16..31].
// NOTE(review): no assignment to step[0..7] is visible in this stage.
653 step
[8] = dct_32_round(output
[8] * cospi_30_64
+ output
[15] * cospi_2_64
);
654 step
[9] = dct_32_round(output
[9] * cospi_14_64
+ output
[14] * cospi_18_64
);
655 step
[10] = dct_32_round(output
[10] * cospi_22_64
+ output
[13] * cospi_10_64
);
656 step
[11] = dct_32_round(output
[11] * cospi_6_64
+ output
[12] * cospi_26_64
);
657 step
[12] = dct_32_round(output
[12] * cospi_6_64
+ output
[11] * -cospi_26_64
);
658 step
[13] = dct_32_round(output
[13] * cospi_22_64
+ output
[10] * -cospi_10_64
);
659 step
[14] = dct_32_round(output
[14] * cospi_14_64
+ output
[9] * -cospi_18_64
);
660 step
[15] = dct_32_round(output
[15] * cospi_30_64
+ output
[8] * -cospi_2_64
);
662 step
[16] = output
[16] + output
[17];
663 step
[17] = -output
[17] + output
[16];
664 step
[18] = -output
[18] + output
[19];
665 step
[19] = output
[19] + output
[18];
666 step
[20] = output
[20] + output
[21];
667 step
[21] = -output
[21] + output
[20];
668 step
[22] = -output
[22] + output
[23];
669 step
[23] = output
[23] + output
[22];
670 step
[24] = output
[24] + output
[25];
671 step
[25] = -output
[25] + output
[24];
672 step
[26] = -output
[26] + output
[27];
673 step
[27] = output
[27] + output
[26];
674 step
[28] = output
[28] + output
[29];
675 step
[29] = -output
[29] + output
[28];
676 step
[30] = -output
[30] + output
[31];
677 step
[31] = output
[31] + output
[30];
679 // Final stage --- outputs indices are bit-reversed.
// step[1..15] are stored to their permuted positions.
// NOTE(review): the stores for step[0], step[2], step[4] and step[8] are
// not visible in this excerpt.
681 output
[16] = step
[1];
683 output
[24] = step
[3];
685 output
[20] = step
[5];
686 output
[12] = step
[6];
687 output
[28] = step
[7];
689 output
[18] = step
[9];
690 output
[10] = step
[10];
691 output
[26] = step
[11];
692 output
[6] = step
[12];
693 output
[22] = step
[13];
694 output
[14] = step
[14];
695 output
[30] = step
[15];
// Odd-indexed outputs: each pair (step[k], step[47 - k]) for k in 16..31 is
// rotated by a cospi pair and rounded with dct_32_round().
697 output
[1] = dct_32_round(step
[16] * cospi_31_64
+ step
[31] * cospi_1_64
);
698 output
[17] = dct_32_round(step
[17] * cospi_15_64
+ step
[30] * cospi_17_64
);
699 output
[9] = dct_32_round(step
[18] * cospi_23_64
+ step
[29] * cospi_9_64
);
700 output
[25] = dct_32_round(step
[19] * cospi_7_64
+ step
[28] * cospi_25_64
);
701 output
[5] = dct_32_round(step
[20] * cospi_27_64
+ step
[27] * cospi_5_64
);
702 output
[21] = dct_32_round(step
[21] * cospi_11_64
+ step
[26] * cospi_21_64
);
703 output
[13] = dct_32_round(step
[22] * cospi_19_64
+ step
[25] * cospi_13_64
);
704 output
[29] = dct_32_round(step
[23] * cospi_3_64
+ step
[24] * cospi_29_64
);
705 output
[3] = dct_32_round(step
[24] * cospi_3_64
+ step
[23] * -cospi_29_64
);
706 output
[19] = dct_32_round(step
[25] * cospi_19_64
+ step
[22] * -cospi_13_64
);
707 output
[11] = dct_32_round(step
[26] * cospi_11_64
+ step
[21] * -cospi_21_64
);
708 output
[27] = dct_32_round(step
[27] * cospi_27_64
+ step
[20] * -cospi_5_64
);
709 output
[7] = dct_32_round(step
[28] * cospi_7_64
+ step
[19] * -cospi_25_64
);
710 output
[23] = dct_32_round(step
[29] * cospi_23_64
+ step
[18] * -cospi_9_64
);
711 output
[15] = dct_32_round(step
[30] * cospi_15_64
+ step
[17] * -cospi_17_64
);
712 output
[31] = dct_32_round(step
[31] * cospi_31_64
+ step
[16] * -cospi_1_64
);
// 32x32 forward transform built from two sweeps of vpx_fdct32().
//   input  - 16-bit source samples; rows are `stride` elements apart.
//   out    - destination for the coefficients.
//   stride - element pitch between consecutive rows of `input`.
// First loop: column i of `input` is scaled by 4, transformed with
// vpx_fdct32(..., 0) and stored into column i of the local `output` buffer
// after the rounding (x + 1 + (x > 0)) >> 2.
// Second loop: row i of `output` is transformed with vpx_fdct32(..., 0) and
// rounded with (x + 1 + (x < 0)) >> 2, cast to tran_low_t.
// NOTE(review): the declarations of i/j, the left-hand side of the final
// assignment and the closing braces are not visible in this excerpt.
715 void vpx_fdct32x32_c(const int16_t *input
, tran_low_t
*out
, int stride
) {
717 tran_high_t output
[32 * 32];
720 for (i
= 0; i
< 32; ++i
) {
721 tran_high_t temp_in
[32], temp_out
[32];
722 for (j
= 0; j
< 32; ++j
)
723 temp_in
[j
] = input
[j
* stride
+ i
] * 4;
724 vpx_fdct32(temp_in
, temp_out
, 0);
725 for (j
= 0; j
< 32; ++j
)
726 output
[j
* 32 + i
] = (temp_out
[j
] + 1 + (temp_out
[j
] > 0)) >> 2;
730 for (i
= 0; i
< 32; ++i
) {
731 tran_high_t temp_in
[32], temp_out
[32];
732 for (j
= 0; j
< 32; ++j
)
733 temp_in
[j
] = output
[j
+ i
* 32];
734 vpx_fdct32(temp_in
, temp_out
, 0);
735 for (j
= 0; j
< 32; ++j
)
737 (tran_low_t
)((temp_out
[j
] + 1 + (temp_out
[j
] < 0)) >> 2);
// 32x32 forward transform variant (the `rd` suffix refers to the
// rate-distortion loop mentioned below).
//   input  - 16-bit source samples; rows are `stride` elements apart.
//   out    - destination for the coefficients.
//   stride - element pitch between consecutive rows of `input`.
// First loop: column i of `input` is scaled by 4, transformed with
// vpx_fdct32(..., 0) and stored into column i of the local `output` buffer
// after the rounding (x + 1 + (x > 0)) >> 2.
// Second loop: row i of `output` is transformed with vpx_fdct32(..., 1) and
// the result is stored to `out` with only a cast to tran_low_t.
// NOTE(review): the declarations of i/j and the closing braces are not
// visible in this excerpt.
741 // Note that although we use dct_32_round in dct32 computation flow,
742 // this 2d fdct32x32 for rate-distortion optimization loop is operating
743 // within 16 bits precision.
744 void vpx_fdct32x32_rd_c(const int16_t *input
, tran_low_t
*out
, int stride
) {
746 tran_high_t output
[32 * 32];
749 for (i
= 0; i
< 32; ++i
) {
750 tran_high_t temp_in
[32], temp_out
[32];
751 for (j
= 0; j
< 32; ++j
)
752 temp_in
[j
] = input
[j
* stride
+ i
] * 4;
753 vpx_fdct32(temp_in
, temp_out
, 0);
754 for (j
= 0; j
< 32; ++j
)
755 // TODO(cd): see quality impact of only doing
756 // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
757 // PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c
758 output
[j
* 32 + i
] = (temp_out
[j
] + 1 + (temp_out
[j
] > 0)) >> 2;
762 for (i
= 0; i
< 32; ++i
) {
763 tran_high_t temp_in
[32], temp_out
[32];
764 for (j
= 0; j
< 32; ++j
)
765 temp_in
[j
] = output
[j
+ i
* 32];
766 vpx_fdct32(temp_in
, temp_out
, 1);
767 for (j
= 0; j
< 32; ++j
)
768 out
[j
+ i
* 32] = (tran_low_t
)temp_out
[j
];
// DC-only variant for a 32x32 block: accumulates all 1024 samples of `input`
// (row pitch `stride`) into `sum` and stores sum >> 3 in output[0].
// NOTE(review): the declarations of r, c and sum, and any statements after
// the output[0] store, are not visible in this excerpt.
772 void vpx_fdct32x32_1_c(const int16_t *input
, tran_low_t
*output
, int stride
) {
775 for (r
= 0; r
< 32; ++r
)
776 for (c
= 0; c
< 32; ++c
)
777 sum
+= input
[r
* stride
+ c
];
779 output
[0] = sum
>> 3;
783 #if CONFIG_VP9_HIGHBITDEPTH
// High-bitdepth entry point: forwards its arguments unchanged to
// vpx_fdct4x4_c().
// NOTE(review): the rest of the parameter list (where `stride` is declared)
// and the closing brace are not visible in this excerpt.
784 void vpx_highbd_fdct4x4_c(const int16_t *input
, tran_low_t
*output
,
786 vpx_fdct4x4_c(input
, output
, stride
);
// High-bitdepth entry point: forwards its arguments unchanged to
// vpx_fdct8x8_c().
// NOTE(review): the rest of the parameter list (where `stride` is declared)
// and the closing brace are not visible in this excerpt.
789 void vpx_highbd_fdct8x8_c(const int16_t *input
, tran_low_t
*final_output
,
791 vpx_fdct8x8_c(input
, final_output
, stride
);
// High-bitdepth entry point: forwards its arguments unchanged to
// vpx_fdct8x8_1_c().
// NOTE(review): the rest of the parameter list (where `stride` is declared)
// and the closing brace are not visible in this excerpt.
794 void vpx_highbd_fdct8x8_1_c(const int16_t *input
, tran_low_t
*final_output
,
796 vpx_fdct8x8_1_c(input
, final_output
, stride
);
// High-bitdepth entry point: forwards its arguments unchanged to
// vpx_fdct16x16_c().
// NOTE(review): the rest of the parameter list (where `stride` is declared)
// and the closing brace are not visible in this excerpt.
799 void vpx_highbd_fdct16x16_c(const int16_t *input
, tran_low_t
*output
,
801 vpx_fdct16x16_c(input
, output
, stride
);
// High-bitdepth entry point: forwards its arguments unchanged to
// vpx_fdct16x16_1_c().
// NOTE(review): the rest of the parameter list (where `stride` is declared)
// and the closing brace are not visible in this excerpt.
804 void vpx_highbd_fdct16x16_1_c(const int16_t *input
, tran_low_t
*output
,
806 vpx_fdct16x16_1_c(input
, output
, stride
);
// High-bitdepth entry point: forwards its arguments unchanged to
// vpx_fdct32x32_c().
// NOTE(review): the closing brace is not visible in this excerpt.
809 void vpx_highbd_fdct32x32_c(const int16_t *input
, tran_low_t
*out
, int stride
) {
810 vpx_fdct32x32_c(input
, out
, stride
);
// High-bitdepth entry point: forwards its arguments unchanged to
// vpx_fdct32x32_rd_c().
// NOTE(review): the rest of the parameter list (where `stride` is declared)
// and the closing brace are not visible in this excerpt.
813 void vpx_highbd_fdct32x32_rd_c(const int16_t *input
, tran_low_t
*out
,
815 vpx_fdct32x32_rd_c(input
, out
, stride
);
// High-bitdepth entry point: forwards its arguments unchanged to
// vpx_fdct32x32_1_c().
// NOTE(review): the rest of the parameter list (where `stride` is declared)
// and the closing brace are not visible in this excerpt.
818 void vpx_highbd_fdct32x32_1_c(const int16_t *input
, tran_low_t
*out
,
820 vpx_fdct32x32_1_c(input
, out
, stride
);
822 #endif // CONFIG_VP9_HIGHBITDEPTH