/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
14 #include "vpx_dsp/inv_txfm.h"
16 void vpx_iwht4x4_16_add_c(const tran_low_t
*input
, uint8_t *dest
, int stride
) {
17 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
18 0.5 shifts per pixel. */
20 tran_low_t output
[16];
21 tran_high_t a1
, b1
, c1
, d1
, e1
;
22 const tran_low_t
*ip
= input
;
23 tran_low_t
*op
= output
;
25 for (i
= 0; i
< 4; i
++) {
26 a1
= ip
[0] >> UNIT_QUANT_SHIFT
;
27 c1
= ip
[1] >> UNIT_QUANT_SHIFT
;
28 d1
= ip
[2] >> UNIT_QUANT_SHIFT
;
29 b1
= ip
[3] >> UNIT_QUANT_SHIFT
;
37 op
[0] = WRAPLOW(a1
, 8);
38 op
[1] = WRAPLOW(b1
, 8);
39 op
[2] = WRAPLOW(c1
, 8);
40 op
[3] = WRAPLOW(d1
, 8);
46 for (i
= 0; i
< 4; i
++) {
58 dest
[stride
* 0] = clip_pixel_add(dest
[stride
* 0], a1
);
59 dest
[stride
* 1] = clip_pixel_add(dest
[stride
* 1], b1
);
60 dest
[stride
* 2] = clip_pixel_add(dest
[stride
* 2], c1
);
61 dest
[stride
* 3] = clip_pixel_add(dest
[stride
* 3], d1
);
68 void vpx_iwht4x4_1_add_c(const tran_low_t
*in
, uint8_t *dest
, int dest_stride
) {
72 const tran_low_t
*ip
= in
;
75 a1
= ip
[0] >> UNIT_QUANT_SHIFT
;
78 op
[0] = WRAPLOW(a1
, 8);
79 op
[1] = op
[2] = op
[3] = WRAPLOW(e1
, 8);
82 for (i
= 0; i
< 4; i
++) {
85 dest
[dest_stride
* 0] = clip_pixel_add(dest
[dest_stride
* 0], a1
);
86 dest
[dest_stride
* 1] = clip_pixel_add(dest
[dest_stride
* 1], e1
);
87 dest
[dest_stride
* 2] = clip_pixel_add(dest
[dest_stride
* 2], e1
);
88 dest
[dest_stride
* 3] = clip_pixel_add(dest
[dest_stride
* 3], e1
);
94 void idct4_c(const tran_low_t
*input
, tran_low_t
*output
) {
96 tran_high_t temp1
, temp2
;
98 temp1
= (input
[0] + input
[2]) * cospi_16_64
;
99 temp2
= (input
[0] - input
[2]) * cospi_16_64
;
100 step
[0] = WRAPLOW(dct_const_round_shift(temp1
), 8);
101 step
[1] = WRAPLOW(dct_const_round_shift(temp2
), 8);
102 temp1
= input
[1] * cospi_24_64
- input
[3] * cospi_8_64
;
103 temp2
= input
[1] * cospi_8_64
+ input
[3] * cospi_24_64
;
104 step
[2] = WRAPLOW(dct_const_round_shift(temp1
), 8);
105 step
[3] = WRAPLOW(dct_const_round_shift(temp2
), 8);
108 output
[0] = WRAPLOW(step
[0] + step
[3], 8);
109 output
[1] = WRAPLOW(step
[1] + step
[2], 8);
110 output
[2] = WRAPLOW(step
[1] - step
[2], 8);
111 output
[3] = WRAPLOW(step
[0] - step
[3], 8);
114 void vpx_idct4x4_16_add_c(const tran_low_t
*input
, uint8_t *dest
, int stride
) {
115 tran_low_t out
[4 * 4];
116 tran_low_t
*outptr
= out
;
118 tran_low_t temp_in
[4], temp_out
[4];
121 for (i
= 0; i
< 4; ++i
) {
122 idct4_c(input
, outptr
);
128 for (i
= 0; i
< 4; ++i
) {
129 for (j
= 0; j
< 4; ++j
)
130 temp_in
[j
] = out
[j
* 4 + i
];
131 idct4_c(temp_in
, temp_out
);
132 for (j
= 0; j
< 4; ++j
) {
133 dest
[j
* stride
+ i
] = clip_pixel_add(dest
[j
* stride
+ i
],
134 ROUND_POWER_OF_TWO(temp_out
[j
], 4));
139 void vpx_idct4x4_1_add_c(const tran_low_t
*input
, uint8_t *dest
,
143 tran_low_t out
= WRAPLOW(dct_const_round_shift(input
[0] * cospi_16_64
), 8);
144 out
= WRAPLOW(dct_const_round_shift(out
* cospi_16_64
), 8);
145 a1
= ROUND_POWER_OF_TWO(out
, 4);
147 for (i
= 0; i
< 4; i
++) {
148 dest
[0] = clip_pixel_add(dest
[0], a1
);
149 dest
[1] = clip_pixel_add(dest
[1], a1
);
150 dest
[2] = clip_pixel_add(dest
[2], a1
);
151 dest
[3] = clip_pixel_add(dest
[3], a1
);
156 void idct8_c(const tran_low_t
*input
, tran_low_t
*output
) {
157 tran_low_t step1
[8], step2
[8];
158 tran_high_t temp1
, temp2
;
164 temp1
= input
[1] * cospi_28_64
- input
[7] * cospi_4_64
;
165 temp2
= input
[1] * cospi_4_64
+ input
[7] * cospi_28_64
;
166 step1
[4] = WRAPLOW(dct_const_round_shift(temp1
), 8);
167 step1
[7] = WRAPLOW(dct_const_round_shift(temp2
), 8);
168 temp1
= input
[5] * cospi_12_64
- input
[3] * cospi_20_64
;
169 temp2
= input
[5] * cospi_20_64
+ input
[3] * cospi_12_64
;
170 step1
[5] = WRAPLOW(dct_const_round_shift(temp1
), 8);
171 step1
[6] = WRAPLOW(dct_const_round_shift(temp2
), 8);
174 temp1
= (step1
[0] + step1
[2]) * cospi_16_64
;
175 temp2
= (step1
[0] - step1
[2]) * cospi_16_64
;
176 step2
[0] = WRAPLOW(dct_const_round_shift(temp1
), 8);
177 step2
[1] = WRAPLOW(dct_const_round_shift(temp2
), 8);
178 temp1
= step1
[1] * cospi_24_64
- step1
[3] * cospi_8_64
;
179 temp2
= step1
[1] * cospi_8_64
+ step1
[3] * cospi_24_64
;
180 step2
[2] = WRAPLOW(dct_const_round_shift(temp1
), 8);
181 step2
[3] = WRAPLOW(dct_const_round_shift(temp2
), 8);
182 step2
[4] = WRAPLOW(step1
[4] + step1
[5], 8);
183 step2
[5] = WRAPLOW(step1
[4] - step1
[5], 8);
184 step2
[6] = WRAPLOW(-step1
[6] + step1
[7], 8);
185 step2
[7] = WRAPLOW(step1
[6] + step1
[7], 8);
188 step1
[0] = WRAPLOW(step2
[0] + step2
[3], 8);
189 step1
[1] = WRAPLOW(step2
[1] + step2
[2], 8);
190 step1
[2] = WRAPLOW(step2
[1] - step2
[2], 8);
191 step1
[3] = WRAPLOW(step2
[0] - step2
[3], 8);
193 temp1
= (step2
[6] - step2
[5]) * cospi_16_64
;
194 temp2
= (step2
[5] + step2
[6]) * cospi_16_64
;
195 step1
[5] = WRAPLOW(dct_const_round_shift(temp1
), 8);
196 step1
[6] = WRAPLOW(dct_const_round_shift(temp2
), 8);
200 output
[0] = WRAPLOW(step1
[0] + step1
[7], 8);
201 output
[1] = WRAPLOW(step1
[1] + step1
[6], 8);
202 output
[2] = WRAPLOW(step1
[2] + step1
[5], 8);
203 output
[3] = WRAPLOW(step1
[3] + step1
[4], 8);
204 output
[4] = WRAPLOW(step1
[3] - step1
[4], 8);
205 output
[5] = WRAPLOW(step1
[2] - step1
[5], 8);
206 output
[6] = WRAPLOW(step1
[1] - step1
[6], 8);
207 output
[7] = WRAPLOW(step1
[0] - step1
[7], 8);
210 void vpx_idct8x8_64_add_c(const tran_low_t
*input
, uint8_t *dest
, int stride
) {
211 tran_low_t out
[8 * 8];
212 tran_low_t
*outptr
= out
;
214 tran_low_t temp_in
[8], temp_out
[8];
216 // First transform rows
217 for (i
= 0; i
< 8; ++i
) {
218 idct8_c(input
, outptr
);
223 // Then transform columns
224 for (i
= 0; i
< 8; ++i
) {
225 for (j
= 0; j
< 8; ++j
)
226 temp_in
[j
] = out
[j
* 8 + i
];
227 idct8_c(temp_in
, temp_out
);
228 for (j
= 0; j
< 8; ++j
) {
229 dest
[j
* stride
+ i
] = clip_pixel_add(dest
[j
* stride
+ i
],
230 ROUND_POWER_OF_TWO(temp_out
[j
], 5));
235 void vpx_idct8x8_1_add_c(const tran_low_t
*input
, uint8_t *dest
, int stride
) {
238 tran_low_t out
= WRAPLOW(dct_const_round_shift(input
[0] * cospi_16_64
), 8);
239 out
= WRAPLOW(dct_const_round_shift(out
* cospi_16_64
), 8);
240 a1
= ROUND_POWER_OF_TWO(out
, 5);
241 for (j
= 0; j
< 8; ++j
) {
242 for (i
= 0; i
< 8; ++i
)
243 dest
[i
] = clip_pixel_add(dest
[i
], a1
);
248 void iadst4_c(const tran_low_t
*input
, tran_low_t
*output
) {
249 tran_high_t s0
, s1
, s2
, s3
, s4
, s5
, s6
, s7
;
251 tran_low_t x0
= input
[0];
252 tran_low_t x1
= input
[1];
253 tran_low_t x2
= input
[2];
254 tran_low_t x3
= input
[3];
256 if (!(x0
| x1
| x2
| x3
)) {
257 output
[0] = output
[1] = output
[2] = output
[3] = 0;
275 // 1-D transform scaling factor is sqrt(2).
276 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
277 // + 1b (addition) = 29b.
278 // Hence the output bit depth is 15b.
279 output
[0] = WRAPLOW(dct_const_round_shift(s0
+ s3
), 8);
280 output
[1] = WRAPLOW(dct_const_round_shift(s1
+ s3
), 8);
281 output
[2] = WRAPLOW(dct_const_round_shift(s2
), 8);
282 output
[3] = WRAPLOW(dct_const_round_shift(s0
+ s1
- s3
), 8);
285 void iadst8_c(const tran_low_t
*input
, tran_low_t
*output
) {
286 int s0
, s1
, s2
, s3
, s4
, s5
, s6
, s7
;
288 tran_high_t x0
= input
[7];
289 tran_high_t x1
= input
[0];
290 tran_high_t x2
= input
[5];
291 tran_high_t x3
= input
[2];
292 tran_high_t x4
= input
[3];
293 tran_high_t x5
= input
[4];
294 tran_high_t x6
= input
[1];
295 tran_high_t x7
= input
[6];
297 if (!(x0
| x1
| x2
| x3
| x4
| x5
| x6
| x7
)) {
298 output
[0] = output
[1] = output
[2] = output
[3] = output
[4]
299 = output
[5] = output
[6] = output
[7] = 0;
304 s0
= (int)(cospi_2_64
* x0
+ cospi_30_64
* x1
);
305 s1
= (int)(cospi_30_64
* x0
- cospi_2_64
* x1
);
306 s2
= (int)(cospi_10_64
* x2
+ cospi_22_64
* x3
);
307 s3
= (int)(cospi_22_64
* x2
- cospi_10_64
* x3
);
308 s4
= (int)(cospi_18_64
* x4
+ cospi_14_64
* x5
);
309 s5
= (int)(cospi_14_64
* x4
- cospi_18_64
* x5
);
310 s6
= (int)(cospi_26_64
* x6
+ cospi_6_64
* x7
);
311 s7
= (int)(cospi_6_64
* x6
- cospi_26_64
* x7
);
313 x0
= WRAPLOW(dct_const_round_shift(s0
+ s4
), 8);
314 x1
= WRAPLOW(dct_const_round_shift(s1
+ s5
), 8);
315 x2
= WRAPLOW(dct_const_round_shift(s2
+ s6
), 8);
316 x3
= WRAPLOW(dct_const_round_shift(s3
+ s7
), 8);
317 x4
= WRAPLOW(dct_const_round_shift(s0
- s4
), 8);
318 x5
= WRAPLOW(dct_const_round_shift(s1
- s5
), 8);
319 x6
= WRAPLOW(dct_const_round_shift(s2
- s6
), 8);
320 x7
= WRAPLOW(dct_const_round_shift(s3
- s7
), 8);
327 s4
= (int)(cospi_8_64
* x4
+ cospi_24_64
* x5
);
328 s5
= (int)(cospi_24_64
* x4
- cospi_8_64
* x5
);
329 s6
= (int)(-cospi_24_64
* x6
+ cospi_8_64
* x7
);
330 s7
= (int)(cospi_8_64
* x6
+ cospi_24_64
* x7
);
332 x0
= WRAPLOW(s0
+ s2
, 8);
333 x1
= WRAPLOW(s1
+ s3
, 8);
334 x2
= WRAPLOW(s0
- s2
, 8);
335 x3
= WRAPLOW(s1
- s3
, 8);
336 x4
= WRAPLOW(dct_const_round_shift(s4
+ s6
), 8);
337 x5
= WRAPLOW(dct_const_round_shift(s5
+ s7
), 8);
338 x6
= WRAPLOW(dct_const_round_shift(s4
- s6
), 8);
339 x7
= WRAPLOW(dct_const_round_shift(s5
- s7
), 8);
342 s2
= (int)(cospi_16_64
* (x2
+ x3
));
343 s3
= (int)(cospi_16_64
* (x2
- x3
));
344 s6
= (int)(cospi_16_64
* (x6
+ x7
));
345 s7
= (int)(cospi_16_64
* (x6
- x7
));
347 x2
= WRAPLOW(dct_const_round_shift(s2
), 8);
348 x3
= WRAPLOW(dct_const_round_shift(s3
), 8);
349 x6
= WRAPLOW(dct_const_round_shift(s6
), 8);
350 x7
= WRAPLOW(dct_const_round_shift(s7
), 8);
352 output
[0] = WRAPLOW(x0
, 8);
353 output
[1] = WRAPLOW(-x4
, 8);
354 output
[2] = WRAPLOW(x6
, 8);
355 output
[3] = WRAPLOW(-x2
, 8);
356 output
[4] = WRAPLOW(x3
, 8);
357 output
[5] = WRAPLOW(-x7
, 8);
358 output
[6] = WRAPLOW(x5
, 8);
359 output
[7] = WRAPLOW(-x1
, 8);
362 void vpx_idct8x8_12_add_c(const tran_low_t
*input
, uint8_t *dest
, int stride
) {
363 tran_low_t out
[8 * 8] = { 0 };
364 tran_low_t
*outptr
= out
;
366 tran_low_t temp_in
[8], temp_out
[8];
368 // First transform rows
369 // only first 4 row has non-zero coefs
370 for (i
= 0; i
< 4; ++i
) {
371 idct8_c(input
, outptr
);
376 // Then transform columns
377 for (i
= 0; i
< 8; ++i
) {
378 for (j
= 0; j
< 8; ++j
)
379 temp_in
[j
] = out
[j
* 8 + i
];
380 idct8_c(temp_in
, temp_out
);
381 for (j
= 0; j
< 8; ++j
) {
382 dest
[j
* stride
+ i
] = clip_pixel_add(dest
[j
* stride
+ i
],
383 ROUND_POWER_OF_TWO(temp_out
[j
], 5));
388 void idct16_c(const tran_low_t
*input
, tran_low_t
*output
) {
389 tran_low_t step1
[16], step2
[16];
390 tran_high_t temp1
, temp2
;
393 step1
[0] = input
[0/2];
394 step1
[1] = input
[16/2];
395 step1
[2] = input
[8/2];
396 step1
[3] = input
[24/2];
397 step1
[4] = input
[4/2];
398 step1
[5] = input
[20/2];
399 step1
[6] = input
[12/2];
400 step1
[7] = input
[28/2];
401 step1
[8] = input
[2/2];
402 step1
[9] = input
[18/2];
403 step1
[10] = input
[10/2];
404 step1
[11] = input
[26/2];
405 step1
[12] = input
[6/2];
406 step1
[13] = input
[22/2];
407 step1
[14] = input
[14/2];
408 step1
[15] = input
[30/2];
420 temp1
= step1
[8] * cospi_30_64
- step1
[15] * cospi_2_64
;
421 temp2
= step1
[8] * cospi_2_64
+ step1
[15] * cospi_30_64
;
422 step2
[8] = WRAPLOW(dct_const_round_shift(temp1
), 8);
423 step2
[15] = WRAPLOW(dct_const_round_shift(temp2
), 8);
425 temp1
= step1
[9] * cospi_14_64
- step1
[14] * cospi_18_64
;
426 temp2
= step1
[9] * cospi_18_64
+ step1
[14] * cospi_14_64
;
427 step2
[9] = WRAPLOW(dct_const_round_shift(temp1
), 8);
428 step2
[14] = WRAPLOW(dct_const_round_shift(temp2
), 8);
430 temp1
= step1
[10] * cospi_22_64
- step1
[13] * cospi_10_64
;
431 temp2
= step1
[10] * cospi_10_64
+ step1
[13] * cospi_22_64
;
432 step2
[10] = WRAPLOW(dct_const_round_shift(temp1
), 8);
433 step2
[13] = WRAPLOW(dct_const_round_shift(temp2
), 8);
435 temp1
= step1
[11] * cospi_6_64
- step1
[12] * cospi_26_64
;
436 temp2
= step1
[11] * cospi_26_64
+ step1
[12] * cospi_6_64
;
437 step2
[11] = WRAPLOW(dct_const_round_shift(temp1
), 8);
438 step2
[12] = WRAPLOW(dct_const_round_shift(temp2
), 8);
446 temp1
= step2
[4] * cospi_28_64
- step2
[7] * cospi_4_64
;
447 temp2
= step2
[4] * cospi_4_64
+ step2
[7] * cospi_28_64
;
448 step1
[4] = WRAPLOW(dct_const_round_shift(temp1
), 8);
449 step1
[7] = WRAPLOW(dct_const_round_shift(temp2
), 8);
450 temp1
= step2
[5] * cospi_12_64
- step2
[6] * cospi_20_64
;
451 temp2
= step2
[5] * cospi_20_64
+ step2
[6] * cospi_12_64
;
452 step1
[5] = WRAPLOW(dct_const_round_shift(temp1
), 8);
453 step1
[6] = WRAPLOW(dct_const_round_shift(temp2
), 8);
455 step1
[8] = WRAPLOW(step2
[8] + step2
[9], 8);
456 step1
[9] = WRAPLOW(step2
[8] - step2
[9], 8);
457 step1
[10] = WRAPLOW(-step2
[10] + step2
[11], 8);
458 step1
[11] = WRAPLOW(step2
[10] + step2
[11], 8);
459 step1
[12] = WRAPLOW(step2
[12] + step2
[13], 8);
460 step1
[13] = WRAPLOW(step2
[12] - step2
[13], 8);
461 step1
[14] = WRAPLOW(-step2
[14] + step2
[15], 8);
462 step1
[15] = WRAPLOW(step2
[14] + step2
[15], 8);
465 temp1
= (step1
[0] + step1
[1]) * cospi_16_64
;
466 temp2
= (step1
[0] - step1
[1]) * cospi_16_64
;
467 step2
[0] = WRAPLOW(dct_const_round_shift(temp1
), 8);
468 step2
[1] = WRAPLOW(dct_const_round_shift(temp2
), 8);
469 temp1
= step1
[2] * cospi_24_64
- step1
[3] * cospi_8_64
;
470 temp2
= step1
[2] * cospi_8_64
+ step1
[3] * cospi_24_64
;
471 step2
[2] = WRAPLOW(dct_const_round_shift(temp1
), 8);
472 step2
[3] = WRAPLOW(dct_const_round_shift(temp2
), 8);
473 step2
[4] = WRAPLOW(step1
[4] + step1
[5], 8);
474 step2
[5] = WRAPLOW(step1
[4] - step1
[5], 8);
475 step2
[6] = WRAPLOW(-step1
[6] + step1
[7], 8);
476 step2
[7] = WRAPLOW(step1
[6] + step1
[7], 8);
479 step2
[15] = step1
[15];
480 temp1
= -step1
[9] * cospi_8_64
+ step1
[14] * cospi_24_64
;
481 temp2
= step1
[9] * cospi_24_64
+ step1
[14] * cospi_8_64
;
482 step2
[9] = WRAPLOW(dct_const_round_shift(temp1
), 8);
483 step2
[14] = WRAPLOW(dct_const_round_shift(temp2
), 8);
484 temp1
= -step1
[10] * cospi_24_64
- step1
[13] * cospi_8_64
;
485 temp2
= -step1
[10] * cospi_8_64
+ step1
[13] * cospi_24_64
;
486 step2
[10] = WRAPLOW(dct_const_round_shift(temp1
), 8);
487 step2
[13] = WRAPLOW(dct_const_round_shift(temp2
), 8);
488 step2
[11] = step1
[11];
489 step2
[12] = step1
[12];
492 step1
[0] = WRAPLOW(step2
[0] + step2
[3], 8);
493 step1
[1] = WRAPLOW(step2
[1] + step2
[2], 8);
494 step1
[2] = WRAPLOW(step2
[1] - step2
[2], 8);
495 step1
[3] = WRAPLOW(step2
[0] - step2
[3], 8);
497 temp1
= (step2
[6] - step2
[5]) * cospi_16_64
;
498 temp2
= (step2
[5] + step2
[6]) * cospi_16_64
;
499 step1
[5] = WRAPLOW(dct_const_round_shift(temp1
), 8);
500 step1
[6] = WRAPLOW(dct_const_round_shift(temp2
), 8);
503 step1
[8] = WRAPLOW(step2
[8] + step2
[11], 8);
504 step1
[9] = WRAPLOW(step2
[9] + step2
[10], 8);
505 step1
[10] = WRAPLOW(step2
[9] - step2
[10], 8);
506 step1
[11] = WRAPLOW(step2
[8] - step2
[11], 8);
507 step1
[12] = WRAPLOW(-step2
[12] + step2
[15], 8);
508 step1
[13] = WRAPLOW(-step2
[13] + step2
[14], 8);
509 step1
[14] = WRAPLOW(step2
[13] + step2
[14], 8);
510 step1
[15] = WRAPLOW(step2
[12] + step2
[15], 8);
513 step2
[0] = WRAPLOW(step1
[0] + step1
[7], 8);
514 step2
[1] = WRAPLOW(step1
[1] + step1
[6], 8);
515 step2
[2] = WRAPLOW(step1
[2] + step1
[5], 8);
516 step2
[3] = WRAPLOW(step1
[3] + step1
[4], 8);
517 step2
[4] = WRAPLOW(step1
[3] - step1
[4], 8);
518 step2
[5] = WRAPLOW(step1
[2] - step1
[5], 8);
519 step2
[6] = WRAPLOW(step1
[1] - step1
[6], 8);
520 step2
[7] = WRAPLOW(step1
[0] - step1
[7], 8);
523 temp1
= (-step1
[10] + step1
[13]) * cospi_16_64
;
524 temp2
= (step1
[10] + step1
[13]) * cospi_16_64
;
525 step2
[10] = WRAPLOW(dct_const_round_shift(temp1
), 8);
526 step2
[13] = WRAPLOW(dct_const_round_shift(temp2
), 8);
527 temp1
= (-step1
[11] + step1
[12]) * cospi_16_64
;
528 temp2
= (step1
[11] + step1
[12]) * cospi_16_64
;
529 step2
[11] = WRAPLOW(dct_const_round_shift(temp1
), 8);
530 step2
[12] = WRAPLOW(dct_const_round_shift(temp2
), 8);
531 step2
[14] = step1
[14];
532 step2
[15] = step1
[15];
535 output
[0] = WRAPLOW(step2
[0] + step2
[15], 8);
536 output
[1] = WRAPLOW(step2
[1] + step2
[14], 8);
537 output
[2] = WRAPLOW(step2
[2] + step2
[13], 8);
538 output
[3] = WRAPLOW(step2
[3] + step2
[12], 8);
539 output
[4] = WRAPLOW(step2
[4] + step2
[11], 8);
540 output
[5] = WRAPLOW(step2
[5] + step2
[10], 8);
541 output
[6] = WRAPLOW(step2
[6] + step2
[9], 8);
542 output
[7] = WRAPLOW(step2
[7] + step2
[8], 8);
543 output
[8] = WRAPLOW(step2
[7] - step2
[8], 8);
544 output
[9] = WRAPLOW(step2
[6] - step2
[9], 8);
545 output
[10] = WRAPLOW(step2
[5] - step2
[10], 8);
546 output
[11] = WRAPLOW(step2
[4] - step2
[11], 8);
547 output
[12] = WRAPLOW(step2
[3] - step2
[12], 8);
548 output
[13] = WRAPLOW(step2
[2] - step2
[13], 8);
549 output
[14] = WRAPLOW(step2
[1] - step2
[14], 8);
550 output
[15] = WRAPLOW(step2
[0] - step2
[15], 8);
553 void vpx_idct16x16_256_add_c(const tran_low_t
*input
, uint8_t *dest
,
555 tran_low_t out
[16 * 16];
556 tran_low_t
*outptr
= out
;
558 tran_low_t temp_in
[16], temp_out
[16];
560 // First transform rows
561 for (i
= 0; i
< 16; ++i
) {
562 idct16_c(input
, outptr
);
567 // Then transform columns
568 for (i
= 0; i
< 16; ++i
) {
569 for (j
= 0; j
< 16; ++j
)
570 temp_in
[j
] = out
[j
* 16 + i
];
571 idct16_c(temp_in
, temp_out
);
572 for (j
= 0; j
< 16; ++j
) {
573 dest
[j
* stride
+ i
] = clip_pixel_add(dest
[j
* stride
+ i
],
574 ROUND_POWER_OF_TWO(temp_out
[j
], 6));
579 void iadst16_c(const tran_low_t
*input
, tran_low_t
*output
) {
580 tran_high_t s0
, s1
, s2
, s3
, s4
, s5
, s6
, s7
, s8
;
581 tran_high_t s9
, s10
, s11
, s12
, s13
, s14
, s15
;
583 tran_high_t x0
= input
[15];
584 tran_high_t x1
= input
[0];
585 tran_high_t x2
= input
[13];
586 tran_high_t x3
= input
[2];
587 tran_high_t x4
= input
[11];
588 tran_high_t x5
= input
[4];
589 tran_high_t x6
= input
[9];
590 tran_high_t x7
= input
[6];
591 tran_high_t x8
= input
[7];
592 tran_high_t x9
= input
[8];
593 tran_high_t x10
= input
[5];
594 tran_high_t x11
= input
[10];
595 tran_high_t x12
= input
[3];
596 tran_high_t x13
= input
[12];
597 tran_high_t x14
= input
[1];
598 tran_high_t x15
= input
[14];
600 if (!(x0
| x1
| x2
| x3
| x4
| x5
| x6
| x7
| x8
601 | x9
| x10
| x11
| x12
| x13
| x14
| x15
)) {
602 output
[0] = output
[1] = output
[2] = output
[3] = output
[4]
603 = output
[5] = output
[6] = output
[7] = output
[8]
604 = output
[9] = output
[10] = output
[11] = output
[12]
605 = output
[13] = output
[14] = output
[15] = 0;
610 s0
= x0
* cospi_1_64
+ x1
* cospi_31_64
;
611 s1
= x0
* cospi_31_64
- x1
* cospi_1_64
;
612 s2
= x2
* cospi_5_64
+ x3
* cospi_27_64
;
613 s3
= x2
* cospi_27_64
- x3
* cospi_5_64
;
614 s4
= x4
* cospi_9_64
+ x5
* cospi_23_64
;
615 s5
= x4
* cospi_23_64
- x5
* cospi_9_64
;
616 s6
= x6
* cospi_13_64
+ x7
* cospi_19_64
;
617 s7
= x6
* cospi_19_64
- x7
* cospi_13_64
;
618 s8
= x8
* cospi_17_64
+ x9
* cospi_15_64
;
619 s9
= x8
* cospi_15_64
- x9
* cospi_17_64
;
620 s10
= x10
* cospi_21_64
+ x11
* cospi_11_64
;
621 s11
= x10
* cospi_11_64
- x11
* cospi_21_64
;
622 s12
= x12
* cospi_25_64
+ x13
* cospi_7_64
;
623 s13
= x12
* cospi_7_64
- x13
* cospi_25_64
;
624 s14
= x14
* cospi_29_64
+ x15
* cospi_3_64
;
625 s15
= x14
* cospi_3_64
- x15
* cospi_29_64
;
627 x0
= WRAPLOW(dct_const_round_shift(s0
+ s8
), 8);
628 x1
= WRAPLOW(dct_const_round_shift(s1
+ s9
), 8);
629 x2
= WRAPLOW(dct_const_round_shift(s2
+ s10
), 8);
630 x3
= WRAPLOW(dct_const_round_shift(s3
+ s11
), 8);
631 x4
= WRAPLOW(dct_const_round_shift(s4
+ s12
), 8);
632 x5
= WRAPLOW(dct_const_round_shift(s5
+ s13
), 8);
633 x6
= WRAPLOW(dct_const_round_shift(s6
+ s14
), 8);
634 x7
= WRAPLOW(dct_const_round_shift(s7
+ s15
), 8);
635 x8
= WRAPLOW(dct_const_round_shift(s0
- s8
), 8);
636 x9
= WRAPLOW(dct_const_round_shift(s1
- s9
), 8);
637 x10
= WRAPLOW(dct_const_round_shift(s2
- s10
), 8);
638 x11
= WRAPLOW(dct_const_round_shift(s3
- s11
), 8);
639 x12
= WRAPLOW(dct_const_round_shift(s4
- s12
), 8);
640 x13
= WRAPLOW(dct_const_round_shift(s5
- s13
), 8);
641 x14
= WRAPLOW(dct_const_round_shift(s6
- s14
), 8);
642 x15
= WRAPLOW(dct_const_round_shift(s7
- s15
), 8);
653 s8
= x8
* cospi_4_64
+ x9
* cospi_28_64
;
654 s9
= x8
* cospi_28_64
- x9
* cospi_4_64
;
655 s10
= x10
* cospi_20_64
+ x11
* cospi_12_64
;
656 s11
= x10
* cospi_12_64
- x11
* cospi_20_64
;
657 s12
= - x12
* cospi_28_64
+ x13
* cospi_4_64
;
658 s13
= x12
* cospi_4_64
+ x13
* cospi_28_64
;
659 s14
= - x14
* cospi_12_64
+ x15
* cospi_20_64
;
660 s15
= x14
* cospi_20_64
+ x15
* cospi_12_64
;
662 x0
= WRAPLOW(s0
+ s4
, 8);
663 x1
= WRAPLOW(s1
+ s5
, 8);
664 x2
= WRAPLOW(s2
+ s6
, 8);
665 x3
= WRAPLOW(s3
+ s7
, 8);
666 x4
= WRAPLOW(s0
- s4
, 8);
667 x5
= WRAPLOW(s1
- s5
, 8);
668 x6
= WRAPLOW(s2
- s6
, 8);
669 x7
= WRAPLOW(s3
- s7
, 8);
670 x8
= WRAPLOW(dct_const_round_shift(s8
+ s12
), 8);
671 x9
= WRAPLOW(dct_const_round_shift(s9
+ s13
), 8);
672 x10
= WRAPLOW(dct_const_round_shift(s10
+ s14
), 8);
673 x11
= WRAPLOW(dct_const_round_shift(s11
+ s15
), 8);
674 x12
= WRAPLOW(dct_const_round_shift(s8
- s12
), 8);
675 x13
= WRAPLOW(dct_const_round_shift(s9
- s13
), 8);
676 x14
= WRAPLOW(dct_const_round_shift(s10
- s14
), 8);
677 x15
= WRAPLOW(dct_const_round_shift(s11
- s15
), 8);
684 s4
= x4
* cospi_8_64
+ x5
* cospi_24_64
;
685 s5
= x4
* cospi_24_64
- x5
* cospi_8_64
;
686 s6
= - x6
* cospi_24_64
+ x7
* cospi_8_64
;
687 s7
= x6
* cospi_8_64
+ x7
* cospi_24_64
;
692 s12
= x12
* cospi_8_64
+ x13
* cospi_24_64
;
693 s13
= x12
* cospi_24_64
- x13
* cospi_8_64
;
694 s14
= - x14
* cospi_24_64
+ x15
* cospi_8_64
;
695 s15
= x14
* cospi_8_64
+ x15
* cospi_24_64
;
697 x0
= WRAPLOW(check_range(s0
+ s2
), 8);
698 x1
= WRAPLOW(check_range(s1
+ s3
), 8);
699 x2
= WRAPLOW(check_range(s0
- s2
), 8);
700 x3
= WRAPLOW(check_range(s1
- s3
), 8);
701 x4
= WRAPLOW(dct_const_round_shift(s4
+ s6
), 8);
702 x5
= WRAPLOW(dct_const_round_shift(s5
+ s7
), 8);
703 x6
= WRAPLOW(dct_const_round_shift(s4
- s6
), 8);
704 x7
= WRAPLOW(dct_const_round_shift(s5
- s7
), 8);
705 x8
= WRAPLOW(check_range(s8
+ s10
), 8);
706 x9
= WRAPLOW(check_range(s9
+ s11
), 8);
707 x10
= WRAPLOW(check_range(s8
- s10
), 8);
708 x11
= WRAPLOW(check_range(s9
- s11
), 8);
709 x12
= WRAPLOW(dct_const_round_shift(s12
+ s14
), 8);
710 x13
= WRAPLOW(dct_const_round_shift(s13
+ s15
), 8);
711 x14
= WRAPLOW(dct_const_round_shift(s12
- s14
), 8);
712 x15
= WRAPLOW(dct_const_round_shift(s13
- s15
), 8);
715 s2
= (- cospi_16_64
) * (x2
+ x3
);
716 s3
= cospi_16_64
* (x2
- x3
);
717 s6
= cospi_16_64
* (x6
+ x7
);
718 s7
= cospi_16_64
* (- x6
+ x7
);
719 s10
= cospi_16_64
* (x10
+ x11
);
720 s11
= cospi_16_64
* (- x10
+ x11
);
721 s14
= (- cospi_16_64
) * (x14
+ x15
);
722 s15
= cospi_16_64
* (x14
- x15
);
724 x2
= WRAPLOW(dct_const_round_shift(s2
), 8);
725 x3
= WRAPLOW(dct_const_round_shift(s3
), 8);
726 x6
= WRAPLOW(dct_const_round_shift(s6
), 8);
727 x7
= WRAPLOW(dct_const_round_shift(s7
), 8);
728 x10
= WRAPLOW(dct_const_round_shift(s10
), 8);
729 x11
= WRAPLOW(dct_const_round_shift(s11
), 8);
730 x14
= WRAPLOW(dct_const_round_shift(s14
), 8);
731 x15
= WRAPLOW(dct_const_round_shift(s15
), 8);
733 output
[0] = WRAPLOW(x0
, 8);
734 output
[1] = WRAPLOW(-x8
, 8);
735 output
[2] = WRAPLOW(x12
, 8);
736 output
[3] = WRAPLOW(-x4
, 8);
737 output
[4] = WRAPLOW(x6
, 8);
738 output
[5] = WRAPLOW(x14
, 8);
739 output
[6] = WRAPLOW(x10
, 8);
740 output
[7] = WRAPLOW(x2
, 8);
741 output
[8] = WRAPLOW(x3
, 8);
742 output
[9] = WRAPLOW(x11
, 8);
743 output
[10] = WRAPLOW(x15
, 8);
744 output
[11] = WRAPLOW(x7
, 8);
745 output
[12] = WRAPLOW(x5
, 8);
746 output
[13] = WRAPLOW(-x13
, 8);
747 output
[14] = WRAPLOW(x9
, 8);
748 output
[15] = WRAPLOW(-x1
, 8);
751 void vpx_idct16x16_10_add_c(const tran_low_t
*input
, uint8_t *dest
,
753 tran_low_t out
[16 * 16] = { 0 };
754 tran_low_t
*outptr
= out
;
756 tran_low_t temp_in
[16], temp_out
[16];
758 // First transform rows. Since all non-zero dct coefficients are in
759 // upper-left 4x4 area, we only need to calculate first 4 rows here.
760 for (i
= 0; i
< 4; ++i
) {
761 idct16_c(input
, outptr
);
766 // Then transform columns
767 for (i
= 0; i
< 16; ++i
) {
768 for (j
= 0; j
< 16; ++j
)
769 temp_in
[j
] = out
[j
*16 + i
];
770 idct16_c(temp_in
, temp_out
);
771 for (j
= 0; j
< 16; ++j
) {
772 dest
[j
* stride
+ i
] = clip_pixel_add(dest
[j
* stride
+ i
],
773 ROUND_POWER_OF_TWO(temp_out
[j
], 6));
778 void vpx_idct16x16_1_add_c(const tran_low_t
*input
, uint8_t *dest
, int stride
) {
781 tran_low_t out
= WRAPLOW(dct_const_round_shift(input
[0] * cospi_16_64
), 8);
782 out
= WRAPLOW(dct_const_round_shift(out
* cospi_16_64
), 8);
783 a1
= ROUND_POWER_OF_TWO(out
, 6);
784 for (j
= 0; j
< 16; ++j
) {
785 for (i
= 0; i
< 16; ++i
)
786 dest
[i
] = clip_pixel_add(dest
[i
], a1
);
791 void idct32_c(const tran_low_t
*input
, tran_low_t
*output
) {
792 tran_low_t step1
[32], step2
[32];
793 tran_high_t temp1
, temp2
;
797 step1
[1] = input
[16];
799 step1
[3] = input
[24];
801 step1
[5] = input
[20];
802 step1
[6] = input
[12];
803 step1
[7] = input
[28];
805 step1
[9] = input
[18];
806 step1
[10] = input
[10];
807 step1
[11] = input
[26];
808 step1
[12] = input
[6];
809 step1
[13] = input
[22];
810 step1
[14] = input
[14];
811 step1
[15] = input
[30];
813 temp1
= input
[1] * cospi_31_64
- input
[31] * cospi_1_64
;
814 temp2
= input
[1] * cospi_1_64
+ input
[31] * cospi_31_64
;
815 step1
[16] = WRAPLOW(dct_const_round_shift(temp1
), 8);
816 step1
[31] = WRAPLOW(dct_const_round_shift(temp2
), 8);
818 temp1
= input
[17] * cospi_15_64
- input
[15] * cospi_17_64
;
819 temp2
= input
[17] * cospi_17_64
+ input
[15] * cospi_15_64
;
820 step1
[17] = WRAPLOW(dct_const_round_shift(temp1
), 8);
821 step1
[30] = WRAPLOW(dct_const_round_shift(temp2
), 8);
823 temp1
= input
[9] * cospi_23_64
- input
[23] * cospi_9_64
;
824 temp2
= input
[9] * cospi_9_64
+ input
[23] * cospi_23_64
;
825 step1
[18] = WRAPLOW(dct_const_round_shift(temp1
), 8);
826 step1
[29] = WRAPLOW(dct_const_round_shift(temp2
), 8);
828 temp1
= input
[25] * cospi_7_64
- input
[7] * cospi_25_64
;
829 temp2
= input
[25] * cospi_25_64
+ input
[7] * cospi_7_64
;
830 step1
[19] = WRAPLOW(dct_const_round_shift(temp1
), 8);
831 step1
[28] = WRAPLOW(dct_const_round_shift(temp2
), 8);
833 temp1
= input
[5] * cospi_27_64
- input
[27] * cospi_5_64
;
834 temp2
= input
[5] * cospi_5_64
+ input
[27] * cospi_27_64
;
835 step1
[20] = WRAPLOW(dct_const_round_shift(temp1
), 8);
836 step1
[27] = WRAPLOW(dct_const_round_shift(temp2
), 8);
838 temp1
= input
[21] * cospi_11_64
- input
[11] * cospi_21_64
;
839 temp2
= input
[21] * cospi_21_64
+ input
[11] * cospi_11_64
;
840 step1
[21] = WRAPLOW(dct_const_round_shift(temp1
), 8);
841 step1
[26] = WRAPLOW(dct_const_round_shift(temp2
), 8);
843 temp1
= input
[13] * cospi_19_64
- input
[19] * cospi_13_64
;
844 temp2
= input
[13] * cospi_13_64
+ input
[19] * cospi_19_64
;
845 step1
[22] = WRAPLOW(dct_const_round_shift(temp1
), 8);
846 step1
[25] = WRAPLOW(dct_const_round_shift(temp2
), 8);
848 temp1
= input
[29] * cospi_3_64
- input
[3] * cospi_29_64
;
849 temp2
= input
[29] * cospi_29_64
+ input
[3] * cospi_3_64
;
850 step1
[23] = WRAPLOW(dct_const_round_shift(temp1
), 8);
851 step1
[24] = WRAPLOW(dct_const_round_shift(temp2
), 8);
863 temp1
= step1
[8] * cospi_30_64
- step1
[15] * cospi_2_64
;
864 temp2
= step1
[8] * cospi_2_64
+ step1
[15] * cospi_30_64
;
865 step2
[8] = WRAPLOW(dct_const_round_shift(temp1
), 8);
866 step2
[15] = WRAPLOW(dct_const_round_shift(temp2
), 8);
868 temp1
= step1
[9] * cospi_14_64
- step1
[14] * cospi_18_64
;
869 temp2
= step1
[9] * cospi_18_64
+ step1
[14] * cospi_14_64
;
870 step2
[9] = WRAPLOW(dct_const_round_shift(temp1
), 8);
871 step2
[14] = WRAPLOW(dct_const_round_shift(temp2
), 8);
873 temp1
= step1
[10] * cospi_22_64
- step1
[13] * cospi_10_64
;
874 temp2
= step1
[10] * cospi_10_64
+ step1
[13] * cospi_22_64
;
875 step2
[10] = WRAPLOW(dct_const_round_shift(temp1
), 8);
876 step2
[13] = WRAPLOW(dct_const_round_shift(temp2
), 8);
878 temp1
= step1
[11] * cospi_6_64
- step1
[12] * cospi_26_64
;
879 temp2
= step1
[11] * cospi_26_64
+ step1
[12] * cospi_6_64
;
880 step2
[11] = WRAPLOW(dct_const_round_shift(temp1
), 8);
881 step2
[12] = WRAPLOW(dct_const_round_shift(temp2
), 8);
883 step2
[16] = WRAPLOW(step1
[16] + step1
[17], 8);
884 step2
[17] = WRAPLOW(step1
[16] - step1
[17], 8);
885 step2
[18] = WRAPLOW(-step1
[18] + step1
[19], 8);
886 step2
[19] = WRAPLOW(step1
[18] + step1
[19], 8);
887 step2
[20] = WRAPLOW(step1
[20] + step1
[21], 8);
888 step2
[21] = WRAPLOW(step1
[20] - step1
[21], 8);
889 step2
[22] = WRAPLOW(-step1
[22] + step1
[23], 8);
890 step2
[23] = WRAPLOW(step1
[22] + step1
[23], 8);
891 step2
[24] = WRAPLOW(step1
[24] + step1
[25], 8);
892 step2
[25] = WRAPLOW(step1
[24] - step1
[25], 8);
893 step2
[26] = WRAPLOW(-step1
[26] + step1
[27], 8);
894 step2
[27] = WRAPLOW(step1
[26] + step1
[27], 8);
895 step2
[28] = WRAPLOW(step1
[28] + step1
[29], 8);
896 step2
[29] = WRAPLOW(step1
[28] - step1
[29], 8);
897 step2
[30] = WRAPLOW(-step1
[30] + step1
[31], 8);
898 step2
[31] = WRAPLOW(step1
[30] + step1
[31], 8);
906 temp1
= step2
[4] * cospi_28_64
- step2
[7] * cospi_4_64
;
907 temp2
= step2
[4] * cospi_4_64
+ step2
[7] * cospi_28_64
;
908 step1
[4] = WRAPLOW(dct_const_round_shift(temp1
), 8);
909 step1
[7] = WRAPLOW(dct_const_round_shift(temp2
), 8);
910 temp1
= step2
[5] * cospi_12_64
- step2
[6] * cospi_20_64
;
911 temp2
= step2
[5] * cospi_20_64
+ step2
[6] * cospi_12_64
;
912 step1
[5] = WRAPLOW(dct_const_round_shift(temp1
), 8);
913 step1
[6] = WRAPLOW(dct_const_round_shift(temp2
), 8);
915 step1
[8] = WRAPLOW(step2
[8] + step2
[9], 8);
916 step1
[9] = WRAPLOW(step2
[8] - step2
[9], 8);
917 step1
[10] = WRAPLOW(-step2
[10] + step2
[11], 8);
918 step1
[11] = WRAPLOW(step2
[10] + step2
[11], 8);
919 step1
[12] = WRAPLOW(step2
[12] + step2
[13], 8);
920 step1
[13] = WRAPLOW(step2
[12] - step2
[13], 8);
921 step1
[14] = WRAPLOW(-step2
[14] + step2
[15], 8);
922 step1
[15] = WRAPLOW(step2
[14] + step2
[15], 8);
924 step1
[16] = step2
[16];
925 step1
[31] = step2
[31];
926 temp1
= -step2
[17] * cospi_4_64
+ step2
[30] * cospi_28_64
;
927 temp2
= step2
[17] * cospi_28_64
+ step2
[30] * cospi_4_64
;
928 step1
[17] = WRAPLOW(dct_const_round_shift(temp1
), 8);
929 step1
[30] = WRAPLOW(dct_const_round_shift(temp2
), 8);
930 temp1
= -step2
[18] * cospi_28_64
- step2
[29] * cospi_4_64
;
931 temp2
= -step2
[18] * cospi_4_64
+ step2
[29] * cospi_28_64
;
932 step1
[18] = WRAPLOW(dct_const_round_shift(temp1
), 8);
933 step1
[29] = WRAPLOW(dct_const_round_shift(temp2
), 8);
934 step1
[19] = step2
[19];
935 step1
[20] = step2
[20];
936 temp1
= -step2
[21] * cospi_20_64
+ step2
[26] * cospi_12_64
;
937 temp2
= step2
[21] * cospi_12_64
+ step2
[26] * cospi_20_64
;
938 step1
[21] = WRAPLOW(dct_const_round_shift(temp1
), 8);
939 step1
[26] = WRAPLOW(dct_const_round_shift(temp2
), 8);
940 temp1
= -step2
[22] * cospi_12_64
- step2
[25] * cospi_20_64
;
941 temp2
= -step2
[22] * cospi_20_64
+ step2
[25] * cospi_12_64
;
942 step1
[22] = WRAPLOW(dct_const_round_shift(temp1
), 8);
943 step1
[25] = WRAPLOW(dct_const_round_shift(temp2
), 8);
944 step1
[23] = step2
[23];
945 step1
[24] = step2
[24];
946 step1
[27] = step2
[27];
947 step1
[28] = step2
[28];
950 temp1
= (step1
[0] + step1
[1]) * cospi_16_64
;
951 temp2
= (step1
[0] - step1
[1]) * cospi_16_64
;
952 step2
[0] = WRAPLOW(dct_const_round_shift(temp1
), 8);
953 step2
[1] = WRAPLOW(dct_const_round_shift(temp2
), 8);
954 temp1
= step1
[2] * cospi_24_64
- step1
[3] * cospi_8_64
;
955 temp2
= step1
[2] * cospi_8_64
+ step1
[3] * cospi_24_64
;
956 step2
[2] = WRAPLOW(dct_const_round_shift(temp1
), 8);
957 step2
[3] = WRAPLOW(dct_const_round_shift(temp2
), 8);
958 step2
[4] = WRAPLOW(step1
[4] + step1
[5], 8);
959 step2
[5] = WRAPLOW(step1
[4] - step1
[5], 8);
960 step2
[6] = WRAPLOW(-step1
[6] + step1
[7], 8);
961 step2
[7] = WRAPLOW(step1
[6] + step1
[7], 8);
964 step2
[15] = step1
[15];
965 temp1
= -step1
[9] * cospi_8_64
+ step1
[14] * cospi_24_64
;
966 temp2
= step1
[9] * cospi_24_64
+ step1
[14] * cospi_8_64
;
967 step2
[9] = WRAPLOW(dct_const_round_shift(temp1
), 8);
968 step2
[14] = WRAPLOW(dct_const_round_shift(temp2
), 8);
969 temp1
= -step1
[10] * cospi_24_64
- step1
[13] * cospi_8_64
;
970 temp2
= -step1
[10] * cospi_8_64
+ step1
[13] * cospi_24_64
;
971 step2
[10] = WRAPLOW(dct_const_round_shift(temp1
), 8);
972 step2
[13] = WRAPLOW(dct_const_round_shift(temp2
), 8);
973 step2
[11] = step1
[11];
974 step2
[12] = step1
[12];
976 step2
[16] = WRAPLOW(step1
[16] + step1
[19], 8);
977 step2
[17] = WRAPLOW(step1
[17] + step1
[18], 8);
978 step2
[18] = WRAPLOW(step1
[17] - step1
[18], 8);
979 step2
[19] = WRAPLOW(step1
[16] - step1
[19], 8);
980 step2
[20] = WRAPLOW(-step1
[20] + step1
[23], 8);
981 step2
[21] = WRAPLOW(-step1
[21] + step1
[22], 8);
982 step2
[22] = WRAPLOW(step1
[21] + step1
[22], 8);
983 step2
[23] = WRAPLOW(step1
[20] + step1
[23], 8);
985 step2
[24] = WRAPLOW(step1
[24] + step1
[27], 8);
986 step2
[25] = WRAPLOW(step1
[25] + step1
[26], 8);
987 step2
[26] = WRAPLOW(step1
[25] - step1
[26], 8);
988 step2
[27] = WRAPLOW(step1
[24] - step1
[27], 8);
989 step2
[28] = WRAPLOW(-step1
[28] + step1
[31], 8);
990 step2
[29] = WRAPLOW(-step1
[29] + step1
[30], 8);
991 step2
[30] = WRAPLOW(step1
[29] + step1
[30], 8);
992 step2
[31] = WRAPLOW(step1
[28] + step1
[31], 8);
995 step1
[0] = WRAPLOW(step2
[0] + step2
[3], 8);
996 step1
[1] = WRAPLOW(step2
[1] + step2
[2], 8);
997 step1
[2] = WRAPLOW(step2
[1] - step2
[2], 8);
998 step1
[3] = WRAPLOW(step2
[0] - step2
[3], 8);
1000 temp1
= (step2
[6] - step2
[5]) * cospi_16_64
;
1001 temp2
= (step2
[5] + step2
[6]) * cospi_16_64
;
1002 step1
[5] = WRAPLOW(dct_const_round_shift(temp1
), 8);
1003 step1
[6] = WRAPLOW(dct_const_round_shift(temp2
), 8);
1004 step1
[7] = step2
[7];
1006 step1
[8] = WRAPLOW(step2
[8] + step2
[11], 8);
1007 step1
[9] = WRAPLOW(step2
[9] + step2
[10], 8);
1008 step1
[10] = WRAPLOW(step2
[9] - step2
[10], 8);
1009 step1
[11] = WRAPLOW(step2
[8] - step2
[11], 8);
1010 step1
[12] = WRAPLOW(-step2
[12] + step2
[15], 8);
1011 step1
[13] = WRAPLOW(-step2
[13] + step2
[14], 8);
1012 step1
[14] = WRAPLOW(step2
[13] + step2
[14], 8);
1013 step1
[15] = WRAPLOW(step2
[12] + step2
[15], 8);
1015 step1
[16] = step2
[16];
1016 step1
[17] = step2
[17];
1017 temp1
= -step2
[18] * cospi_8_64
+ step2
[29] * cospi_24_64
;
1018 temp2
= step2
[18] * cospi_24_64
+ step2
[29] * cospi_8_64
;
1019 step1
[18] = WRAPLOW(dct_const_round_shift(temp1
), 8);
1020 step1
[29] = WRAPLOW(dct_const_round_shift(temp2
), 8);
1021 temp1
= -step2
[19] * cospi_8_64
+ step2
[28] * cospi_24_64
;
1022 temp2
= step2
[19] * cospi_24_64
+ step2
[28] * cospi_8_64
;
1023 step1
[19] = WRAPLOW(dct_const_round_shift(temp1
), 8);
1024 step1
[28] = WRAPLOW(dct_const_round_shift(temp2
), 8);
1025 temp1
= -step2
[20] * cospi_24_64
- step2
[27] * cospi_8_64
;
1026 temp2
= -step2
[20] * cospi_8_64
+ step2
[27] * cospi_24_64
;
1027 step1
[20] = WRAPLOW(dct_const_round_shift(temp1
), 8);
1028 step1
[27] = WRAPLOW(dct_const_round_shift(temp2
), 8);
1029 temp1
= -step2
[21] * cospi_24_64
- step2
[26] * cospi_8_64
;
1030 temp2
= -step2
[21] * cospi_8_64
+ step2
[26] * cospi_24_64
;
1031 step1
[21] = WRAPLOW(dct_const_round_shift(temp1
), 8);
1032 step1
[26] = WRAPLOW(dct_const_round_shift(temp2
), 8);
1033 step1
[22] = step2
[22];
1034 step1
[23] = step2
[23];
1035 step1
[24] = step2
[24];
1036 step1
[25] = step2
[25];
1037 step1
[30] = step2
[30];
1038 step1
[31] = step2
[31];
1041 step2
[0] = WRAPLOW(step1
[0] + step1
[7], 8);
1042 step2
[1] = WRAPLOW(step1
[1] + step1
[6], 8);
1043 step2
[2] = WRAPLOW(step1
[2] + step1
[5], 8);
1044 step2
[3] = WRAPLOW(step1
[3] + step1
[4], 8);
1045 step2
[4] = WRAPLOW(step1
[3] - step1
[4], 8);
1046 step2
[5] = WRAPLOW(step1
[2] - step1
[5], 8);
1047 step2
[6] = WRAPLOW(step1
[1] - step1
[6], 8);
1048 step2
[7] = WRAPLOW(step1
[0] - step1
[7], 8);
1049 step2
[8] = step1
[8];
1050 step2
[9] = step1
[9];
1051 temp1
= (-step1
[10] + step1
[13]) * cospi_16_64
;
1052 temp2
= (step1
[10] + step1
[13]) * cospi_16_64
;
1053 step2
[10] = WRAPLOW(dct_const_round_shift(temp1
), 8);
1054 step2
[13] = WRAPLOW(dct_const_round_shift(temp2
), 8);
1055 temp1
= (-step1
[11] + step1
[12]) * cospi_16_64
;
1056 temp2
= (step1
[11] + step1
[12]) * cospi_16_64
;
1057 step2
[11] = WRAPLOW(dct_const_round_shift(temp1
), 8);
1058 step2
[12] = WRAPLOW(dct_const_round_shift(temp2
), 8);
1059 step2
[14] = step1
[14];
1060 step2
[15] = step1
[15];
1062 step2
[16] = WRAPLOW(step1
[16] + step1
[23], 8);
1063 step2
[17] = WRAPLOW(step1
[17] + step1
[22], 8);
1064 step2
[18] = WRAPLOW(step1
[18] + step1
[21], 8);
1065 step2
[19] = WRAPLOW(step1
[19] + step1
[20], 8);
1066 step2
[20] = WRAPLOW(step1
[19] - step1
[20], 8);
1067 step2
[21] = WRAPLOW(step1
[18] - step1
[21], 8);
1068 step2
[22] = WRAPLOW(step1
[17] - step1
[22], 8);
1069 step2
[23] = WRAPLOW(step1
[16] - step1
[23], 8);
1071 step2
[24] = WRAPLOW(-step1
[24] + step1
[31], 8);
1072 step2
[25] = WRAPLOW(-step1
[25] + step1
[30], 8);
1073 step2
[26] = WRAPLOW(-step1
[26] + step1
[29], 8);
1074 step2
[27] = WRAPLOW(-step1
[27] + step1
[28], 8);
1075 step2
[28] = WRAPLOW(step1
[27] + step1
[28], 8);
1076 step2
[29] = WRAPLOW(step1
[26] + step1
[29], 8);
1077 step2
[30] = WRAPLOW(step1
[25] + step1
[30], 8);
1078 step2
[31] = WRAPLOW(step1
[24] + step1
[31], 8);
1081 step1
[0] = WRAPLOW(step2
[0] + step2
[15], 8);
1082 step1
[1] = WRAPLOW(step2
[1] + step2
[14], 8);
1083 step1
[2] = WRAPLOW(step2
[2] + step2
[13], 8);
1084 step1
[3] = WRAPLOW(step2
[3] + step2
[12], 8);
1085 step1
[4] = WRAPLOW(step2
[4] + step2
[11], 8);
1086 step1
[5] = WRAPLOW(step2
[5] + step2
[10], 8);
1087 step1
[6] = WRAPLOW(step2
[6] + step2
[9], 8);
1088 step1
[7] = WRAPLOW(step2
[7] + step2
[8], 8);
1089 step1
[8] = WRAPLOW(step2
[7] - step2
[8], 8);
1090 step1
[9] = WRAPLOW(step2
[6] - step2
[9], 8);
1091 step1
[10] = WRAPLOW(step2
[5] - step2
[10], 8);
1092 step1
[11] = WRAPLOW(step2
[4] - step2
[11], 8);
1093 step1
[12] = WRAPLOW(step2
[3] - step2
[12], 8);
1094 step1
[13] = WRAPLOW(step2
[2] - step2
[13], 8);
1095 step1
[14] = WRAPLOW(step2
[1] - step2
[14], 8);
1096 step1
[15] = WRAPLOW(step2
[0] - step2
[15], 8);
1098 step1
[16] = step2
[16];
1099 step1
[17] = step2
[17];
1100 step1
[18] = step2
[18];
1101 step1
[19] = step2
[19];
1102 temp1
= (-step2
[20] + step2
[27]) * cospi_16_64
;
1103 temp2
= (step2
[20] + step2
[27]) * cospi_16_64
;
1104 step1
[20] = WRAPLOW(dct_const_round_shift(temp1
), 8);
1105 step1
[27] = WRAPLOW(dct_const_round_shift(temp2
), 8);
1106 temp1
= (-step2
[21] + step2
[26]) * cospi_16_64
;
1107 temp2
= (step2
[21] + step2
[26]) * cospi_16_64
;
1108 step1
[21] = WRAPLOW(dct_const_round_shift(temp1
), 8);
1109 step1
[26] = WRAPLOW(dct_const_round_shift(temp2
), 8);
1110 temp1
= (-step2
[22] + step2
[25]) * cospi_16_64
;
1111 temp2
= (step2
[22] + step2
[25]) * cospi_16_64
;
1112 step1
[22] = WRAPLOW(dct_const_round_shift(temp1
), 8);
1113 step1
[25] = WRAPLOW(dct_const_round_shift(temp2
), 8);
1114 temp1
= (-step2
[23] + step2
[24]) * cospi_16_64
;
1115 temp2
= (step2
[23] + step2
[24]) * cospi_16_64
;
1116 step1
[23] = WRAPLOW(dct_const_round_shift(temp1
), 8);
1117 step1
[24] = WRAPLOW(dct_const_round_shift(temp2
), 8);
1118 step1
[28] = step2
[28];
1119 step1
[29] = step2
[29];
1120 step1
[30] = step2
[30];
1121 step1
[31] = step2
[31];
1124 output
[0] = WRAPLOW(step1
[0] + step1
[31], 8);
1125 output
[1] = WRAPLOW(step1
[1] + step1
[30], 8);
1126 output
[2] = WRAPLOW(step1
[2] + step1
[29], 8);
1127 output
[3] = WRAPLOW(step1
[3] + step1
[28], 8);
1128 output
[4] = WRAPLOW(step1
[4] + step1
[27], 8);
1129 output
[5] = WRAPLOW(step1
[5] + step1
[26], 8);
1130 output
[6] = WRAPLOW(step1
[6] + step1
[25], 8);
1131 output
[7] = WRAPLOW(step1
[7] + step1
[24], 8);
1132 output
[8] = WRAPLOW(step1
[8] + step1
[23], 8);
1133 output
[9] = WRAPLOW(step1
[9] + step1
[22], 8);
1134 output
[10] = WRAPLOW(step1
[10] + step1
[21], 8);
1135 output
[11] = WRAPLOW(step1
[11] + step1
[20], 8);
1136 output
[12] = WRAPLOW(step1
[12] + step1
[19], 8);
1137 output
[13] = WRAPLOW(step1
[13] + step1
[18], 8);
1138 output
[14] = WRAPLOW(step1
[14] + step1
[17], 8);
1139 output
[15] = WRAPLOW(step1
[15] + step1
[16], 8);
1140 output
[16] = WRAPLOW(step1
[15] - step1
[16], 8);
1141 output
[17] = WRAPLOW(step1
[14] - step1
[17], 8);
1142 output
[18] = WRAPLOW(step1
[13] - step1
[18], 8);
1143 output
[19] = WRAPLOW(step1
[12] - step1
[19], 8);
1144 output
[20] = WRAPLOW(step1
[11] - step1
[20], 8);
1145 output
[21] = WRAPLOW(step1
[10] - step1
[21], 8);
1146 output
[22] = WRAPLOW(step1
[9] - step1
[22], 8);
1147 output
[23] = WRAPLOW(step1
[8] - step1
[23], 8);
1148 output
[24] = WRAPLOW(step1
[7] - step1
[24], 8);
1149 output
[25] = WRAPLOW(step1
[6] - step1
[25], 8);
1150 output
[26] = WRAPLOW(step1
[5] - step1
[26], 8);
1151 output
[27] = WRAPLOW(step1
[4] - step1
[27], 8);
1152 output
[28] = WRAPLOW(step1
[3] - step1
[28], 8);
1153 output
[29] = WRAPLOW(step1
[2] - step1
[29], 8);
1154 output
[30] = WRAPLOW(step1
[1] - step1
[30], 8);
1155 output
[31] = WRAPLOW(step1
[0] - step1
[31], 8);
1158 void vpx_idct32x32_1024_add_c(const tran_low_t
*input
, uint8_t *dest
,
1160 tran_low_t out
[32 * 32];
1161 tran_low_t
*outptr
= out
;
1163 tran_low_t temp_in
[32], temp_out
[32];
1166 for (i
= 0; i
< 32; ++i
) {
1167 int16_t zero_coeff
[16];
1168 for (j
= 0; j
< 16; ++j
)
1169 zero_coeff
[j
] = input
[2 * j
] | input
[2 * j
+ 1];
1170 for (j
= 0; j
< 8; ++j
)
1171 zero_coeff
[j
] = zero_coeff
[2 * j
] | zero_coeff
[2 * j
+ 1];
1172 for (j
= 0; j
< 4; ++j
)
1173 zero_coeff
[j
] = zero_coeff
[2 * j
] | zero_coeff
[2 * j
+ 1];
1174 for (j
= 0; j
< 2; ++j
)
1175 zero_coeff
[j
] = zero_coeff
[2 * j
] | zero_coeff
[2 * j
+ 1];
1177 if (zero_coeff
[0] | zero_coeff
[1])
1178 idct32_c(input
, outptr
);
1180 memset(outptr
, 0, sizeof(tran_low_t
) * 32);
1186 for (i
= 0; i
< 32; ++i
) {
1187 for (j
= 0; j
< 32; ++j
)
1188 temp_in
[j
] = out
[j
* 32 + i
];
1189 idct32_c(temp_in
, temp_out
);
1190 for (j
= 0; j
< 32; ++j
) {
1191 dest
[j
* stride
+ i
] = clip_pixel_add(dest
[j
* stride
+ i
],
1192 ROUND_POWER_OF_TWO(temp_out
[j
], 6));
1197 void vpx_idct32x32_34_add_c(const tran_low_t
*input
, uint8_t *dest
,
1199 tran_low_t out
[32 * 32] = {0};
1200 tran_low_t
*outptr
= out
;
1202 tran_low_t temp_in
[32], temp_out
[32];
1205 // only upper-left 8x8 has non-zero coeff
1206 for (i
= 0; i
< 8; ++i
) {
1207 idct32_c(input
, outptr
);
1213 for (i
= 0; i
< 32; ++i
) {
1214 for (j
= 0; j
< 32; ++j
)
1215 temp_in
[j
] = out
[j
* 32 + i
];
1216 idct32_c(temp_in
, temp_out
);
1217 for (j
= 0; j
< 32; ++j
) {
1218 dest
[j
* stride
+ i
] = clip_pixel_add(dest
[j
* stride
+ i
],
1219 ROUND_POWER_OF_TWO(temp_out
[j
], 6));
1224 void vpx_idct32x32_1_add_c(const tran_low_t
*input
, uint8_t *dest
, int stride
) {
1228 tran_low_t out
= WRAPLOW(dct_const_round_shift(input
[0] * cospi_16_64
), 8);
1229 out
= WRAPLOW(dct_const_round_shift(out
* cospi_16_64
), 8);
1230 a1
= ROUND_POWER_OF_TWO(out
, 6);
1232 for (j
= 0; j
< 32; ++j
) {
1233 for (i
= 0; i
< 32; ++i
)
1234 dest
[i
] = clip_pixel_add(dest
[i
], a1
);
1239 #if CONFIG_VP9_HIGHBITDEPTH
1240 void vpx_highbd_iwht4x4_16_add_c(const tran_low_t
*input
, uint8_t *dest8
,
1241 int stride
, int bd
) {
1242 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1243 0.5 shifts per pixel. */
1245 tran_low_t output
[16];
1246 tran_high_t a1
, b1
, c1
, d1
, e1
;
1247 const tran_low_t
*ip
= input
;
1248 tran_low_t
*op
= output
;
1249 uint16_t *dest
= CONVERT_TO_SHORTPTR(dest8
);
1251 for (i
= 0; i
< 4; i
++) {
1252 a1
= ip
[0] >> UNIT_QUANT_SHIFT
;
1253 c1
= ip
[1] >> UNIT_QUANT_SHIFT
;
1254 d1
= ip
[2] >> UNIT_QUANT_SHIFT
;
1255 b1
= ip
[3] >> UNIT_QUANT_SHIFT
;
1258 e1
= (a1
- d1
) >> 1;
1263 op
[0] = WRAPLOW(a1
, bd
);
1264 op
[1] = WRAPLOW(b1
, bd
);
1265 op
[2] = WRAPLOW(c1
, bd
);
1266 op
[3] = WRAPLOW(d1
, bd
);
1272 for (i
= 0; i
< 4; i
++) {
1279 e1
= (a1
- d1
) >> 1;
1284 dest
[stride
* 0] = highbd_clip_pixel_add(dest
[stride
* 0], a1
, bd
);
1285 dest
[stride
* 1] = highbd_clip_pixel_add(dest
[stride
* 1], b1
, bd
);
1286 dest
[stride
* 2] = highbd_clip_pixel_add(dest
[stride
* 2], c1
, bd
);
1287 dest
[stride
* 3] = highbd_clip_pixel_add(dest
[stride
* 3], d1
, bd
);
1294 void vpx_highbd_iwht4x4_1_add_c(const tran_low_t
*in
, uint8_t *dest8
,
1295 int dest_stride
, int bd
) {
1299 const tran_low_t
*ip
= in
;
1300 tran_low_t
*op
= tmp
;
1301 uint16_t *dest
= CONVERT_TO_SHORTPTR(dest8
);
1304 a1
= ip
[0] >> UNIT_QUANT_SHIFT
;
1307 op
[0] = WRAPLOW(a1
, bd
);
1308 op
[1] = op
[2] = op
[3] = WRAPLOW(e1
, bd
);
1311 for (i
= 0; i
< 4; i
++) {
1314 dest
[dest_stride
* 0] = highbd_clip_pixel_add(
1315 dest
[dest_stride
* 0], a1
, bd
);
1316 dest
[dest_stride
* 1] = highbd_clip_pixel_add(
1317 dest
[dest_stride
* 1], e1
, bd
);
1318 dest
[dest_stride
* 2] = highbd_clip_pixel_add(
1319 dest
[dest_stride
* 2], e1
, bd
);
1320 dest
[dest_stride
* 3] = highbd_clip_pixel_add(
1321 dest
[dest_stride
* 3], e1
, bd
);
1327 void vpx_highbd_idct4_c(const tran_low_t
*input
, tran_low_t
*output
, int bd
) {
1329 tran_high_t temp1
, temp2
;
1332 temp1
= (input
[0] + input
[2]) * cospi_16_64
;
1333 temp2
= (input
[0] - input
[2]) * cospi_16_64
;
1334 step
[0] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1335 step
[1] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1336 temp1
= input
[1] * cospi_24_64
- input
[3] * cospi_8_64
;
1337 temp2
= input
[1] * cospi_8_64
+ input
[3] * cospi_24_64
;
1338 step
[2] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1339 step
[3] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1342 output
[0] = WRAPLOW(step
[0] + step
[3], bd
);
1343 output
[1] = WRAPLOW(step
[1] + step
[2], bd
);
1344 output
[2] = WRAPLOW(step
[1] - step
[2], bd
);
1345 output
[3] = WRAPLOW(step
[0] - step
[3], bd
);
1348 void vpx_highbd_idct4x4_16_add_c(const tran_low_t
*input
, uint8_t *dest8
,
1349 int stride
, int bd
) {
1350 tran_low_t out
[4 * 4];
1351 tran_low_t
*outptr
= out
;
1353 tran_low_t temp_in
[4], temp_out
[4];
1354 uint16_t *dest
= CONVERT_TO_SHORTPTR(dest8
);
1357 for (i
= 0; i
< 4; ++i
) {
1358 vpx_highbd_idct4_c(input
, outptr
, bd
);
1364 for (i
= 0; i
< 4; ++i
) {
1365 for (j
= 0; j
< 4; ++j
)
1366 temp_in
[j
] = out
[j
* 4 + i
];
1367 vpx_highbd_idct4_c(temp_in
, temp_out
, bd
);
1368 for (j
= 0; j
< 4; ++j
) {
1369 dest
[j
* stride
+ i
] = highbd_clip_pixel_add(
1370 dest
[j
* stride
+ i
], ROUND_POWER_OF_TWO(temp_out
[j
], 4), bd
);
1375 void vpx_highbd_idct4x4_1_add_c(const tran_low_t
*input
, uint8_t *dest8
,
1376 int dest_stride
, int bd
) {
1379 tran_low_t out
= WRAPLOW(
1380 highbd_dct_const_round_shift(input
[0] * cospi_16_64
, bd
), bd
);
1381 uint16_t *dest
= CONVERT_TO_SHORTPTR(dest8
);
1383 out
= WRAPLOW(highbd_dct_const_round_shift(out
* cospi_16_64
, bd
), bd
);
1384 a1
= ROUND_POWER_OF_TWO(out
, 4);
1386 for (i
= 0; i
< 4; i
++) {
1387 dest
[0] = highbd_clip_pixel_add(dest
[0], a1
, bd
);
1388 dest
[1] = highbd_clip_pixel_add(dest
[1], a1
, bd
);
1389 dest
[2] = highbd_clip_pixel_add(dest
[2], a1
, bd
);
1390 dest
[3] = highbd_clip_pixel_add(dest
[3], a1
, bd
);
1391 dest
+= dest_stride
;
1395 void vpx_highbd_idct8_c(const tran_low_t
*input
, tran_low_t
*output
, int bd
) {
1396 tran_low_t step1
[8], step2
[8];
1397 tran_high_t temp1
, temp2
;
1399 step1
[0] = input
[0];
1400 step1
[2] = input
[4];
1401 step1
[1] = input
[2];
1402 step1
[3] = input
[6];
1403 temp1
= input
[1] * cospi_28_64
- input
[7] * cospi_4_64
;
1404 temp2
= input
[1] * cospi_4_64
+ input
[7] * cospi_28_64
;
1405 step1
[4] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1406 step1
[7] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1407 temp1
= input
[5] * cospi_12_64
- input
[3] * cospi_20_64
;
1408 temp2
= input
[5] * cospi_20_64
+ input
[3] * cospi_12_64
;
1409 step1
[5] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1410 step1
[6] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1412 // stage 2 & stage 3 - even half
1413 vpx_highbd_idct4_c(step1
, step1
, bd
);
1415 // stage 2 - odd half
1416 step2
[4] = WRAPLOW(step1
[4] + step1
[5], bd
);
1417 step2
[5] = WRAPLOW(step1
[4] - step1
[5], bd
);
1418 step2
[6] = WRAPLOW(-step1
[6] + step1
[7], bd
);
1419 step2
[7] = WRAPLOW(step1
[6] + step1
[7], bd
);
1421 // stage 3 - odd half
1422 step1
[4] = step2
[4];
1423 temp1
= (step2
[6] - step2
[5]) * cospi_16_64
;
1424 temp2
= (step2
[5] + step2
[6]) * cospi_16_64
;
1425 step1
[5] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1426 step1
[6] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1427 step1
[7] = step2
[7];
1430 output
[0] = WRAPLOW(step1
[0] + step1
[7], bd
);
1431 output
[1] = WRAPLOW(step1
[1] + step1
[6], bd
);
1432 output
[2] = WRAPLOW(step1
[2] + step1
[5], bd
);
1433 output
[3] = WRAPLOW(step1
[3] + step1
[4], bd
);
1434 output
[4] = WRAPLOW(step1
[3] - step1
[4], bd
);
1435 output
[5] = WRAPLOW(step1
[2] - step1
[5], bd
);
1436 output
[6] = WRAPLOW(step1
[1] - step1
[6], bd
);
1437 output
[7] = WRAPLOW(step1
[0] - step1
[7], bd
);
1440 void vpx_highbd_idct8x8_64_add_c(const tran_low_t
*input
, uint8_t *dest8
,
1441 int stride
, int bd
) {
1442 tran_low_t out
[8 * 8];
1443 tran_low_t
*outptr
= out
;
1445 tran_low_t temp_in
[8], temp_out
[8];
1446 uint16_t *dest
= CONVERT_TO_SHORTPTR(dest8
);
1448 // First transform rows.
1449 for (i
= 0; i
< 8; ++i
) {
1450 vpx_highbd_idct8_c(input
, outptr
, bd
);
1455 // Then transform columns.
1456 for (i
= 0; i
< 8; ++i
) {
1457 for (j
= 0; j
< 8; ++j
)
1458 temp_in
[j
] = out
[j
* 8 + i
];
1459 vpx_highbd_idct8_c(temp_in
, temp_out
, bd
);
1460 for (j
= 0; j
< 8; ++j
) {
1461 dest
[j
* stride
+ i
] = highbd_clip_pixel_add(
1462 dest
[j
* stride
+ i
], ROUND_POWER_OF_TWO(temp_out
[j
], 5), bd
);
1467 void vpx_highbd_idct8x8_1_add_c(const tran_low_t
*input
, uint8_t *dest8
,
1468 int stride
, int bd
) {
1471 tran_low_t out
= WRAPLOW(
1472 highbd_dct_const_round_shift(input
[0] * cospi_16_64
, bd
), bd
);
1473 uint16_t *dest
= CONVERT_TO_SHORTPTR(dest8
);
1474 out
= WRAPLOW(highbd_dct_const_round_shift(out
* cospi_16_64
, bd
), bd
);
1475 a1
= ROUND_POWER_OF_TWO(out
, 5);
1476 for (j
= 0; j
< 8; ++j
) {
1477 for (i
= 0; i
< 8; ++i
)
1478 dest
[i
] = highbd_clip_pixel_add(dest
[i
], a1
, bd
);
1483 void vpx_highbd_iadst4_c(const tran_low_t
*input
, tran_low_t
*output
, int bd
) {
1484 tran_high_t s0
, s1
, s2
, s3
, s4
, s5
, s6
, s7
;
1486 tran_low_t x0
= input
[0];
1487 tran_low_t x1
= input
[1];
1488 tran_low_t x2
= input
[2];
1489 tran_low_t x3
= input
[3];
1492 if (!(x0
| x1
| x2
| x3
)) {
1493 memset(output
, 0, 4 * sizeof(*output
));
1497 s0
= sinpi_1_9
* x0
;
1498 s1
= sinpi_2_9
* x0
;
1499 s2
= sinpi_3_9
* x1
;
1500 s3
= sinpi_4_9
* x2
;
1501 s4
= sinpi_1_9
* x2
;
1502 s5
= sinpi_2_9
* x3
;
1503 s6
= sinpi_4_9
* x3
;
1504 s7
= (tran_high_t
)(x0
- x2
+ x3
);
1509 s2
= sinpi_3_9
* s7
;
1511 // 1-D transform scaling factor is sqrt(2).
1512 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
1513 // + 1b (addition) = 29b.
1514 // Hence the output bit depth is 15b.
1515 output
[0] = WRAPLOW(highbd_dct_const_round_shift(s0
+ s3
, bd
), bd
);
1516 output
[1] = WRAPLOW(highbd_dct_const_round_shift(s1
+ s3
, bd
), bd
);
1517 output
[2] = WRAPLOW(highbd_dct_const_round_shift(s2
, bd
), bd
);
1518 output
[3] = WRAPLOW(highbd_dct_const_round_shift(s0
+ s1
- s3
, bd
), bd
);
1521 void vpx_highbd_iadst8_c(const tran_low_t
*input
, tran_low_t
*output
, int bd
) {
1522 tran_high_t s0
, s1
, s2
, s3
, s4
, s5
, s6
, s7
;
1524 tran_low_t x0
= input
[7];
1525 tran_low_t x1
= input
[0];
1526 tran_low_t x2
= input
[5];
1527 tran_low_t x3
= input
[2];
1528 tran_low_t x4
= input
[3];
1529 tran_low_t x5
= input
[4];
1530 tran_low_t x6
= input
[1];
1531 tran_low_t x7
= input
[6];
1534 if (!(x0
| x1
| x2
| x3
| x4
| x5
| x6
| x7
)) {
1535 memset(output
, 0, 8 * sizeof(*output
));
1540 s0
= cospi_2_64
* x0
+ cospi_30_64
* x1
;
1541 s1
= cospi_30_64
* x0
- cospi_2_64
* x1
;
1542 s2
= cospi_10_64
* x2
+ cospi_22_64
* x3
;
1543 s3
= cospi_22_64
* x2
- cospi_10_64
* x3
;
1544 s4
= cospi_18_64
* x4
+ cospi_14_64
* x5
;
1545 s5
= cospi_14_64
* x4
- cospi_18_64
* x5
;
1546 s6
= cospi_26_64
* x6
+ cospi_6_64
* x7
;
1547 s7
= cospi_6_64
* x6
- cospi_26_64
* x7
;
1549 x0
= WRAPLOW(highbd_dct_const_round_shift(s0
+ s4
, bd
), bd
);
1550 x1
= WRAPLOW(highbd_dct_const_round_shift(s1
+ s5
, bd
), bd
);
1551 x2
= WRAPLOW(highbd_dct_const_round_shift(s2
+ s6
, bd
), bd
);
1552 x3
= WRAPLOW(highbd_dct_const_round_shift(s3
+ s7
, bd
), bd
);
1553 x4
= WRAPLOW(highbd_dct_const_round_shift(s0
- s4
, bd
), bd
);
1554 x5
= WRAPLOW(highbd_dct_const_round_shift(s1
- s5
, bd
), bd
);
1555 x6
= WRAPLOW(highbd_dct_const_round_shift(s2
- s6
, bd
), bd
);
1556 x7
= WRAPLOW(highbd_dct_const_round_shift(s3
- s7
, bd
), bd
);
1563 s4
= cospi_8_64
* x4
+ cospi_24_64
* x5
;
1564 s5
= cospi_24_64
* x4
- cospi_8_64
* x5
;
1565 s6
= -cospi_24_64
* x6
+ cospi_8_64
* x7
;
1566 s7
= cospi_8_64
* x6
+ cospi_24_64
* x7
;
1568 x0
= WRAPLOW(s0
+ s2
, bd
);
1569 x1
= WRAPLOW(s1
+ s3
, bd
);
1570 x2
= WRAPLOW(s0
- s2
, bd
);
1571 x3
= WRAPLOW(s1
- s3
, bd
);
1572 x4
= WRAPLOW(highbd_dct_const_round_shift(s4
+ s6
, bd
), bd
);
1573 x5
= WRAPLOW(highbd_dct_const_round_shift(s5
+ s7
, bd
), bd
);
1574 x6
= WRAPLOW(highbd_dct_const_round_shift(s4
- s6
, bd
), bd
);
1575 x7
= WRAPLOW(highbd_dct_const_round_shift(s5
- s7
, bd
), bd
);
1578 s2
= cospi_16_64
* (x2
+ x3
);
1579 s3
= cospi_16_64
* (x2
- x3
);
1580 s6
= cospi_16_64
* (x6
+ x7
);
1581 s7
= cospi_16_64
* (x6
- x7
);
1583 x2
= WRAPLOW(highbd_dct_const_round_shift(s2
, bd
), bd
);
1584 x3
= WRAPLOW(highbd_dct_const_round_shift(s3
, bd
), bd
);
1585 x6
= WRAPLOW(highbd_dct_const_round_shift(s6
, bd
), bd
);
1586 x7
= WRAPLOW(highbd_dct_const_round_shift(s7
, bd
), bd
);
1588 output
[0] = WRAPLOW(x0
, bd
);
1589 output
[1] = WRAPLOW(-x4
, bd
);
1590 output
[2] = WRAPLOW(x6
, bd
);
1591 output
[3] = WRAPLOW(-x2
, bd
);
1592 output
[4] = WRAPLOW(x3
, bd
);
1593 output
[5] = WRAPLOW(-x7
, bd
);
1594 output
[6] = WRAPLOW(x5
, bd
);
1595 output
[7] = WRAPLOW(-x1
, bd
);
1598 void vpx_highbd_idct8x8_10_add_c(const tran_low_t
*input
, uint8_t *dest8
,
1599 int stride
, int bd
) {
1600 tran_low_t out
[8 * 8] = { 0 };
1601 tran_low_t
*outptr
= out
;
1603 tran_low_t temp_in
[8], temp_out
[8];
1604 uint16_t *dest
= CONVERT_TO_SHORTPTR(dest8
);
1606 // First transform rows.
1607 // Only first 4 row has non-zero coefs.
1608 for (i
= 0; i
< 4; ++i
) {
1609 vpx_highbd_idct8_c(input
, outptr
, bd
);
1613 // Then transform columns.
1614 for (i
= 0; i
< 8; ++i
) {
1615 for (j
= 0; j
< 8; ++j
)
1616 temp_in
[j
] = out
[j
* 8 + i
];
1617 vpx_highbd_idct8_c(temp_in
, temp_out
, bd
);
1618 for (j
= 0; j
< 8; ++j
) {
1619 dest
[j
* stride
+ i
] = highbd_clip_pixel_add(
1620 dest
[j
* stride
+ i
], ROUND_POWER_OF_TWO(temp_out
[j
], 5), bd
);
1625 void vpx_highbd_idct16_c(const tran_low_t
*input
, tran_low_t
*output
, int bd
) {
1626 tran_low_t step1
[16], step2
[16];
1627 tran_high_t temp1
, temp2
;
1631 step1
[0] = input
[0/2];
1632 step1
[1] = input
[16/2];
1633 step1
[2] = input
[8/2];
1634 step1
[3] = input
[24/2];
1635 step1
[4] = input
[4/2];
1636 step1
[5] = input
[20/2];
1637 step1
[6] = input
[12/2];
1638 step1
[7] = input
[28/2];
1639 step1
[8] = input
[2/2];
1640 step1
[9] = input
[18/2];
1641 step1
[10] = input
[10/2];
1642 step1
[11] = input
[26/2];
1643 step1
[12] = input
[6/2];
1644 step1
[13] = input
[22/2];
1645 step1
[14] = input
[14/2];
1646 step1
[15] = input
[30/2];
1649 step2
[0] = step1
[0];
1650 step2
[1] = step1
[1];
1651 step2
[2] = step1
[2];
1652 step2
[3] = step1
[3];
1653 step2
[4] = step1
[4];
1654 step2
[5] = step1
[5];
1655 step2
[6] = step1
[6];
1656 step2
[7] = step1
[7];
1658 temp1
= step1
[8] * cospi_30_64
- step1
[15] * cospi_2_64
;
1659 temp2
= step1
[8] * cospi_2_64
+ step1
[15] * cospi_30_64
;
1660 step2
[8] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1661 step2
[15] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1663 temp1
= step1
[9] * cospi_14_64
- step1
[14] * cospi_18_64
;
1664 temp2
= step1
[9] * cospi_18_64
+ step1
[14] * cospi_14_64
;
1665 step2
[9] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1666 step2
[14] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1668 temp1
= step1
[10] * cospi_22_64
- step1
[13] * cospi_10_64
;
1669 temp2
= step1
[10] * cospi_10_64
+ step1
[13] * cospi_22_64
;
1670 step2
[10] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1671 step2
[13] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1673 temp1
= step1
[11] * cospi_6_64
- step1
[12] * cospi_26_64
;
1674 temp2
= step1
[11] * cospi_26_64
+ step1
[12] * cospi_6_64
;
1675 step2
[11] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1676 step2
[12] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1679 step1
[0] = step2
[0];
1680 step1
[1] = step2
[1];
1681 step1
[2] = step2
[2];
1682 step1
[3] = step2
[3];
1684 temp1
= step2
[4] * cospi_28_64
- step2
[7] * cospi_4_64
;
1685 temp2
= step2
[4] * cospi_4_64
+ step2
[7] * cospi_28_64
;
1686 step1
[4] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1687 step1
[7] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1688 temp1
= step2
[5] * cospi_12_64
- step2
[6] * cospi_20_64
;
1689 temp2
= step2
[5] * cospi_20_64
+ step2
[6] * cospi_12_64
;
1690 step1
[5] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1691 step1
[6] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1693 step1
[8] = WRAPLOW(step2
[8] + step2
[9], bd
);
1694 step1
[9] = WRAPLOW(step2
[8] - step2
[9], bd
);
1695 step1
[10] = WRAPLOW(-step2
[10] + step2
[11], bd
);
1696 step1
[11] = WRAPLOW(step2
[10] + step2
[11], bd
);
1697 step1
[12] = WRAPLOW(step2
[12] + step2
[13], bd
);
1698 step1
[13] = WRAPLOW(step2
[12] - step2
[13], bd
);
1699 step1
[14] = WRAPLOW(-step2
[14] + step2
[15], bd
);
1700 step1
[15] = WRAPLOW(step2
[14] + step2
[15], bd
);
1703 temp1
= (step1
[0] + step1
[1]) * cospi_16_64
;
1704 temp2
= (step1
[0] - step1
[1]) * cospi_16_64
;
1705 step2
[0] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1706 step2
[1] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1707 temp1
= step1
[2] * cospi_24_64
- step1
[3] * cospi_8_64
;
1708 temp2
= step1
[2] * cospi_8_64
+ step1
[3] * cospi_24_64
;
1709 step2
[2] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1710 step2
[3] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1711 step2
[4] = WRAPLOW(step1
[4] + step1
[5], bd
);
1712 step2
[5] = WRAPLOW(step1
[4] - step1
[5], bd
);
1713 step2
[6] = WRAPLOW(-step1
[6] + step1
[7], bd
);
1714 step2
[7] = WRAPLOW(step1
[6] + step1
[7], bd
);
1716 step2
[8] = step1
[8];
1717 step2
[15] = step1
[15];
1718 temp1
= -step1
[9] * cospi_8_64
+ step1
[14] * cospi_24_64
;
1719 temp2
= step1
[9] * cospi_24_64
+ step1
[14] * cospi_8_64
;
1720 step2
[9] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1721 step2
[14] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1722 temp1
= -step1
[10] * cospi_24_64
- step1
[13] * cospi_8_64
;
1723 temp2
= -step1
[10] * cospi_8_64
+ step1
[13] * cospi_24_64
;
1724 step2
[10] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1725 step2
[13] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1726 step2
[11] = step1
[11];
1727 step2
[12] = step1
[12];
1730 step1
[0] = WRAPLOW(step2
[0] + step2
[3], bd
);
1731 step1
[1] = WRAPLOW(step2
[1] + step2
[2], bd
);
1732 step1
[2] = WRAPLOW(step2
[1] - step2
[2], bd
);
1733 step1
[3] = WRAPLOW(step2
[0] - step2
[3], bd
);
1734 step1
[4] = step2
[4];
1735 temp1
= (step2
[6] - step2
[5]) * cospi_16_64
;
1736 temp2
= (step2
[5] + step2
[6]) * cospi_16_64
;
1737 step1
[5] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1738 step1
[6] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1739 step1
[7] = step2
[7];
1741 step1
[8] = WRAPLOW(step2
[8] + step2
[11], bd
);
1742 step1
[9] = WRAPLOW(step2
[9] + step2
[10], bd
);
1743 step1
[10] = WRAPLOW(step2
[9] - step2
[10], bd
);
1744 step1
[11] = WRAPLOW(step2
[8] - step2
[11], bd
);
1745 step1
[12] = WRAPLOW(-step2
[12] + step2
[15], bd
);
1746 step1
[13] = WRAPLOW(-step2
[13] + step2
[14], bd
);
1747 step1
[14] = WRAPLOW(step2
[13] + step2
[14], bd
);
1748 step1
[15] = WRAPLOW(step2
[12] + step2
[15], bd
);
1751 step2
[0] = WRAPLOW(step1
[0] + step1
[7], bd
);
1752 step2
[1] = WRAPLOW(step1
[1] + step1
[6], bd
);
1753 step2
[2] = WRAPLOW(step1
[2] + step1
[5], bd
);
1754 step2
[3] = WRAPLOW(step1
[3] + step1
[4], bd
);
1755 step2
[4] = WRAPLOW(step1
[3] - step1
[4], bd
);
1756 step2
[5] = WRAPLOW(step1
[2] - step1
[5], bd
);
1757 step2
[6] = WRAPLOW(step1
[1] - step1
[6], bd
);
1758 step2
[7] = WRAPLOW(step1
[0] - step1
[7], bd
);
1759 step2
[8] = step1
[8];
1760 step2
[9] = step1
[9];
1761 temp1
= (-step1
[10] + step1
[13]) * cospi_16_64
;
1762 temp2
= (step1
[10] + step1
[13]) * cospi_16_64
;
1763 step2
[10] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1764 step2
[13] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1765 temp1
= (-step1
[11] + step1
[12]) * cospi_16_64
;
1766 temp2
= (step1
[11] + step1
[12]) * cospi_16_64
;
1767 step2
[11] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
1768 step2
[12] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
1769 step2
[14] = step1
[14];
1770 step2
[15] = step1
[15];
1773 output
[0] = WRAPLOW(step2
[0] + step2
[15], bd
);
1774 output
[1] = WRAPLOW(step2
[1] + step2
[14], bd
);
1775 output
[2] = WRAPLOW(step2
[2] + step2
[13], bd
);
1776 output
[3] = WRAPLOW(step2
[3] + step2
[12], bd
);
1777 output
[4] = WRAPLOW(step2
[4] + step2
[11], bd
);
1778 output
[5] = WRAPLOW(step2
[5] + step2
[10], bd
);
1779 output
[6] = WRAPLOW(step2
[6] + step2
[9], bd
);
1780 output
[7] = WRAPLOW(step2
[7] + step2
[8], bd
);
1781 output
[8] = WRAPLOW(step2
[7] - step2
[8], bd
);
1782 output
[9] = WRAPLOW(step2
[6] - step2
[9], bd
);
1783 output
[10] = WRAPLOW(step2
[5] - step2
[10], bd
);
1784 output
[11] = WRAPLOW(step2
[4] - step2
[11], bd
);
1785 output
[12] = WRAPLOW(step2
[3] - step2
[12], bd
);
1786 output
[13] = WRAPLOW(step2
[2] - step2
[13], bd
);
1787 output
[14] = WRAPLOW(step2
[1] - step2
[14], bd
);
1788 output
[15] = WRAPLOW(step2
[0] - step2
[15], bd
);
1791 void vpx_highbd_idct16x16_256_add_c(const tran_low_t
*input
, uint8_t *dest8
,
1792 int stride
, int bd
) {
1793 tran_low_t out
[16 * 16];
1794 tran_low_t
*outptr
= out
;
1796 tran_low_t temp_in
[16], temp_out
[16];
1797 uint16_t *dest
= CONVERT_TO_SHORTPTR(dest8
);
1799 // First transform rows.
1800 for (i
= 0; i
< 16; ++i
) {
1801 vpx_highbd_idct16_c(input
, outptr
, bd
);
1806 // Then transform columns.
1807 for (i
= 0; i
< 16; ++i
) {
1808 for (j
= 0; j
< 16; ++j
)
1809 temp_in
[j
] = out
[j
* 16 + i
];
1810 vpx_highbd_idct16_c(temp_in
, temp_out
, bd
);
1811 for (j
= 0; j
< 16; ++j
) {
1812 dest
[j
* stride
+ i
] = highbd_clip_pixel_add(
1813 dest
[j
* stride
+ i
], ROUND_POWER_OF_TWO(temp_out
[j
], 6), bd
);
1818 void vpx_highbd_iadst16_c(const tran_low_t
*input
, tran_low_t
*output
, int bd
) {
1819 tran_high_t s0
, s1
, s2
, s3
, s4
, s5
, s6
, s7
, s8
;
1820 tran_high_t s9
, s10
, s11
, s12
, s13
, s14
, s15
;
1822 tran_low_t x0
= input
[15];
1823 tran_low_t x1
= input
[0];
1824 tran_low_t x2
= input
[13];
1825 tran_low_t x3
= input
[2];
1826 tran_low_t x4
= input
[11];
1827 tran_low_t x5
= input
[4];
1828 tran_low_t x6
= input
[9];
1829 tran_low_t x7
= input
[6];
1830 tran_low_t x8
= input
[7];
1831 tran_low_t x9
= input
[8];
1832 tran_low_t x10
= input
[5];
1833 tran_low_t x11
= input
[10];
1834 tran_low_t x12
= input
[3];
1835 tran_low_t x13
= input
[12];
1836 tran_low_t x14
= input
[1];
1837 tran_low_t x15
= input
[14];
1840 if (!(x0
| x1
| x2
| x3
| x4
| x5
| x6
| x7
| x8
1841 | x9
| x10
| x11
| x12
| x13
| x14
| x15
)) {
1842 memset(output
, 0, 16 * sizeof(*output
));
1847 s0
= x0
* cospi_1_64
+ x1
* cospi_31_64
;
1848 s1
= x0
* cospi_31_64
- x1
* cospi_1_64
;
1849 s2
= x2
* cospi_5_64
+ x3
* cospi_27_64
;
1850 s3
= x2
* cospi_27_64
- x3
* cospi_5_64
;
1851 s4
= x4
* cospi_9_64
+ x5
* cospi_23_64
;
1852 s5
= x4
* cospi_23_64
- x5
* cospi_9_64
;
1853 s6
= x6
* cospi_13_64
+ x7
* cospi_19_64
;
1854 s7
= x6
* cospi_19_64
- x7
* cospi_13_64
;
1855 s8
= x8
* cospi_17_64
+ x9
* cospi_15_64
;
1856 s9
= x8
* cospi_15_64
- x9
* cospi_17_64
;
1857 s10
= x10
* cospi_21_64
+ x11
* cospi_11_64
;
1858 s11
= x10
* cospi_11_64
- x11
* cospi_21_64
;
1859 s12
= x12
* cospi_25_64
+ x13
* cospi_7_64
;
1860 s13
= x12
* cospi_7_64
- x13
* cospi_25_64
;
1861 s14
= x14
* cospi_29_64
+ x15
* cospi_3_64
;
1862 s15
= x14
* cospi_3_64
- x15
* cospi_29_64
;
1864 x0
= WRAPLOW(highbd_dct_const_round_shift(s0
+ s8
, bd
), bd
);
1865 x1
= WRAPLOW(highbd_dct_const_round_shift(s1
+ s9
, bd
), bd
);
1866 x2
= WRAPLOW(highbd_dct_const_round_shift(s2
+ s10
, bd
), bd
);
1867 x3
= WRAPLOW(highbd_dct_const_round_shift(s3
+ s11
, bd
), bd
);
1868 x4
= WRAPLOW(highbd_dct_const_round_shift(s4
+ s12
, bd
), bd
);
1869 x5
= WRAPLOW(highbd_dct_const_round_shift(s5
+ s13
, bd
), bd
);
1870 x6
= WRAPLOW(highbd_dct_const_round_shift(s6
+ s14
, bd
), bd
);
1871 x7
= WRAPLOW(highbd_dct_const_round_shift(s7
+ s15
, bd
), bd
);
1872 x8
= WRAPLOW(highbd_dct_const_round_shift(s0
- s8
, bd
), bd
);
1873 x9
= WRAPLOW(highbd_dct_const_round_shift(s1
- s9
, bd
), bd
);
1874 x10
= WRAPLOW(highbd_dct_const_round_shift(s2
- s10
, bd
), bd
);
1875 x11
= WRAPLOW(highbd_dct_const_round_shift(s3
- s11
, bd
), bd
);
1876 x12
= WRAPLOW(highbd_dct_const_round_shift(s4
- s12
, bd
), bd
);
1877 x13
= WRAPLOW(highbd_dct_const_round_shift(s5
- s13
, bd
), bd
);
1878 x14
= WRAPLOW(highbd_dct_const_round_shift(s6
- s14
, bd
), bd
);
1879 x15
= WRAPLOW(highbd_dct_const_round_shift(s7
- s15
, bd
), bd
);
1890 s8
= x8
* cospi_4_64
+ x9
* cospi_28_64
;
1891 s9
= x8
* cospi_28_64
- x9
* cospi_4_64
;
1892 s10
= x10
* cospi_20_64
+ x11
* cospi_12_64
;
1893 s11
= x10
* cospi_12_64
- x11
* cospi_20_64
;
1894 s12
= -x12
* cospi_28_64
+ x13
* cospi_4_64
;
1895 s13
= x12
* cospi_4_64
+ x13
* cospi_28_64
;
1896 s14
= -x14
* cospi_12_64
+ x15
* cospi_20_64
;
1897 s15
= x14
* cospi_20_64
+ x15
* cospi_12_64
;
1899 x0
= WRAPLOW(s0
+ s4
, bd
);
1900 x1
= WRAPLOW(s1
+ s5
, bd
);
1901 x2
= WRAPLOW(s2
+ s6
, bd
);
1902 x3
= WRAPLOW(s3
+ s7
, bd
);
1903 x4
= WRAPLOW(s0
- s4
, bd
);
1904 x5
= WRAPLOW(s1
- s5
, bd
);
1905 x6
= WRAPLOW(s2
- s6
, bd
);
1906 x7
= WRAPLOW(s3
- s7
, bd
);
1907 x8
= WRAPLOW(highbd_dct_const_round_shift(s8
+ s12
, bd
), bd
);
1908 x9
= WRAPLOW(highbd_dct_const_round_shift(s9
+ s13
, bd
), bd
);
1909 x10
= WRAPLOW(highbd_dct_const_round_shift(s10
+ s14
, bd
), bd
);
1910 x11
= WRAPLOW(highbd_dct_const_round_shift(s11
+ s15
, bd
), bd
);
1911 x12
= WRAPLOW(highbd_dct_const_round_shift(s8
- s12
, bd
), bd
);
1912 x13
= WRAPLOW(highbd_dct_const_round_shift(s9
- s13
, bd
), bd
);
1913 x14
= WRAPLOW(highbd_dct_const_round_shift(s10
- s14
, bd
), bd
);
1914 x15
= WRAPLOW(highbd_dct_const_round_shift(s11
- s15
, bd
), bd
);
1921 s4
= x4
* cospi_8_64
+ x5
* cospi_24_64
;
1922 s5
= x4
* cospi_24_64
- x5
* cospi_8_64
;
1923 s6
= -x6
* cospi_24_64
+ x7
* cospi_8_64
;
1924 s7
= x6
* cospi_8_64
+ x7
* cospi_24_64
;
1929 s12
= x12
* cospi_8_64
+ x13
* cospi_24_64
;
1930 s13
= x12
* cospi_24_64
- x13
* cospi_8_64
;
1931 s14
= -x14
* cospi_24_64
+ x15
* cospi_8_64
;
1932 s15
= x14
* cospi_8_64
+ x15
* cospi_24_64
;
1934 x0
= WRAPLOW(s0
+ s2
, bd
);
1935 x1
= WRAPLOW(s1
+ s3
, bd
);
1936 x2
= WRAPLOW(s0
- s2
, bd
);
1937 x3
= WRAPLOW(s1
- s3
, bd
);
1938 x4
= WRAPLOW(highbd_dct_const_round_shift(s4
+ s6
, bd
), bd
);
1939 x5
= WRAPLOW(highbd_dct_const_round_shift(s5
+ s7
, bd
), bd
);
1940 x6
= WRAPLOW(highbd_dct_const_round_shift(s4
- s6
, bd
), bd
);
1941 x7
= WRAPLOW(highbd_dct_const_round_shift(s5
- s7
, bd
), bd
);
1942 x8
= WRAPLOW(s8
+ s10
, bd
);
1943 x9
= WRAPLOW(s9
+ s11
, bd
);
1944 x10
= WRAPLOW(s8
- s10
, bd
);
1945 x11
= WRAPLOW(s9
- s11
, bd
);
1946 x12
= WRAPLOW(highbd_dct_const_round_shift(s12
+ s14
, bd
), bd
);
1947 x13
= WRAPLOW(highbd_dct_const_round_shift(s13
+ s15
, bd
), bd
);
1948 x14
= WRAPLOW(highbd_dct_const_round_shift(s12
- s14
, bd
), bd
);
1949 x15
= WRAPLOW(highbd_dct_const_round_shift(s13
- s15
, bd
), bd
);
1952 s2
= (- cospi_16_64
) * (x2
+ x3
);
1953 s3
= cospi_16_64
* (x2
- x3
);
1954 s6
= cospi_16_64
* (x6
+ x7
);
1955 s7
= cospi_16_64
* (-x6
+ x7
);
1956 s10
= cospi_16_64
* (x10
+ x11
);
1957 s11
= cospi_16_64
* (-x10
+ x11
);
1958 s14
= (- cospi_16_64
) * (x14
+ x15
);
1959 s15
= cospi_16_64
* (x14
- x15
);
1961 x2
= WRAPLOW(highbd_dct_const_round_shift(s2
, bd
), bd
);
1962 x3
= WRAPLOW(highbd_dct_const_round_shift(s3
, bd
), bd
);
1963 x6
= WRAPLOW(highbd_dct_const_round_shift(s6
, bd
), bd
);
1964 x7
= WRAPLOW(highbd_dct_const_round_shift(s7
, bd
), bd
);
1965 x10
= WRAPLOW(highbd_dct_const_round_shift(s10
, bd
), bd
);
1966 x11
= WRAPLOW(highbd_dct_const_round_shift(s11
, bd
), bd
);
1967 x14
= WRAPLOW(highbd_dct_const_round_shift(s14
, bd
), bd
);
1968 x15
= WRAPLOW(highbd_dct_const_round_shift(s15
, bd
), bd
);
1970 output
[0] = WRAPLOW(x0
, bd
);
1971 output
[1] = WRAPLOW(-x8
, bd
);
1972 output
[2] = WRAPLOW(x12
, bd
);
1973 output
[3] = WRAPLOW(-x4
, bd
);
1974 output
[4] = WRAPLOW(x6
, bd
);
1975 output
[5] = WRAPLOW(x14
, bd
);
1976 output
[6] = WRAPLOW(x10
, bd
);
1977 output
[7] = WRAPLOW(x2
, bd
);
1978 output
[8] = WRAPLOW(x3
, bd
);
1979 output
[9] = WRAPLOW(x11
, bd
);
1980 output
[10] = WRAPLOW(x15
, bd
);
1981 output
[11] = WRAPLOW(x7
, bd
);
1982 output
[12] = WRAPLOW(x5
, bd
);
1983 output
[13] = WRAPLOW(-x13
, bd
);
1984 output
[14] = WRAPLOW(x9
, bd
);
1985 output
[15] = WRAPLOW(-x1
, bd
);
1988 void vpx_highbd_idct16x16_10_add_c(const tran_low_t
*input
, uint8_t *dest8
,
1989 int stride
, int bd
) {
1990 tran_low_t out
[16 * 16] = { 0 };
1991 tran_low_t
*outptr
= out
;
1993 tran_low_t temp_in
[16], temp_out
[16];
1994 uint16_t *dest
= CONVERT_TO_SHORTPTR(dest8
);
1996 // First transform rows. Since all non-zero dct coefficients are in
1997 // upper-left 4x4 area, we only need to calculate first 4 rows here.
1998 for (i
= 0; i
< 4; ++i
) {
1999 vpx_highbd_idct16_c(input
, outptr
, bd
);
2004 // Then transform columns.
2005 for (i
= 0; i
< 16; ++i
) {
2006 for (j
= 0; j
< 16; ++j
)
2007 temp_in
[j
] = out
[j
*16 + i
];
2008 vpx_highbd_idct16_c(temp_in
, temp_out
, bd
);
2009 for (j
= 0; j
< 16; ++j
) {
2010 dest
[j
* stride
+ i
] = highbd_clip_pixel_add(
2011 dest
[j
* stride
+ i
], ROUND_POWER_OF_TWO(temp_out
[j
], 6), bd
);
2016 void vpx_highbd_idct16x16_1_add_c(const tran_low_t
*input
, uint8_t *dest8
,
2017 int stride
, int bd
) {
2020 tran_low_t out
= WRAPLOW(
2021 highbd_dct_const_round_shift(input
[0] * cospi_16_64
, bd
), bd
);
2022 uint16_t *dest
= CONVERT_TO_SHORTPTR(dest8
);
2024 out
= WRAPLOW(highbd_dct_const_round_shift(out
* cospi_16_64
, bd
), bd
);
2025 a1
= ROUND_POWER_OF_TWO(out
, 6);
2026 for (j
= 0; j
< 16; ++j
) {
2027 for (i
= 0; i
< 16; ++i
)
2028 dest
[i
] = highbd_clip_pixel_add(dest
[i
], a1
, bd
);
2033 static void highbd_idct32_c(const tran_low_t
*input
,
2034 tran_low_t
*output
, int bd
) {
2035 tran_low_t step1
[32], step2
[32];
2036 tran_high_t temp1
, temp2
;
2040 step1
[0] = input
[0];
2041 step1
[1] = input
[16];
2042 step1
[2] = input
[8];
2043 step1
[3] = input
[24];
2044 step1
[4] = input
[4];
2045 step1
[5] = input
[20];
2046 step1
[6] = input
[12];
2047 step1
[7] = input
[28];
2048 step1
[8] = input
[2];
2049 step1
[9] = input
[18];
2050 step1
[10] = input
[10];
2051 step1
[11] = input
[26];
2052 step1
[12] = input
[6];
2053 step1
[13] = input
[22];
2054 step1
[14] = input
[14];
2055 step1
[15] = input
[30];
2057 temp1
= input
[1] * cospi_31_64
- input
[31] * cospi_1_64
;
2058 temp2
= input
[1] * cospi_1_64
+ input
[31] * cospi_31_64
;
2059 step1
[16] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2060 step1
[31] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2062 temp1
= input
[17] * cospi_15_64
- input
[15] * cospi_17_64
;
2063 temp2
= input
[17] * cospi_17_64
+ input
[15] * cospi_15_64
;
2064 step1
[17] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2065 step1
[30] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2067 temp1
= input
[9] * cospi_23_64
- input
[23] * cospi_9_64
;
2068 temp2
= input
[9] * cospi_9_64
+ input
[23] * cospi_23_64
;
2069 step1
[18] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2070 step1
[29] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2072 temp1
= input
[25] * cospi_7_64
- input
[7] * cospi_25_64
;
2073 temp2
= input
[25] * cospi_25_64
+ input
[7] * cospi_7_64
;
2074 step1
[19] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2075 step1
[28] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2077 temp1
= input
[5] * cospi_27_64
- input
[27] * cospi_5_64
;
2078 temp2
= input
[5] * cospi_5_64
+ input
[27] * cospi_27_64
;
2079 step1
[20] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2080 step1
[27] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2082 temp1
= input
[21] * cospi_11_64
- input
[11] * cospi_21_64
;
2083 temp2
= input
[21] * cospi_21_64
+ input
[11] * cospi_11_64
;
2084 step1
[21] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2085 step1
[26] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2087 temp1
= input
[13] * cospi_19_64
- input
[19] * cospi_13_64
;
2088 temp2
= input
[13] * cospi_13_64
+ input
[19] * cospi_19_64
;
2089 step1
[22] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2090 step1
[25] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2092 temp1
= input
[29] * cospi_3_64
- input
[3] * cospi_29_64
;
2093 temp2
= input
[29] * cospi_29_64
+ input
[3] * cospi_3_64
;
2094 step1
[23] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2095 step1
[24] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2098 step2
[0] = step1
[0];
2099 step2
[1] = step1
[1];
2100 step2
[2] = step1
[2];
2101 step2
[3] = step1
[3];
2102 step2
[4] = step1
[4];
2103 step2
[5] = step1
[5];
2104 step2
[6] = step1
[6];
2105 step2
[7] = step1
[7];
2107 temp1
= step1
[8] * cospi_30_64
- step1
[15] * cospi_2_64
;
2108 temp2
= step1
[8] * cospi_2_64
+ step1
[15] * cospi_30_64
;
2109 step2
[8] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2110 step2
[15] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2112 temp1
= step1
[9] * cospi_14_64
- step1
[14] * cospi_18_64
;
2113 temp2
= step1
[9] * cospi_18_64
+ step1
[14] * cospi_14_64
;
2114 step2
[9] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2115 step2
[14] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2117 temp1
= step1
[10] * cospi_22_64
- step1
[13] * cospi_10_64
;
2118 temp2
= step1
[10] * cospi_10_64
+ step1
[13] * cospi_22_64
;
2119 step2
[10] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2120 step2
[13] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2122 temp1
= step1
[11] * cospi_6_64
- step1
[12] * cospi_26_64
;
2123 temp2
= step1
[11] * cospi_26_64
+ step1
[12] * cospi_6_64
;
2124 step2
[11] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2125 step2
[12] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2127 step2
[16] = WRAPLOW(step1
[16] + step1
[17], bd
);
2128 step2
[17] = WRAPLOW(step1
[16] - step1
[17], bd
);
2129 step2
[18] = WRAPLOW(-step1
[18] + step1
[19], bd
);
2130 step2
[19] = WRAPLOW(step1
[18] + step1
[19], bd
);
2131 step2
[20] = WRAPLOW(step1
[20] + step1
[21], bd
);
2132 step2
[21] = WRAPLOW(step1
[20] - step1
[21], bd
);
2133 step2
[22] = WRAPLOW(-step1
[22] + step1
[23], bd
);
2134 step2
[23] = WRAPLOW(step1
[22] + step1
[23], bd
);
2135 step2
[24] = WRAPLOW(step1
[24] + step1
[25], bd
);
2136 step2
[25] = WRAPLOW(step1
[24] - step1
[25], bd
);
2137 step2
[26] = WRAPLOW(-step1
[26] + step1
[27], bd
);
2138 step2
[27] = WRAPLOW(step1
[26] + step1
[27], bd
);
2139 step2
[28] = WRAPLOW(step1
[28] + step1
[29], bd
);
2140 step2
[29] = WRAPLOW(step1
[28] - step1
[29], bd
);
2141 step2
[30] = WRAPLOW(-step1
[30] + step1
[31], bd
);
2142 step2
[31] = WRAPLOW(step1
[30] + step1
[31], bd
);
2145 step1
[0] = step2
[0];
2146 step1
[1] = step2
[1];
2147 step1
[2] = step2
[2];
2148 step1
[3] = step2
[3];
2150 temp1
= step2
[4] * cospi_28_64
- step2
[7] * cospi_4_64
;
2151 temp2
= step2
[4] * cospi_4_64
+ step2
[7] * cospi_28_64
;
2152 step1
[4] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2153 step1
[7] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2154 temp1
= step2
[5] * cospi_12_64
- step2
[6] * cospi_20_64
;
2155 temp2
= step2
[5] * cospi_20_64
+ step2
[6] * cospi_12_64
;
2156 step1
[5] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2157 step1
[6] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2159 step1
[8] = WRAPLOW(step2
[8] + step2
[9], bd
);
2160 step1
[9] = WRAPLOW(step2
[8] - step2
[9], bd
);
2161 step1
[10] = WRAPLOW(-step2
[10] + step2
[11], bd
);
2162 step1
[11] = WRAPLOW(step2
[10] + step2
[11], bd
);
2163 step1
[12] = WRAPLOW(step2
[12] + step2
[13], bd
);
2164 step1
[13] = WRAPLOW(step2
[12] - step2
[13], bd
);
2165 step1
[14] = WRAPLOW(-step2
[14] + step2
[15], bd
);
2166 step1
[15] = WRAPLOW(step2
[14] + step2
[15], bd
);
2168 step1
[16] = step2
[16];
2169 step1
[31] = step2
[31];
2170 temp1
= -step2
[17] * cospi_4_64
+ step2
[30] * cospi_28_64
;
2171 temp2
= step2
[17] * cospi_28_64
+ step2
[30] * cospi_4_64
;
2172 step1
[17] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2173 step1
[30] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2174 temp1
= -step2
[18] * cospi_28_64
- step2
[29] * cospi_4_64
;
2175 temp2
= -step2
[18] * cospi_4_64
+ step2
[29] * cospi_28_64
;
2176 step1
[18] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2177 step1
[29] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2178 step1
[19] = step2
[19];
2179 step1
[20] = step2
[20];
2180 temp1
= -step2
[21] * cospi_20_64
+ step2
[26] * cospi_12_64
;
2181 temp2
= step2
[21] * cospi_12_64
+ step2
[26] * cospi_20_64
;
2182 step1
[21] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2183 step1
[26] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2184 temp1
= -step2
[22] * cospi_12_64
- step2
[25] * cospi_20_64
;
2185 temp2
= -step2
[22] * cospi_20_64
+ step2
[25] * cospi_12_64
;
2186 step1
[22] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2187 step1
[25] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2188 step1
[23] = step2
[23];
2189 step1
[24] = step2
[24];
2190 step1
[27] = step2
[27];
2191 step1
[28] = step2
[28];
2194 temp1
= (step1
[0] + step1
[1]) * cospi_16_64
;
2195 temp2
= (step1
[0] - step1
[1]) * cospi_16_64
;
2196 step2
[0] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2197 step2
[1] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2198 temp1
= step1
[2] * cospi_24_64
- step1
[3] * cospi_8_64
;
2199 temp2
= step1
[2] * cospi_8_64
+ step1
[3] * cospi_24_64
;
2200 step2
[2] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2201 step2
[3] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2202 step2
[4] = WRAPLOW(step1
[4] + step1
[5], bd
);
2203 step2
[5] = WRAPLOW(step1
[4] - step1
[5], bd
);
2204 step2
[6] = WRAPLOW(-step1
[6] + step1
[7], bd
);
2205 step2
[7] = WRAPLOW(step1
[6] + step1
[7], bd
);
2207 step2
[8] = step1
[8];
2208 step2
[15] = step1
[15];
2209 temp1
= -step1
[9] * cospi_8_64
+ step1
[14] * cospi_24_64
;
2210 temp2
= step1
[9] * cospi_24_64
+ step1
[14] * cospi_8_64
;
2211 step2
[9] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2212 step2
[14] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2213 temp1
= -step1
[10] * cospi_24_64
- step1
[13] * cospi_8_64
;
2214 temp2
= -step1
[10] * cospi_8_64
+ step1
[13] * cospi_24_64
;
2215 step2
[10] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2216 step2
[13] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2217 step2
[11] = step1
[11];
2218 step2
[12] = step1
[12];
2220 step2
[16] = WRAPLOW(step1
[16] + step1
[19], bd
);
2221 step2
[17] = WRAPLOW(step1
[17] + step1
[18], bd
);
2222 step2
[18] = WRAPLOW(step1
[17] - step1
[18], bd
);
2223 step2
[19] = WRAPLOW(step1
[16] - step1
[19], bd
);
2224 step2
[20] = WRAPLOW(-step1
[20] + step1
[23], bd
);
2225 step2
[21] = WRAPLOW(-step1
[21] + step1
[22], bd
);
2226 step2
[22] = WRAPLOW(step1
[21] + step1
[22], bd
);
2227 step2
[23] = WRAPLOW(step1
[20] + step1
[23], bd
);
2229 step2
[24] = WRAPLOW(step1
[24] + step1
[27], bd
);
2230 step2
[25] = WRAPLOW(step1
[25] + step1
[26], bd
);
2231 step2
[26] = WRAPLOW(step1
[25] - step1
[26], bd
);
2232 step2
[27] = WRAPLOW(step1
[24] - step1
[27], bd
);
2233 step2
[28] = WRAPLOW(-step1
[28] + step1
[31], bd
);
2234 step2
[29] = WRAPLOW(-step1
[29] + step1
[30], bd
);
2235 step2
[30] = WRAPLOW(step1
[29] + step1
[30], bd
);
2236 step2
[31] = WRAPLOW(step1
[28] + step1
[31], bd
);
2239 step1
[0] = WRAPLOW(step2
[0] + step2
[3], bd
);
2240 step1
[1] = WRAPLOW(step2
[1] + step2
[2], bd
);
2241 step1
[2] = WRAPLOW(step2
[1] - step2
[2], bd
);
2242 step1
[3] = WRAPLOW(step2
[0] - step2
[3], bd
);
2243 step1
[4] = step2
[4];
2244 temp1
= (step2
[6] - step2
[5]) * cospi_16_64
;
2245 temp2
= (step2
[5] + step2
[6]) * cospi_16_64
;
2246 step1
[5] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2247 step1
[6] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2248 step1
[7] = step2
[7];
2250 step1
[8] = WRAPLOW(step2
[8] + step2
[11], bd
);
2251 step1
[9] = WRAPLOW(step2
[9] + step2
[10], bd
);
2252 step1
[10] = WRAPLOW(step2
[9] - step2
[10], bd
);
2253 step1
[11] = WRAPLOW(step2
[8] - step2
[11], bd
);
2254 step1
[12] = WRAPLOW(-step2
[12] + step2
[15], bd
);
2255 step1
[13] = WRAPLOW(-step2
[13] + step2
[14], bd
);
2256 step1
[14] = WRAPLOW(step2
[13] + step2
[14], bd
);
2257 step1
[15] = WRAPLOW(step2
[12] + step2
[15], bd
);
2259 step1
[16] = step2
[16];
2260 step1
[17] = step2
[17];
2261 temp1
= -step2
[18] * cospi_8_64
+ step2
[29] * cospi_24_64
;
2262 temp2
= step2
[18] * cospi_24_64
+ step2
[29] * cospi_8_64
;
2263 step1
[18] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2264 step1
[29] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2265 temp1
= -step2
[19] * cospi_8_64
+ step2
[28] * cospi_24_64
;
2266 temp2
= step2
[19] * cospi_24_64
+ step2
[28] * cospi_8_64
;
2267 step1
[19] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2268 step1
[28] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2269 temp1
= -step2
[20] * cospi_24_64
- step2
[27] * cospi_8_64
;
2270 temp2
= -step2
[20] * cospi_8_64
+ step2
[27] * cospi_24_64
;
2271 step1
[20] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2272 step1
[27] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2273 temp1
= -step2
[21] * cospi_24_64
- step2
[26] * cospi_8_64
;
2274 temp2
= -step2
[21] * cospi_8_64
+ step2
[26] * cospi_24_64
;
2275 step1
[21] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2276 step1
[26] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2277 step1
[22] = step2
[22];
2278 step1
[23] = step2
[23];
2279 step1
[24] = step2
[24];
2280 step1
[25] = step2
[25];
2281 step1
[30] = step2
[30];
2282 step1
[31] = step2
[31];
2285 step2
[0] = WRAPLOW(step1
[0] + step1
[7], bd
);
2286 step2
[1] = WRAPLOW(step1
[1] + step1
[6], bd
);
2287 step2
[2] = WRAPLOW(step1
[2] + step1
[5], bd
);
2288 step2
[3] = WRAPLOW(step1
[3] + step1
[4], bd
);
2289 step2
[4] = WRAPLOW(step1
[3] - step1
[4], bd
);
2290 step2
[5] = WRAPLOW(step1
[2] - step1
[5], bd
);
2291 step2
[6] = WRAPLOW(step1
[1] - step1
[6], bd
);
2292 step2
[7] = WRAPLOW(step1
[0] - step1
[7], bd
);
2293 step2
[8] = step1
[8];
2294 step2
[9] = step1
[9];
2295 temp1
= (-step1
[10] + step1
[13]) * cospi_16_64
;
2296 temp2
= (step1
[10] + step1
[13]) * cospi_16_64
;
2297 step2
[10] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2298 step2
[13] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2299 temp1
= (-step1
[11] + step1
[12]) * cospi_16_64
;
2300 temp2
= (step1
[11] + step1
[12]) * cospi_16_64
;
2301 step2
[11] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2302 step2
[12] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2303 step2
[14] = step1
[14];
2304 step2
[15] = step1
[15];
2306 step2
[16] = WRAPLOW(step1
[16] + step1
[23], bd
);
2307 step2
[17] = WRAPLOW(step1
[17] + step1
[22], bd
);
2308 step2
[18] = WRAPLOW(step1
[18] + step1
[21], bd
);
2309 step2
[19] = WRAPLOW(step1
[19] + step1
[20], bd
);
2310 step2
[20] = WRAPLOW(step1
[19] - step1
[20], bd
);
2311 step2
[21] = WRAPLOW(step1
[18] - step1
[21], bd
);
2312 step2
[22] = WRAPLOW(step1
[17] - step1
[22], bd
);
2313 step2
[23] = WRAPLOW(step1
[16] - step1
[23], bd
);
2315 step2
[24] = WRAPLOW(-step1
[24] + step1
[31], bd
);
2316 step2
[25] = WRAPLOW(-step1
[25] + step1
[30], bd
);
2317 step2
[26] = WRAPLOW(-step1
[26] + step1
[29], bd
);
2318 step2
[27] = WRAPLOW(-step1
[27] + step1
[28], bd
);
2319 step2
[28] = WRAPLOW(step1
[27] + step1
[28], bd
);
2320 step2
[29] = WRAPLOW(step1
[26] + step1
[29], bd
);
2321 step2
[30] = WRAPLOW(step1
[25] + step1
[30], bd
);
2322 step2
[31] = WRAPLOW(step1
[24] + step1
[31], bd
);
2325 step1
[0] = WRAPLOW(step2
[0] + step2
[15], bd
);
2326 step1
[1] = WRAPLOW(step2
[1] + step2
[14], bd
);
2327 step1
[2] = WRAPLOW(step2
[2] + step2
[13], bd
);
2328 step1
[3] = WRAPLOW(step2
[3] + step2
[12], bd
);
2329 step1
[4] = WRAPLOW(step2
[4] + step2
[11], bd
);
2330 step1
[5] = WRAPLOW(step2
[5] + step2
[10], bd
);
2331 step1
[6] = WRAPLOW(step2
[6] + step2
[9], bd
);
2332 step1
[7] = WRAPLOW(step2
[7] + step2
[8], bd
);
2333 step1
[8] = WRAPLOW(step2
[7] - step2
[8], bd
);
2334 step1
[9] = WRAPLOW(step2
[6] - step2
[9], bd
);
2335 step1
[10] = WRAPLOW(step2
[5] - step2
[10], bd
);
2336 step1
[11] = WRAPLOW(step2
[4] - step2
[11], bd
);
2337 step1
[12] = WRAPLOW(step2
[3] - step2
[12], bd
);
2338 step1
[13] = WRAPLOW(step2
[2] - step2
[13], bd
);
2339 step1
[14] = WRAPLOW(step2
[1] - step2
[14], bd
);
2340 step1
[15] = WRAPLOW(step2
[0] - step2
[15], bd
);
2342 step1
[16] = step2
[16];
2343 step1
[17] = step2
[17];
2344 step1
[18] = step2
[18];
2345 step1
[19] = step2
[19];
2346 temp1
= (-step2
[20] + step2
[27]) * cospi_16_64
;
2347 temp2
= (step2
[20] + step2
[27]) * cospi_16_64
;
2348 step1
[20] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2349 step1
[27] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2350 temp1
= (-step2
[21] + step2
[26]) * cospi_16_64
;
2351 temp2
= (step2
[21] + step2
[26]) * cospi_16_64
;
2352 step1
[21] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2353 step1
[26] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2354 temp1
= (-step2
[22] + step2
[25]) * cospi_16_64
;
2355 temp2
= (step2
[22] + step2
[25]) * cospi_16_64
;
2356 step1
[22] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2357 step1
[25] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2358 temp1
= (-step2
[23] + step2
[24]) * cospi_16_64
;
2359 temp2
= (step2
[23] + step2
[24]) * cospi_16_64
;
2360 step1
[23] = WRAPLOW(highbd_dct_const_round_shift(temp1
, bd
), bd
);
2361 step1
[24] = WRAPLOW(highbd_dct_const_round_shift(temp2
, bd
), bd
);
2362 step1
[28] = step2
[28];
2363 step1
[29] = step2
[29];
2364 step1
[30] = step2
[30];
2365 step1
[31] = step2
[31];
2368 output
[0] = WRAPLOW(step1
[0] + step1
[31], bd
);
2369 output
[1] = WRAPLOW(step1
[1] + step1
[30], bd
);
2370 output
[2] = WRAPLOW(step1
[2] + step1
[29], bd
);
2371 output
[3] = WRAPLOW(step1
[3] + step1
[28], bd
);
2372 output
[4] = WRAPLOW(step1
[4] + step1
[27], bd
);
2373 output
[5] = WRAPLOW(step1
[5] + step1
[26], bd
);
2374 output
[6] = WRAPLOW(step1
[6] + step1
[25], bd
);
2375 output
[7] = WRAPLOW(step1
[7] + step1
[24], bd
);
2376 output
[8] = WRAPLOW(step1
[8] + step1
[23], bd
);
2377 output
[9] = WRAPLOW(step1
[9] + step1
[22], bd
);
2378 output
[10] = WRAPLOW(step1
[10] + step1
[21], bd
);
2379 output
[11] = WRAPLOW(step1
[11] + step1
[20], bd
);
2380 output
[12] = WRAPLOW(step1
[12] + step1
[19], bd
);
2381 output
[13] = WRAPLOW(step1
[13] + step1
[18], bd
);
2382 output
[14] = WRAPLOW(step1
[14] + step1
[17], bd
);
2383 output
[15] = WRAPLOW(step1
[15] + step1
[16], bd
);
2384 output
[16] = WRAPLOW(step1
[15] - step1
[16], bd
);
2385 output
[17] = WRAPLOW(step1
[14] - step1
[17], bd
);
2386 output
[18] = WRAPLOW(step1
[13] - step1
[18], bd
);
2387 output
[19] = WRAPLOW(step1
[12] - step1
[19], bd
);
2388 output
[20] = WRAPLOW(step1
[11] - step1
[20], bd
);
2389 output
[21] = WRAPLOW(step1
[10] - step1
[21], bd
);
2390 output
[22] = WRAPLOW(step1
[9] - step1
[22], bd
);
2391 output
[23] = WRAPLOW(step1
[8] - step1
[23], bd
);
2392 output
[24] = WRAPLOW(step1
[7] - step1
[24], bd
);
2393 output
[25] = WRAPLOW(step1
[6] - step1
[25], bd
);
2394 output
[26] = WRAPLOW(step1
[5] - step1
[26], bd
);
2395 output
[27] = WRAPLOW(step1
[4] - step1
[27], bd
);
2396 output
[28] = WRAPLOW(step1
[3] - step1
[28], bd
);
2397 output
[29] = WRAPLOW(step1
[2] - step1
[29], bd
);
2398 output
[30] = WRAPLOW(step1
[1] - step1
[30], bd
);
2399 output
[31] = WRAPLOW(step1
[0] - step1
[31], bd
);
2402 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t
*input
, uint8_t *dest8
,
2403 int stride
, int bd
) {
2404 tran_low_t out
[32 * 32];
2405 tran_low_t
*outptr
= out
;
2407 tran_low_t temp_in
[32], temp_out
[32];
2408 uint16_t *dest
= CONVERT_TO_SHORTPTR(dest8
);
2411 for (i
= 0; i
< 32; ++i
) {
2412 tran_low_t zero_coeff
[16];
2413 for (j
= 0; j
< 16; ++j
)
2414 zero_coeff
[j
] = input
[2 * j
] | input
[2 * j
+ 1];
2415 for (j
= 0; j
< 8; ++j
)
2416 zero_coeff
[j
] = zero_coeff
[2 * j
] | zero_coeff
[2 * j
+ 1];
2417 for (j
= 0; j
< 4; ++j
)
2418 zero_coeff
[j
] = zero_coeff
[2 * j
] | zero_coeff
[2 * j
+ 1];
2419 for (j
= 0; j
< 2; ++j
)
2420 zero_coeff
[j
] = zero_coeff
[2 * j
] | zero_coeff
[2 * j
+ 1];
2422 if (zero_coeff
[0] | zero_coeff
[1])
2423 highbd_idct32_c(input
, outptr
, bd
);
2425 memset(outptr
, 0, sizeof(tran_low_t
) * 32);
2431 for (i
= 0; i
< 32; ++i
) {
2432 for (j
= 0; j
< 32; ++j
)
2433 temp_in
[j
] = out
[j
* 32 + i
];
2434 highbd_idct32_c(temp_in
, temp_out
, bd
);
2435 for (j
= 0; j
< 32; ++j
) {
2436 dest
[j
* stride
+ i
] = highbd_clip_pixel_add(
2437 dest
[j
* stride
+ i
], ROUND_POWER_OF_TWO(temp_out
[j
], 6), bd
);
2442 void vpx_highbd_idct32x32_34_add_c(const tran_low_t
*input
, uint8_t *dest8
,
2443 int stride
, int bd
) {
2444 tran_low_t out
[32 * 32] = {0};
2445 tran_low_t
*outptr
= out
;
2447 tran_low_t temp_in
[32], temp_out
[32];
2448 uint16_t *dest
= CONVERT_TO_SHORTPTR(dest8
);
2451 // Only upper-left 8x8 has non-zero coeff.
2452 for (i
= 0; i
< 8; ++i
) {
2453 highbd_idct32_c(input
, outptr
, bd
);
2458 for (i
= 0; i
< 32; ++i
) {
2459 for (j
= 0; j
< 32; ++j
)
2460 temp_in
[j
] = out
[j
* 32 + i
];
2461 highbd_idct32_c(temp_in
, temp_out
, bd
);
2462 for (j
= 0; j
< 32; ++j
) {
2463 dest
[j
* stride
+ i
] = highbd_clip_pixel_add(
2464 dest
[j
* stride
+ i
], ROUND_POWER_OF_TWO(temp_out
[j
], 6), bd
);
2469 void vpx_highbd_idct32x32_1_add_c(const tran_low_t
*input
, uint8_t *dest8
,
2470 int stride
, int bd
) {
2473 uint16_t *dest
= CONVERT_TO_SHORTPTR(dest8
);
2475 tran_low_t out
= WRAPLOW(
2476 highbd_dct_const_round_shift(input
[0] * cospi_16_64
, bd
), bd
);
2477 out
= WRAPLOW(highbd_dct_const_round_shift(out
* cospi_16_64
, bd
), bd
);
2478 a1
= ROUND_POWER_OF_TWO(out
, 6);
2480 for (j
= 0; j
< 32; ++j
) {
2481 for (i
= 0; i
< 32; ++i
)
2482 dest
[i
] = highbd_clip_pixel_add(dest
[i
], a1
, bd
);
2486 #endif // CONFIG_VP9_HIGHBITDEPTH