jpeg/jidctfst.c

   1 /*
   2  * jidctfst.c
   3  *
   4  * Copyright (C) 1994-1998, Thomas G. Lane.
   5  * This file is part of the Independent JPEG Group's software.
   6  * For conditions of distribution and use, see the accompanying README file.
   7  *
   8  * This file contains a fast, not so accurate integer implementation of the
   9  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
  10  * must also perform dequantization of the input coefficients.
  11  *
  12  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
  13  * on each row (or vice versa, but it's more convenient to emit a row at
  14  * a time).  Direct algorithms are also available, but they are much more
  15  * complex and seem not to be any faster when reduced to code.
  16  *
  17  * This implementation is based on Arai, Agui, and Nakajima's algorithm for
  18  * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
  19  * Japanese, but the algorithm is described in the Pennebaker & Mitchell
  20  * JPEG textbook (see REFERENCES section in file README).  The following code
  21  * is based directly on figure 4-8 in P&M.
  22  * While an 8-point DCT cannot be done in less than 11 multiplies, it is
  23  * possible to arrange the computation so that many of the multiplies are
  24  * simple scalings of the final outputs.  These multiplies can then be
  25  * folded into the multiplications or divisions by the JPEG quantization
  26  * table entries.  The AA&N method leaves only 5 multiplies and 29 adds
  27  * to be done in the DCT itself.
  28  * The primary disadvantage of this method is that with fixed-point math,
  29  * accuracy is lost due to imprecise representation of the scaled
  30  * quantization values.  The smaller the quantization table entry, the less
  31  * precise the scaled value, so this implementation does worse with high-
  32  * quality-setting files than with low-quality ones.
  33  */
  34
  35 #define JPEG_INTERNALS
  36 #include "jinclude.h"
  37 #include "jpeglib.h"
  38 #include "jdct.h"               /* Private declarations for DCT subsystem */
  39
  40
  41 #ifdef DCT_IFAST_SUPPORTED
  42
  43
  44 /*
  45  * This module is specialized to the case DCTSIZE = 8.
  46  */
  47
  48 #if DCTSIZE != 8
  49   Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
  50 #endif
  51
  52
  53 /* Scaling decisions are generally the same as in the LL&M algorithm;
  54  * see jidctint.c for more details.  However, we choose to descale
  55  * (right shift) multiplication products as soon as they are formed,
  56  * rather than carrying additional fractional bits into subsequent additions.
  57  * This compromises accuracy slightly, but it lets us save a few shifts.
  58  * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
  59  * everywhere except in the multiplications proper; this saves a good deal
  60  * of work on 16-bit-int machines.
  61  *
  62  * The dequantized coefficients are not integers because the AA&N scaling
  63  * factors have been incorporated.  We represent them scaled up by PASS1_BITS,
  64  * so that the first and second IDCT rounds have the same input scaling.
  65  * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
  66  * avoid a descaling shift; this compromises accuracy rather drastically
  67  * for small quantization table entries, but it saves a lot of shifts.
  68  * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway,
  69  * so we use a much larger scaling factor to preserve accuracy.
  70  *
  71  * A final compromise is to represent the multiplicative constants to only
  72  * 8 fractional bits, rather than 13.  This saves some shifting work on some
  73  * machines, and may also reduce the cost of multiplication (since there
  74  * are fewer one-bits in the constants).
  75  */
  76
  77 #if BITS_IN_JSAMPLE == 8
  78 #define CONST_BITS  8           /* fractional bits kept in the FIX_* multipliers */
  79 #define PASS1_BITS  2           /* extra scale bits carried from pass 1 into pass 2 */
  80 #else
  81 #define CONST_BITS  8           /* same multiplier precision when samples are not 8-bit */
  82 #define PASS1_BITS  1           /* lose a little precision to avoid overflow */
  83 #endif
  84
  85 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
  86  * causing a lot of useless floating-point operations at run time.
  87  * To get around this we use the following pre-calculated constants.
  88  * If you change CONST_BITS you may want to add appropriate values.
  89  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
  90  */
  91
  92 #if CONST_BITS == 8             /* literals below are the constants times 2**8, rounded */
  93 #define FIX_1_082392200  ((INT32)  277)         /* FIX(1.082392200) */
  94 #define FIX_1_414213562  ((INT32)  362)         /* FIX(1.414213562) */
  95 #define FIX_1_847759065  ((INT32)  473)         /* FIX(1.847759065) */
  96 #define FIX_2_613125930  ((INT32)  669)         /* FIX(2.613125930) */
  97 #else                           /* any other CONST_BITS: defer to FIX(), which is not defined in this file */
  98 #define FIX_1_082392200  FIX(1.082392200)
  99 #define FIX_1_414213562  FIX(1.414213562)
 100 #define FIX_1_847759065  FIX(1.847759065)
 101 #define FIX_2_613125930  FIX(2.613125930)
 102 #endif
 103
 104
 105 /* We can gain a little more speed, with a further compromise in accuracy,
 106  * by omitting the addition in a descaling shift.  This yields an incorrectly
 107  * rounded result half the time...
 108  */
 109
 110 #ifndef USE_ACCURATE_ROUNDING
 111 #undef DESCALE                  /* discard any earlier definition (presumably from an included header) */
 112 #define DESCALE(x,n)  RIGHT_SHIFT(x, n)       /* plain shift, no rounding term added */
 113 #endif
 114
 115
 116 /* Multiply a DCTELEM variable by an INT32 constant, and immediately
 117  * descale to yield a DCTELEM result.
 118  */
 119
 120 #define MULTIPLY(var,const)  ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
 121
 122
 123 /* Dequantize a coefficient by multiplying it by the multiplier-table
 124  * entry; produce a DCTELEM result.  For 8-bit data a 16x16->16
 125  * multiplication will do.  For 12-bit data, the multiplier table is
 126  * declared INT32, so a 32-bit multiply will be used.
 127  */
 128
 129 #if BITS_IN_JSAMPLE == 8        /* product is used as-is: no descaling shift in this case */
 130 #define DEQUANTIZE(coef,quantval)  (((IFAST_MULT_TYPE) (coef)) * (quantval))
 131 #else                           /* product is shifted right by IFAST_SCALE_BITS-PASS1_BITS */
 132 #define DEQUANTIZE(coef,quantval)  \
 133         DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
 134 #endif
 135
 136
 137 /* Like DESCALE, but applies to a DCTELEM and produces an int.
 138  * We assume that int right shift is unsigned if INT32 right shift is.
 139  */
 140
 141 #ifdef RIGHT_SHIFT_IS_UNSIGNED  /* >> does not sign-extend here, so the macro below patches the top bits */
 142 #define ISHIFT_TEMPS    DCTELEM ishift_temp;    /* scratch local so that x is evaluated only once */
 143 #if BITS_IN_JSAMPLE == 8
 144 #define DCTELEMBITS  16         /* DCTELEM may be 16 or 32 bits */
 145 #else
 146 #define DCTELEMBITS  32         /* DCTELEM must be 32 bits */
 147 #endif
 148 #define IRIGHT_SHIFT(x,shft)  \
 149     ((ishift_temp = (x)) < 0 ? \
 150      (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \
 151      (ishift_temp >> (shft)))   /* negative x: OR ones into bit DCTELEMBITS-shft and above */
 152 #else                           /* no fix-up needed: ISHIFT_TEMPS expands to nothing */
 153 #define ISHIFT_TEMPS
 154 #define IRIGHT_SHIFT(x,shft)    ((x) >> (shft))
 155 #endif
 156 /* IDESCALE: shift x right by n and cast to int; the rounding form first adds 2**(n-1). */
 157 #ifdef USE_ACCURATE_ROUNDING
 158 #define IDESCALE(x,n)  ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n))
 159 #else
 160 #define IDESCALE(x,n)  ((int) IRIGHT_SHIFT(x, n))
 161 #endif
 162
 163 #ifdef HAVE_MMX_INTEL_MNEMONICS   /* forward declarations of the two routines the dispatcher below calls */
 164 __inline GLOBAL(void)
 165 jpeg_idct_ifast_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 166                  JCOEFPTR coef_block,
 167                  JSAMPARRAY output_buf, JDIMENSION output_col);  /* inline-assembly version */
 168 __inline GLOBAL(void)
 169 jpeg_idct_ifast_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 170                  JCOEFPTR coef_block,
 171                  JSAMPARRAY output_buf, JDIMENSION output_col);  /* C version */
 172 #endif
 173 /* Entry point, declared unconditionally; which definition follows depends on HAVE_MMX_INTEL_MNEMONICS. */
 174 GLOBAL(void)
 175 jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info * compptr,
 176                  JCOEFPTR coef_block,
 177                  JSAMPARRAY output_buf, JDIMENSION output_col);
 178
 179
 180 #ifdef HAVE_MMX_INTEL_MNEMONICS
 181 GLOBAL(void)
 182 jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 183                  JCOEFPTR coef_block,
 184                  JSAMPARRAY output_buf, JDIMENSION output_col)
 185 {
 186   if (! MMXAvailable)   /* flag is zero: hand the block to the C routine */
 187     jpeg_idct_ifast_orig(cinfo, compptr, coef_block, output_buf, output_col);
 188   else                  /* flag is nonzero: hand the block to the MMX routine */
 189     jpeg_idct_ifast_mmx(cinfo, compptr, coef_block, output_buf, output_col);
 190 }
 191 #else
 192
 193 /*
 194  * Perform dequantization and inverse DCT on one block of coefficients.
 195  * Plain C version, compiled when HAVE_MMX_INTEL_MNEMONICS is not defined.
 196  * Reads coef_block and compptr->dct_table; writes output_buf[0..7] at output_col. */
 197 GLOBAL (void)
 198 jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 199                  JCOEFPTR coef_block,
 200                  JSAMPARRAY output_buf, JDIMENSION output_col)
 201 {
 202   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 203   DCTELEM tmp10, tmp11, tmp12, tmp13;
 204   DCTELEM z5, z10, z11, z12, z13;
 205   JCOEFPTR inptr;               /* current column within coef_block */
 206   IFAST_MULT_TYPE * quantptr;   /* matching column of the multiplier table */
 207   int * wsptr;                  /* cursor into workspace */
 208   JSAMPROW outptr;              /* first sample of the output row being written */
 209   JSAMPLE *range_limit = IDCT_range_limit(cinfo);  /* table indexed to produce each output sample */
 210   int ctr;
 211   int workspace[DCTSIZE2];      /* buffers data between passes */
 212   SHIFT_TEMPS                   /* for DESCALE */
 213   ISHIFT_TEMPS                  /* for IDESCALE */
 214
 215   /* Pass 1: process columns from input, store into work array. */
 216
 217   inptr = coef_block;
 218   quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
 219   wsptr = workspace;
 220   for (ctr = DCTSIZE; ctr > 0; ctr--) {
 221     /* Due to quantization, we will usually find that many of the input
 222      * coefficients are zero, especially the AC terms.  We can exploit this
 223      * by short-circuiting the IDCT calculation for any column in which all
 224      * the AC terms are zero.  In that case each output is equal to the
 225      * DC coefficient (with scale factor as needed).
 226      * With typical images and quantization tables, half or more of the
 227      * column DCT calculations can be simplified this way.
 228      */
 229
 230     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
 231         inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
 232         inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
 233         inptr[DCTSIZE*7] == 0) {
 234       /* AC terms all zero */
 235       int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 236       /* copy the dequantized DC term into all eight rows of this workspace column */
 237       wsptr[DCTSIZE*0] = dcval;
 238       wsptr[DCTSIZE*1] = dcval;
 239       wsptr[DCTSIZE*2] = dcval;
 240       wsptr[DCTSIZE*3] = dcval;
 241       wsptr[DCTSIZE*4] = dcval;
 242       wsptr[DCTSIZE*5] = dcval;
 243       wsptr[DCTSIZE*6] = dcval;
 244       wsptr[DCTSIZE*7] = dcval;
 245
 246       inptr++;                  /* advance pointers to next column */
 247       quantptr++;
 248       wsptr++;
 249       continue;
 250     }
 251     /* General case: at least one AC term in this column is nonzero. */
 252     /* Even part */
 253
 254     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 255     tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 256     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
 257     tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
 258
 259     tmp10 = tmp0 + tmp2;        /* phase 3 */
 260     tmp11 = tmp0 - tmp2;
 261
 262     tmp13 = tmp1 + tmp3;        /* phases 5-3 */
 263     tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
 264
 265     tmp0 = tmp10 + tmp13;       /* phase 2 */
 266     tmp3 = tmp10 - tmp13;
 267     tmp1 = tmp11 + tmp12;
 268     tmp2 = tmp11 - tmp12;
 269
 270     /* Odd part */
 271
 272     tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 273     tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 274     tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
 275     tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
 276
 277     z13 = tmp6 + tmp5;          /* phase 6 */
 278     z10 = tmp6 - tmp5;
 279     z11 = tmp4 + tmp7;
 280     z12 = tmp4 - tmp7;
 281
 282     tmp7 = z11 + z13;           /* phase 5 */
 283     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
 284
 285     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
 286     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
 287     tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
 288
 289     tmp6 = tmp12 - tmp7;        /* phase 2 */
 290     tmp5 = tmp11 - tmp6;
 291     tmp4 = tmp10 + tmp5;
 292     /* Combine even and odd halves; note that tmp3 +/- tmp4 go to rows 4 and 3. */
 293     wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
 294     wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
 295     wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
 296     wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
 297     wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
 298     wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
 299     wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
 300     wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
 301
 302     inptr++;                    /* advance pointers to next column */
 303     quantptr++;
 304     wsptr++;
 305   }
 306
 307   /* Pass 2: process rows from work array, store into output array. */
 308   /* Note that we must descale the results by a factor of 8 == 2**3, */
 309   /* and also undo the PASS1_BITS scaling. */
 310
 311   wsptr = workspace;
 312   for (ctr = 0; ctr < DCTSIZE; ctr++) {
 313     outptr = output_buf[ctr] + output_col;
 314     /* Rows of zeroes can be exploited in the same way as we did with columns.
 315      * However, the column calculation has created many nonzero AC terms, so
 316      * the simplification applies less often (typically 5% to 10% of the time).
 317      * On machines with very fast multiplication, it's possible that the
 318      * test takes more time than it's worth.  In that case this section
 319      * may be commented out.
 320      */
 321
 322 #ifndef NO_ZERO_ROW_TEST
 323     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
 324         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
 325       /* AC terms all zero */
 326       JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3)
 327                                   & RANGE_MASK];
 328       /* all eight samples of this output row receive the same value */
 329       outptr[0] = dcval;
 330       outptr[1] = dcval;
 331       outptr[2] = dcval;
 332       outptr[3] = dcval;
 333       outptr[4] = dcval;
 334       outptr[5] = dcval;
 335       outptr[6] = dcval;
 336       outptr[7] = dcval;
 337
 338       wsptr += DCTSIZE;         /* advance pointer to next row */
 339       continue;
 340     }
 341 #endif
 342     /* General case: same butterfly as pass 1, reading the workspace row directly. */
 343     /* Even part */
 344
 345     tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
 346     tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
 347
 348     tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
 349     tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562)
 350             - tmp13;
 351
 352     tmp0 = tmp10 + tmp13;
 353     tmp3 = tmp10 - tmp13;
 354     tmp1 = tmp11 + tmp12;
 355     tmp2 = tmp11 - tmp12;
 356
 357     /* Odd part */
 358
 359     z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
 360     z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
 361     z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
 362     z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
 363
 364     tmp7 = z11 + z13;           /* phase 5 */
 365     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
 366
 367     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
 368     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
 369     tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
 370
 371     tmp6 = tmp12 - tmp7;        /* phase 2 */
 372     tmp5 = tmp11 - tmp6;
 373     tmp4 = tmp10 + tmp5;
 374
 375     /* Final output stage: scale down by a factor of 8 and range-limit */
 376     /* Each result is shifted by PASS1_BITS+3, masked with RANGE_MASK, then looked up in range_limit. */
 377     outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
 378                             & RANGE_MASK];
 379     outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
 380                             & RANGE_MASK];
 381     outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
 382                             & RANGE_MASK];
 383     outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
 384                             & RANGE_MASK];
 385     outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
 386                             & RANGE_MASK];
 387     outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
 388                             & RANGE_MASK];
 389     outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
 390                             & RANGE_MASK];
 391     outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
 392                             & RANGE_MASK];
 393
 394     wsptr += DCTSIZE;           /* advance pointer to next row */
 395   }
 396 }
 397
 398 #endif
 399
 400 #ifdef HAVE_MMX_INTEL_MNEMONICS
 401 /* C implementation for the MMX build: the dispatcher calls it when MMXAvailable
 402  * is zero.  Spelled __inline to match its forward declaration earlier in the file. */
 403 __inline GLOBAL(void)
 404 jpeg_idct_ifast_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 405                  JCOEFPTR coef_block,
 406                  JSAMPARRAY output_buf, JDIMENSION output_col)
 407 {
 408   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 409   DCTELEM tmp10, tmp11, tmp12, tmp13;
 410   DCTELEM z5, z10, z11, z12, z13;
 411   JCOEFPTR inptr;               /* steps across the columns of coef_block */
 412   IFAST_MULT_TYPE * quantptr;   /* steps across compptr->dct_table in parallel */
 413   int * wsptr;                  /* position in workspace */
 414   JSAMPROW outptr;              /* output row for the current pass-2 iteration */
 415   JSAMPLE *range_limit = IDCT_range_limit(cinfo);  /* lookup applied to every final value */
 416   int ctr;
 417   int workspace[DCTSIZE2];      /* buffers data between passes */
 418   SHIFT_TEMPS                   /* for DESCALE */
 419   ISHIFT_TEMPS                  /* for IDESCALE */
 420
 421   /* Pass 1: process columns from input, store into work array. */
 422
 423   inptr = coef_block;
 424   quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
 425   wsptr = workspace;
 426   for (ctr = DCTSIZE; ctr > 0; ctr--) {
 427     /* Due to quantization, we will usually find that many of the input
 428      * coefficients are zero, especially the AC terms.  We can exploit this
 429      * by short-circuiting the IDCT calculation for any column in which all
 430      * the AC terms are zero.  In that case each output is equal to the
 431      * DC coefficient (with scale factor as needed).
 432      * With typical images and quantization tables, half or more of the
 433      * column DCT calculations can be simplified this way.
 434      */
 435     /* OR-ing the seven AC terms lets one comparison test them all for zero. */
 436     if ((inptr[DCTSIZE*1] | inptr[DCTSIZE*2] | inptr[DCTSIZE*3] |
 437          inptr[DCTSIZE*4] | inptr[DCTSIZE*5] | inptr[DCTSIZE*6] |
 438          inptr[DCTSIZE*7]) == 0) {
 439       /* AC terms all zero */
 440       int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 441       /* the dequantized DC term fills this whole workspace column */
 442       wsptr[DCTSIZE*0] = dcval;
 443       wsptr[DCTSIZE*1] = dcval;
 444       wsptr[DCTSIZE*2] = dcval;
 445       wsptr[DCTSIZE*3] = dcval;
 446       wsptr[DCTSIZE*4] = dcval;
 447       wsptr[DCTSIZE*5] = dcval;
 448       wsptr[DCTSIZE*6] = dcval;
 449       wsptr[DCTSIZE*7] = dcval;
 450
 451       inptr++;                  /* advance pointers to next column */
 452       quantptr++;
 453       wsptr++;
 454       continue;
 455     }
 456     /* Otherwise run the full column transform. */
 457     /* Even part */
 458
 459     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 460     tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 461     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
 462     tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
 463
 464     tmp10 = tmp0 + tmp2;        /* phase 3 */
 465     tmp11 = tmp0 - tmp2;
 466
 467     tmp13 = tmp1 + tmp3;        /* phases 5-3 */
 468     tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
 469
 470     tmp0 = tmp10 + tmp13;       /* phase 2 */
 471     tmp3 = tmp10 - tmp13;
 472     tmp1 = tmp11 + tmp12;
 473     tmp2 = tmp11 - tmp12;
 474
 475     /* Odd part */
 476
 477     tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 478     tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 479     tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
 480     tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
 481
 482     z13 = tmp6 + tmp5;          /* phase 6 */
 483     z10 = tmp6 - tmp5;
 484     z11 = tmp4 + tmp7;
 485     z12 = tmp4 - tmp7;
 486
 487     tmp7 = z11 + z13;           /* phase 5 */
 488     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
 489
 490     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
 491     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
 492     tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
 493
 494     tmp6 = tmp12 - tmp7;        /* phase 2 */
 495     tmp5 = tmp11 - tmp6;
 496     tmp4 = tmp10 + tmp5;
 497     /* Write the eight results; rows 4 and 3 are the tmp3 +/- tmp4 pair. */
 498     wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
 499     wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
 500     wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
 501     wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
 502     wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
 503     wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
 504     wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
 505     wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
 506
 507     inptr++;                    /* advance pointers to next column */
 508     quantptr++;
 509     wsptr++;
 510   }
 511
 512   /* Pass 2: process rows from work array, store into output array. */
 513   /* Note that we must descale the results by a factor of 8 == 2**3, */
 514   /* and also undo the PASS1_BITS scaling. */
 515
 516   wsptr = workspace;
 517   for (ctr = 0; ctr < DCTSIZE; ctr++) {
 518     outptr = output_buf[ctr] + output_col;
 519     /* Rows of zeroes can be exploited in the same way as we did with columns.
 520      * However, the column calculation has created many nonzero AC terms, so
 521      * the simplification applies less often (typically 5% to 10% of the time).
 522      * On machines with very fast multiplication, it's possible that the
 523      * test takes more time than it's worth.  In that case this section
 524      * may be commented out.
 525      */
 526
 527 #ifndef NO_ZERO_ROW_TEST
 528     if ((wsptr[1] | wsptr[2] | wsptr[3] | wsptr[4] | wsptr[5] | wsptr[6] |
 529          wsptr[7]) == 0) {
 530       /* AC terms all zero */
 531       JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3)
 532                                   & RANGE_MASK];
 533       /* one looked-up value is stored to all eight samples of the row */
 534       outptr[0] = dcval;
 535       outptr[1] = dcval;
 536       outptr[2] = dcval;
 537       outptr[3] = dcval;
 538       outptr[4] = dcval;
 539       outptr[5] = dcval;
 540       outptr[6] = dcval;
 541       outptr[7] = dcval;
 542
 543       wsptr += DCTSIZE;         /* advance pointer to next row */
 544       continue;
 545     }
 546 #endif
 547     /* Otherwise run the full row transform on the workspace values. */
 548     /* Even part */
 549
 550     tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
 551     tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
 552
 553     tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
 554     tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562)
 555             - tmp13;
 556
 557     tmp0 = tmp10 + tmp13;
 558     tmp3 = tmp10 - tmp13;
 559     tmp1 = tmp11 + tmp12;
 560     tmp2 = tmp11 - tmp12;
 561
 562     /* Odd part */
 563
 564     z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
 565     z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
 566     z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
 567     z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
 568
 569     tmp7 = z11 + z13;           /* phase 5 */
 570     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
 571
 572     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
 573     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
 574     tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
 575
 576     tmp6 = tmp12 - tmp7;        /* phase 2 */
 577     tmp5 = tmp11 - tmp6;
 578     tmp4 = tmp10 + tmp5;
 579
 580     /* Final output stage: scale down by a factor of 8 and range-limit */
 581     /* The shift count PASS1_BITS+3 removes both scalings; RANGE_MASK bounds the table index. */
 582     outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
 583                             & RANGE_MASK];
 584     outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
 585                             & RANGE_MASK];
 586     outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
 587                             & RANGE_MASK];
 588     outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
 589                             & RANGE_MASK];
 590     outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
 591                             & RANGE_MASK];
 592     outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
 593                             & RANGE_MASK];
 594     outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
 595                             & RANGE_MASK];
 596     outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
 597                             & RANGE_MASK];
 598
 599     wsptr += DCTSIZE;           /* advance pointer to next row */
 600   }
 601 }
 602 /* Packed 16-bit constants for the MMX kernel, one value repeated in all four words.
 603  * fix_* hold a multiplier times 2**14: operands are shifted left 2 before pmulhw. */
 604         static    __int64 fix_141               = 0x5a825a825a825a82;  /*  1.414213562 */
 605         static    __int64 fix_184n261   = 0xcf04cf04cf04cf04;  /*  1.847759065 - 2.613125930 */
 606         static    __int64 fix_184               = 0x7641764176417641;  /*  1.847759065 */
 607         static    __int64 fix_n184              = 0x89bf89bf89bf89bf;  /* -1.847759065: two's-complement negation of fix_184 (was 0x896f, i.e. -0x7691) */
 608         static    __int64 fix_108n184   = 0xcf04cf04cf04cf04;  /*  1.082392200 - 1.847759065, numerically equal to fix_184n261 */
 609         static    __int64 const_0x0080  = 0x0080008000800080;  /*  0x0080 in every word */
 610
 611
 612 __inline GLOBAL(void)
 613 jpeg_idct_ifast_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 614                  JCOEFPTR inptr,
 615                  JSAMPARRAY outptr, JDIMENSION output_col)
 616 {
 617
 618   int16 workspace[DCTSIZE2 + 4];        /* buffers data between passes */
 619   int16 *wsptr=workspace;
 620   int16 *quantptr=compptr->dct_table;
 621
 622   __asm{
 623
 624         mov             edi, quantptr
 625         mov             ebx, inptr
 626         mov             esi, wsptr
 627         add             esi, 0x07               ;align wsptr to qword
 628         and             esi, 0xfffffff8 ;align wsptr to qword
 629
 630         mov             eax, esi
 631
 632     /* Odd part */
 633
 634
 635         movq            mm1, [ebx + 8*10]               ;load inptr[DCTSIZE*5]
 636
 637         pmullw          mm1, [edi + 8*10]               ;tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
 638
 639         movq            mm0, [ebx + 8*6]                ;load inptr[DCTSIZE*3]
 640
 641         pmullw          mm0, [edi + 8*6]                ;tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 642
 643         movq            mm3, [ebx + 8*2]                ;load inptr[DCTSIZE*1]
 644         movq    mm2, mm1                                        ;copy tmp6      /* phase 6 */
 645
 646         pmullw          mm3, [edi + 8*2]                ;tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 647
 648         movq            mm4, [ebx + 8*14]               ;load inptr[DCTSIZE*1]
 649         paddw   mm1, mm0                                        ;z13 = tmp6 + tmp5;
 650
 651         pmullw          mm4, [edi + 8*14]           ;tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
 652         psubw   mm2, mm0                                        ;z10 = tmp6 - tmp5
 653
 654         psllw           mm2, 2                          ;shift z10
 655         movq            mm0, mm2                        ;copy z10
 656
 657         pmulhw          mm2, fix_184n261        ;MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
 658         movq            mm5, mm3                                ;copy tmp4
 659
 660         pmulhw          mm0, fix_n184           ;MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
 661         paddw           mm3, mm4                                ;z11 = tmp4 + tmp7;
 662
 663         movq            mm6, mm3                                ;copy z11                       /* phase 5 */
 664         psubw           mm5, mm4                                ;z12 = tmp4 - tmp7;
 665
 666         psubw           mm6, mm1                                ;z11-z13
 667         psllw           mm5, 2                          ;shift z12
 668
 669         movq            mm4, [ebx + 8*12]               ;load inptr[DCTSIZE*6], even part
 670         movq            mm7, mm5                        ;copy z12
 671
 672         pmulhw          mm5, fix_108n184        ;MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
 673         paddw           mm3, mm1                                ;tmp7 = z11 + z13;
 674
 675
 676     /* Even part */
 677         pmulhw          mm7, fix_184            ;MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
 678         psllw           mm6, 2
 679
 680         movq            mm1, [ebx + 8*4]                ;load inptr[DCTSIZE*2]
 681
 682         pmullw          mm1, [edi + 8*4]                ;tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 683         paddw           mm0, mm5                        ;tmp10
 684
 685         pmullw          mm4, [edi + 8*12]               ;tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
 686         paddw           mm2, mm7                        ;tmp12
 687
 688         pmulhw          mm6, fix_141                    ;tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
 689         psubw           mm2, mm3                ;tmp6 = tmp12 - tmp7
 690
 691         movq            mm5, mm1                                ;copy tmp1
 692         paddw           mm1, mm4                                ;tmp13= tmp1 + tmp3;    /* phases 5-3 */
 693
 694         psubw           mm5, mm4                                ;tmp1-tmp3
 695         psubw           mm6, mm2                ;tmp5 = tmp11 - tmp6;
 696
 697         movq            [esi+8*0], mm1                  ;save tmp13 in workspace
 698         psllw           mm5, 2                                  ;shift tmp1-tmp3
 699
 700         movq            mm7, [ebx + 8*0]                ;load inptr[DCTSIZE*0]
 701
 702         pmulhw          mm5, fix_141                    ;MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
 703         paddw           mm0, mm6                ;tmp4 = tmp10 + tmp5;
 704
 705         pmullw          mm7, [edi + 8*0]                ;tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 706
 707         movq            mm4, [ebx + 8*8]                ;load inptr[DCTSIZE*4]
 708
 709         pmullw          mm4, [edi + 8*8]                ;tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
 710         psubw           mm5, mm1                                ;tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
 711
 712         movq            [esi+8*4], mm0          ;save tmp4 in workspace
 713         movq            mm1, mm7                        ;copy tmp0      /* phase 3 */
 714
 715         movq            [esi+8*2], mm5          ;save tmp12 in workspace
 716         psubw           mm1, mm4                        ;tmp11 = tmp0 - tmp2;
 717
 718         paddw           mm7, mm4                        ;tmp10 = tmp0 + tmp2;
 719     movq                mm5, mm1                ;copy tmp11
 720
 721         paddw           mm1, [esi+8*2]  ;tmp1 = tmp11 + tmp12;
 722         movq            mm4, mm7                ;copy tmp10             /* phase 2 */
 723
 724         paddw           mm7, [esi+8*0]  ;tmp0 = tmp10 + tmp13;
 725
 726         psubw           mm4, [esi+8*0]  ;tmp3 = tmp10 - tmp13;
 727         movq            mm0, mm7                ;copy tmp0
 728
 729         psubw           mm5, [esi+8*2]  ;tmp2 = tmp11 - tmp12;
 730         paddw           mm7, mm3                ;wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
 731
 732         psubw           mm0, mm3                        ;wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
 733
 734         movq            [esi + 8*0], mm7        ;wsptr[DCTSIZE*0]
 735         movq            mm3, mm1                        ;copy tmp1
 736
 737         movq            [esi + 8*14], mm0       ;wsptr[DCTSIZE*7]
 738         paddw           mm1, mm2                        ;wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
 739
 740         psubw           mm3, mm2                        ;wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
 741
 742         movq            [esi + 8*2], mm1        ;wsptr[DCTSIZE*1]
 743         movq            mm1, mm4                        ;copy tmp3
 744
 745         movq            [esi + 8*12], mm3       ;wsptr[DCTSIZE*6]
 746
 747         paddw           mm4, [esi+8*4]          ;wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
 748
 749         psubw           mm1, [esi+8*4]          ;wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
 750
 751         movq            [esi + 8*8], mm4
 752         movq            mm7, mm5                        ;copy tmp2
 753
 754         paddw           mm5, mm6                        ;wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
 755
 756         movq            [esi+8*6], mm1          ;
 757         psubw           mm7, mm6                        ;wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
 758
 759         movq            [esi + 8*4], mm5
 760
 761         movq            [esi + 8*10], mm7
 762
 763
 764
 765 /*****************************************************************/
 766         add             edi, 8                          ;step quantptr by 8 bytes (4 words): next group of four columns
 767         add             ebx, 8                          ;step inptr by the same 4 words
 768         add             esi, 8                          ;step wsptr by the same 4 words
 769
 770 /*****************************************************************/
 771
 772
 773
 774
 775         movq            mm1, [ebx + 8*10]               ;load inptr[DCTSIZE*5]
 776
 777         pmullw          mm1, [edi + 8*10]               ;tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
 778
 779         movq            mm0, [ebx + 8*6]                ;load inptr[DCTSIZE*3]
 780
 781         pmullw          mm0, [edi + 8*6]                ;tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
 782
 783         movq            mm3, [ebx + 8*2]                ;load inptr[DCTSIZE*1]
 784         movq    mm2, mm1                                        ;copy tmp6      /* phase 6 */
 785
 786         pmullw          mm3, [edi + 8*2]                ;tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
 787
 788         movq            mm4, [ebx + 8*14]               ;load inptr[DCTSIZE*7]
 789         paddw   mm1, mm0                                        ;z13 = tmp6 + tmp5;
 790
 791         pmullw          mm4, [edi + 8*14]           ;tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
 792         psubw   mm2, mm0                                        ;z10 = tmp6 - tmp5
 793
 794         psllw           mm2, 2                          ;shift z10 left 2 before pmulhw (which keeps only the high word); constants presumably pre-scaled to match -- TODO confirm
 795         movq            mm0, mm2                        ;copy z10 (already shifted)
 796
 797         pmulhw          mm2, fix_184n261        ;MULTIPLY(z10, (FIX_1_847759065 - FIX_2_613125930)); /* -2*(c2+c6) */
 798         movq            mm5, mm3                                ;copy tmp4
 799
 800         pmulhw          mm0, fix_n184           ;MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
 801         paddw           mm3, mm4                                ;z11 = tmp4 + tmp7;
 802
 803         movq            mm6, mm3                                ;copy z11                       /* phase 5 */
 804         psubw           mm5, mm4                                ;z12 = tmp4 - tmp7;
 805
 806         psubw           mm6, mm1                                ;z11-z13
 807         psllw           mm5, 2                          ;shift z12
 808
 809         movq            mm4, [ebx + 8*12]               ;load inptr[DCTSIZE*6], even part
 810         movq            mm7, mm5                        ;copy z12 (already shifted)
 811
 812         pmulhw          mm5, fix_108n184        ;MULTIPLY(z12, (FIX_1_082392200 - FIX_1_847759065)); /* 2*(c2-c6) */
 813         paddw           mm3, mm1                                ;tmp7 = z11 + z13;
 814
 815
 816     /* Even part */
 817         pmulhw          mm7, fix_184            ;MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */ (still odd part)
 818         psllw           mm6, 2                          ;shift z11-z13
 819
 820         movq            mm1, [ebx + 8*4]                ;load inptr[DCTSIZE*2]
 821
 822         pmullw          mm1, [edi + 8*4]                ;tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
 823         paddw           mm0, mm5                        ;odd-part tmp10 = z10*(-1.847) + z12*(1.082-1.847)
 824
 825         pmullw          mm4, [edi + 8*12]               ;tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
 826         paddw           mm2, mm7                        ;odd-part tmp12 = z10*(1.847-2.613) + z12*1.847
 827
 828         pmulhw          mm6, fix_141                    ;tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
 829         psubw           mm2, mm3                ;tmp6 = tmp12 - tmp7
 830
 831         movq            mm5, mm1                                ;copy tmp1
 832         paddw           mm1, mm4                                ;tmp13= tmp1 + tmp3;    /* phases 5-3 */
 833
 834         psubw           mm5, mm4                                ;tmp1-tmp3
 835         psubw           mm6, mm2                ;tmp5 = tmp11 - tmp6;
 836
 837         movq            [esi+8*0], mm1                  ;save tmp13 in workspace (scratch: this slot is overwritten by wsptr[DCTSIZE*0] below)
 838         psllw           mm5, 2                                  ;shift tmp1-tmp3
 839
 840         movq            mm7, [ebx + 8*0]                ;load inptr[DCTSIZE*0]
 841         paddw           mm0, mm6                ;tmp4 = tmp10 + tmp5;
 842
 843         pmulhw          mm5, fix_141                    ;MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
 844
 845         pmullw          mm7, [edi + 8*0]                ;tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
 846
 847         movq            mm4, [ebx + 8*8]                ;load inptr[DCTSIZE*4]
 848
 849         pmullw          mm4, [edi + 8*8]                ;tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
 850         psubw           mm5, mm1                                ;tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
 851
 852         movq            [esi+8*4], mm0          ;save tmp4 in workspace (scratch: slot later receives wsptr[DCTSIZE*2])
 853         movq            mm1, mm7                        ;copy tmp0      /* phase 3 */
 854
 855         movq            [esi+8*2], mm5          ;save tmp12 in workspace (scratch: slot later receives wsptr[DCTSIZE*1])
 856         psubw           mm1, mm4                        ;tmp11 = tmp0 - tmp2;
 857
 858         paddw           mm7, mm4                        ;tmp10 = tmp0 + tmp2;
 859     movq                mm5, mm1                ;copy tmp11
 860
 861         paddw           mm1, [esi+8*2]  ;tmp1 = tmp11 + tmp12;
 862         movq            mm4, mm7                ;copy tmp10             /* phase 2 */
 863
 864         paddw           mm7, [esi+8*0]  ;tmp0 = tmp10 + tmp13;
 865
 866         psubw           mm4, [esi+8*0]  ;tmp3 = tmp10 - tmp13;
 867         movq            mm0, mm7                ;copy tmp0
 868
 869         psubw           mm5, [esi+8*2]  ;tmp2 = tmp11 - tmp12;
 870         paddw           mm7, mm3                ;wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
 871
 872         psubw           mm0, mm3                        ;wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
 873
 874         movq            [esi + 8*0], mm7        ;wsptr[DCTSIZE*0]
 875         movq            mm3, mm1                        ;copy tmp1
 876
 877         movq            [esi + 8*14], mm0       ;wsptr[DCTSIZE*7]
 878         paddw           mm1, mm2                        ;wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
 879
 880         psubw           mm3, mm2                        ;wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
 881
 882         movq            [esi + 8*2], mm1        ;wsptr[DCTSIZE*1]
 883         movq            mm1, mm4                        ;copy tmp3
 884
 885         movq            [esi + 8*12], mm3       ;wsptr[DCTSIZE*6]
 886
 887         paddw           mm4, [esi+8*4]          ;wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
 888
 889         psubw           mm1, [esi+8*4]          ;wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
 890
 891         movq            [esi + 8*8], mm4        ;wsptr[DCTSIZE*4]
 892         movq            mm7, mm5                        ;copy tmp2
 893
 894         paddw           mm5, mm6                        ;wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
 895
 896         movq            [esi+8*6], mm1          ;wsptr[DCTSIZE*3]
 897         psubw           mm7, mm6                        ;wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
 898
 899         movq            [esi + 8*4], mm5        ;wsptr[DCTSIZE*2] (replaces the tmp4 scratch value)
 900
 901         movq            [esi + 8*10], mm7       ;wsptr[DCTSIZE*5]
 902
 903
 904
 905
 906 /*****************************************************************/
 907
 908   /* Pass 2: process rows from work array, store into output array. */
 909   /* Note that we must descale the results by a factor of 8 == 2**3, */
 910   /* and also undo the PASS1_BITS scaling. */
 911
 912 /*****************************************************************/
 913     /* Even part */
 914
 915         mov                     esi, eax                ;esi = eax (set before this section; presumably the workspace base -- TODO confirm)
 916         mov                     eax, outptr             ;eax = outptr; read later through [eax] as a table of 4-byte row pointers
 917
 918 //    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
 919 //    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
 920 //    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
 921 //    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
 922         movq            mm0, [esi+8*0]          ;wsptr[0,0],[0,1],[0,2],[0,3]
 923
 924         movq            mm1, [esi+8*1]          ;wsptr[0,4],[0,5],[0,6],[0,7]
 925         movq            mm2, mm0                        ;copy row 0, columns 0-3
 926
 927         movq            mm3, [esi+8*2]          ;wsptr[1,0],[1,1],[1,2],[1,3]
 928         paddw           mm0, mm1                        ;wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
 929
 930         movq            mm4, [esi+8*3]          ;wsptr[1,4],[1,5],[1,6],[1,7]
 931         psubw           mm2, mm1                        ;wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
 932
 933         movq            mm6, mm0                        ;copy row 0 sums (tmp10/tmp13)
 934         movq            mm5, mm3                        ;copy row 1, columns 0-3
 935
 936         paddw           mm3, mm4                        ;wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
 937         movq            mm1, mm2                        ;copy row 0 differences (tmp11/tmp14)
 938
 939         psubw           mm5, mm4                        ;wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
 940         punpcklwd       mm0, mm3                        ;wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
 941
 942         movq            mm7, [esi+8*7]          ;wsptr[3,4],[3,5],[3,6],[3,7]
 943         punpckhwd       mm6, mm3                        ;wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
 944
 945         movq            mm3, [esi+8*4]          ;wsptr[2,0],[2,1],[2,2],[2,3]
 946         punpckldq       mm0, mm6        ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
 947
 948         punpcklwd       mm1, mm5                        ;wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
 949         movq            mm4, mm3                        ;copy row 2, columns 0-3
 950
 951         movq            mm6, [esi+8*6]          ;wsptr[3,0],[3,1],[3,2],[3,3]
 952         punpckhwd       mm2, mm5                        ;wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
 953
 954         movq            mm5, [esi+8*5]          ;wsptr[2,4],[2,5],[2,6],[2,7]
 955         punpckldq       mm1, mm2        ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
 956
 957
 958         paddw           mm3, mm5                        ;wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
 959         movq            mm2, mm6                        ;copy row 3, columns 0-3
 960
 961         psubw           mm4, mm5                        ;wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
 962         paddw           mm6, mm7                        ;wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
 963
 964         movq            mm5, mm3                        ;copy row 2 sums (tmp10/tmp13)
 965         punpcklwd       mm3, mm6                        ;wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
 966
 967         psubw           mm2, mm7                        ;wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
 968         punpckhwd       mm5, mm6                        ;wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
 969
 970         movq            mm7, mm4                        ;copy row 2 differences (tmp11/tmp14)
 971         punpckldq       mm3, mm5        ;wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
 972
 973         punpcklwd       mm4, mm2                        ;wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
 974
 975         punpckhwd       mm7, mm2                        ;wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
 976
 977         punpckldq       mm4, mm7        ;wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
 978         movq            mm6, mm1                        ;copy rows 0-1 tmp11/tmp14
 979
 980 //      mm0 =   ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
 981 //      mm1 =   ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
 982
 983
 984         movq            mm2, mm0                        ;copy rows 0-1 tmp10/tmp13
 985         punpckhdq       mm6, mm4        ;wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
 986
 987         punpckldq       mm1, mm4        ;wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
 988         psllw           mm6, 2                          ;shift tmp14 left 2 ahead of pmulhw
 989
 990         pmulhw          mm6, fix_141                    ;MULTIPLY(tmp14, FIX_1_414213562)
 991         punpckldq       mm0, mm3        ;wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
 992
 993         punpckhdq       mm2, mm3        ;wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
 994         movq            mm7, mm0                        ;copy tmp10
 995
 996 //    tmp0 = tmp10 + tmp13;
 997 //    tmp3 = tmp10 - tmp13;
 998         paddw           mm0, mm2        ;[0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
 999         psubw           mm7, mm2        ;[0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
1000
1001 //    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
1002         psubw           mm6, mm2        ;wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
1003 //    tmp1 = tmp11 + tmp12;
1004 //    tmp2 = tmp11 - tmp12;
1005         movq            mm5, mm1                        ;copy tmp11
1006
1007
1008
1009     /* Odd part */
1010
1011 //    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
1012 //    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
1013 //    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
1014 //    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
1015         movq            mm3, [esi+8*0]          ;wsptr[0,0],[0,1],[0,2],[0,3]
1016         paddw           mm1, mm6        ;[0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
1017
1018         movq            mm4, [esi+8*1]          ;wsptr[0,4],[0,5],[0,6],[0,7]
1019         psubw           mm5, mm6        ;[0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
1020
1021         movq            mm6, mm3                        ;copy row 0, columns 0-3
1022         punpckldq       mm3, mm4                        ;wsptr[0,0],[0,1],[0,4],[0,5]
1023
1024         punpckhdq       mm4, mm6                        ;wsptr[0,6],[0,7],[0,2],[0,3]
1025         movq            mm2, mm3                        ;copy [0,0],[0,1],[0,4],[0,5]
1026
1027 //Save tmp0 and tmp1 in wsptr
1028         movq            [esi+8*0], mm0          ;save tmp0 (row 0 input in this slot was already loaded into mm3/mm4 above)
1029         paddw           mm2, mm4                        ;wsptr[xxx],[0,z11],[xxx],[0,z13]
1030
1031
1032 //Continue with z10 --- z13
1033         movq            mm6, [esi+8*2]          ;wsptr[1,0],[1,1],[1,2],[1,3]
1034         psubw           mm3, mm4                        ;wsptr[xxx],[0,z12],[xxx],[0,z10]
1035
1036         movq            mm0, [esi+8*3]          ;wsptr[1,4],[1,5],[1,6],[1,7]
1037         movq            mm4, mm6                        ;copy row 1, columns 0-3
1038
1039         movq            [esi+8*1], mm1          ;save tmp1
1040         punpckldq       mm6, mm0                        ;wsptr[1,0],[1,1],[1,4],[1,5]
1041
1042         punpckhdq       mm0, mm4                        ;wsptr[1,6],[1,7],[1,2],[1,3]
1043         movq            mm1, mm6                        ;copy [1,0],[1,1],[1,4],[1,5]
1044
1045 //Save tmp2 and tmp3 in wsptr
1046         paddw           mm6, mm0                ;wsptr[xxx],[1,z11],[xxx],[1,z13]
1047         movq            mm4, mm2                        ;copy row 0 z11/z13
1048
1049 //Continue with z10 --- z13
1050         movq            [esi+8*2], mm5          ;save tmp2
1051         punpcklwd       mm2, mm6                ;wsptr[xxx],[xxx],[0,z11],[1,z11]
1052
1053         psubw           mm1, mm0                ;wsptr[xxx],[1,z12],[xxx],[1,z10]
1054         punpckhwd       mm4, mm6                ;wsptr[xxx],[xxx],[0,z13],[1,z13]
1055
1056         movq            mm0, mm3                        ;copy row 0 z12/z10
1057         punpcklwd       mm3, mm1                ;wsptr[xxx],[xxx],[0,z12],[1,z12]
1058
1059         movq            [esi+8*3], mm7          ;save tmp3
1060         punpckhwd       mm0, mm1                ;wsptr[xxx],[xxx],[0,z10],[1,z10]
1061
1062         movq            mm6, [esi+8*4]          ;wsptr[2,0],[2,1],[2,2],[2,3]
1063         punpckhdq       mm0, mm2                ;wsptr[0,z10],[1,z10],[0,z11],[1,z11]
1064
1065         movq            mm7, [esi+8*5]          ;wsptr[2,4],[2,5],[2,6],[2,7]
1066         punpckhdq       mm3, mm4                ;wsptr[0,z12],[1,z12],[0,z13],[1,z13]
1067
1068         movq            mm1, [esi+8*6]          ;wsptr[3,0],[3,1],[3,2],[3,3]
1069         movq            mm4, mm6                        ;copy row 2, columns 0-3
1070
1071         punpckldq       mm6, mm7                        ;wsptr[2,0],[2,1],[2,4],[2,5]
1072         movq            mm5, mm1                        ;copy row 3, columns 0-3
1073
1074         punpckhdq       mm7, mm4                        ;wsptr[2,6],[2,7],[2,2],[2,3]
1075         movq            mm2, mm6                        ;copy [2,0],[2,1],[2,4],[2,5]
1076
1077         movq            mm4, [esi+8*7]          ;wsptr[3,4],[3,5],[3,6],[3,7]
1078         paddw           mm6, mm7                ;wsptr[xxx],[2,z11],[xxx],[2,z13]
1079
1080         psubw           mm2, mm7                ;wsptr[xxx],[2,z12],[xxx],[2,z10]
1081         punpckldq       mm1, mm4                        ;wsptr[3,0],[3,1],[3,4],[3,5]
1082
1083         punpckhdq       mm4, mm5                        ;wsptr[3,6],[3,7],[3,2],[3,3]
1084         movq            mm7, mm1                        ;copy [3,0],[3,1],[3,4],[3,5]
1085
1086         paddw           mm1, mm4                ;wsptr[xxx],[3,z11],[xxx],[3,z13]
1087         psubw           mm7, mm4                ;wsptr[xxx],[3,z12],[xxx],[3,z10]
1088
1089         movq            mm5, mm6                        ;copy row 2 z11/z13
1090         punpcklwd       mm6, mm1                ;wsptr[xxx],[xxx],[2,z11],[3,z11]
1091
1092         punpckhwd       mm5, mm1                ;wsptr[xxx],[xxx],[2,z13],[3,z13]
1093         movq            mm4, mm2                        ;copy row 2 z12/z10
1094
1095         punpcklwd       mm2, mm7                ;wsptr[xxx],[xxx],[2,z12],[3,z12]
1096
1097         punpckhwd       mm4, mm7                ;wsptr[xxx],[xxx],[2,z10],[3,z10]
1098
1099         punpckhdq       mm4, mm6                ;wsptr[2,z10],[3,z10],[2,z11],[3,z11]
1100
1101         punpckhdq       mm2, mm5                ;wsptr[2,z12],[3,z12],[2,z13],[3,z13]
1102         movq            mm5, mm0                        ;copy rows 0-1 z10/z11
1103
1104         punpckldq       mm0, mm4                ;wsptr[0,z10],[1,z10],[2,z10],[3,z10]
1105
1106         punpckhdq       mm5, mm4                ;wsptr[0,z11],[1,z11],[2,z11],[3,z11]
1107         movq            mm4, mm3                        ;copy rows 0-1 z12/z13
1108
1109         punpckhdq       mm4, mm2                ;wsptr[0,z13],[1,z13],[2,z13],[3,z13]
1110         movq            mm1, mm5                        ;copy z11
1111
1112         punpckldq       mm3, mm2                ;wsptr[0,z12],[1,z12],[2,z12],[3,z12]
1113 //    tmp7 = z11 + z13;         /* phase 5 */
1114 //    tmp8 = z11 - z13;         /* phase 5 */
1115         psubw           mm1, mm4                ;tmp8
1116
1117         paddw           mm5, mm4                ;tmp7
1118 //    tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
1119         psllw           mm1, 2                          ;shift tmp8 left 2 ahead of pmulhw
1120
1121         psllw           mm0, 2                          ;shift z10 left 2 ahead of pmulhw
1122
1123         pmulhw          mm1, fix_141    ;tmp21
1124 //    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065))  /* 2*(c2-c6) */
1125 //                      + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
1126         psllw           mm3, 2                          ;shift z12 left 2 ahead of pmulhw
1127         movq            mm7, mm0                        ;copy shifted z10
1128
1129         pmulhw          mm7, fix_n184                   ;MULTIPLY(z10, -FIX_1_847759065)
1130         movq            mm6, mm3                        ;copy shifted z12
1131
1132         movq            mm2, [esi+8*0]  ;tmp0,final1
1133
1134         pmulhw          mm6, fix_108n184                ;MULTIPLY(z12, (FIX_1_082392200 - FIX_1_847759065))
1135 //       tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
1136 //                      + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
1137         movq            mm4, mm2                ;final1
1138
1139         pmulhw          mm0, fix_184n261                ;MULTIPLY(z10, (FIX_1_847759065 - FIX_2_613125930))
1140         paddw           mm2, mm5                ;tmp0+tmp7,final1
1141
1142         pmulhw          mm3, fix_184                    ;MULTIPLY(z12, FIX_1_847759065)
1143         psubw           mm4, mm5                ;tmp0-tmp7,final1
1144
1145 //    tmp6 = tmp22 - tmp7;      /* phase 2 */
1146         psraw           mm2, 5                  ;outptr[0,0],[1,0],[2,0],[3,0],final1
1147
1148         paddsw          mm2, const_0x0080       ;final1
1149         paddw           mm7, mm6                        ;tmp20
1150         psraw           mm4, 5                  ;outptr[0,7],[1,7],[2,7],[3,7],final1
1151
1152         paddsw          mm4, const_0x0080       ;final1
1153         paddw           mm3, mm0                        ;tmp22
1154
1155 //    tmp5 = tmp21 - tmp6;
1156         psubw           mm3, mm5                ;tmp6
1157
1158 //    tmp4 = tmp20 + tmp5;
1159         movq            mm0, [esi+8*1]          ;tmp1,final2
1160         psubw           mm1, mm3                ;tmp5
1161
1162         movq            mm6, mm0                        ;final2
1163         paddw           mm0, mm3                ;tmp1+tmp6,final2
1164
1165     /* Final output stage: scale down by a factor of 8 and range-limit */
1166
1167
1168 //    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
1169 //                          & RANGE_MASK];
1170 //    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
1171 //                          & RANGE_MASK];      final1
1172
1173
1174 //    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
1175 //                          & RANGE_MASK];
1176 //    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
1177 //                          & RANGE_MASK];      final2
1178         psubw           mm6, mm3                ;tmp1-tmp6,final2
1179         psraw           mm0, 5                  ;outptr[0,1],[1,1],[2,1],[3,1]
1180
1181         paddsw          mm0, const_0x0080       ;saturating add of const_0x0080 (presumably the +128 level shift per word -- TODO confirm constant)
1182         psraw           mm6, 5                  ;outptr[0,6],[1,6],[2,6],[3,6]
1183
1184         paddsw          mm6, const_0x0080               ;need to check this value
1185         packuswb        mm0, mm4        ;out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] (packuswb clamps each word to 0..255)
1186
1187         movq            mm5, [esi+8*2]          ;tmp2,final3
1188         packuswb        mm2, mm6        ;out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
1189
1190 //    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
1191 //                          & RANGE_MASK];
1192 //    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
1193 //                          & RANGE_MASK];      final3
1194         paddw           mm7, mm1                ;tmp4
1195         movq            mm3, mm5                        ;copy tmp2
1196
1197         paddw           mm5, mm1                ;tmp2+tmp5
1198         psubw           mm3, mm1                ;tmp2-tmp5
1199
1200         psraw           mm5, 5                  ;outptr[0,2],[1,2],[2,2],[3,2]
1201
1202         paddsw          mm5, const_0x0080       ;final3
1203         movq            mm4, [esi+8*3]          ;tmp3,final4
1204         psraw           mm3, 5                  ;outptr[0,5],[1,5],[2,5],[3,5]
1205
1206         paddsw          mm3, const_0x0080       ;final3
1207
1208
1209 //    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
1210 //                          & RANGE_MASK];
1211 //    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
1212 //                          & RANGE_MASK];      final4
1213         movq            mm6, mm4                        ;copy tmp3
1214         paddw           mm4, mm7                ;tmp3+tmp4
1215
1216         psubw           mm6, mm7                ;tmp3-tmp4
1217         psraw           mm4, 5                  ;outptr[0,4],[1,4],[2,4],[3,4]
1218         mov                     ecx, [eax]              ;ecx = first row pointer read from the outptr table
1219
1220         paddsw          mm4, const_0x0080       ;final4
1221         psraw           mm6, 5                  ;outptr[0,3],[1,3],[2,3],[3,3]
1222
1223         paddsw          mm6, const_0x0080       ;final4
1224         packuswb        mm5, mm4        ;out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
1225
1226         packuswb        mm6, mm3        ;out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
1227         movq            mm4, mm2                        ;copy packed columns 0 and 6
1228
1229         movq            mm7, mm5                        ;copy packed columns 2 and 4
1230         punpcklbw       mm2, mm0        ;out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
1231
1232         punpckhbw       mm4, mm0        ;out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
1233         movq            mm1, mm2                        ;copy columns 0-1 of rows 0-3
1234
1235         punpcklbw       mm5, mm6        ;out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
1236         add                     eax, 4                  ;step to the next row-pointer entry
1237
1238         punpckhbw       mm7, mm6        ;out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
1239
1240         punpcklwd       mm2, mm5        ;out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
1241         add                     ecx, output_col         ;ecx = row 0 destination (row pointer + output_col)
1242
1243         movq            mm6, mm7                        ;copy columns 4-5 of rows 0-3
1244         punpckhwd       mm1, mm5        ;out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
1245
1246         movq            mm0, mm2                        ;copy columns 0-3 of rows 0-1
1247         punpcklwd       mm6, mm4        ;out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
1248
1249         mov                     ebx, [eax]              ;ebx = second row pointer
1250         punpckldq       mm2, mm6        ;out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
1251
1252         add                     eax, 4                  ;step to the next row-pointer entry
1253         movq            mm3, mm1                        ;copy columns 0-3 of rows 2-3
1254
1255         add                     ebx, output_col         ;ebx = row 1 destination
1256         punpckhwd       mm7, mm4        ;out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
1257
1258         movq            [ecx], mm2                      ;store the 8 output bytes of row 0
1259         punpckhdq       mm0, mm6        ;out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
1260
1261         mov                     ecx, [eax]              ;ecx = third row pointer
1262         add                     eax, 4                  ;step to the next row-pointer entry
1263         add                     ecx, output_col         ;ecx = row 2 destination
1264
1265         movq            [ebx], mm0                      ;store the 8 output bytes of row 1
1266         punpckldq       mm1, mm7        ;out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
1267
1268         mov                     ebx, [eax]              ;ebx = fourth row pointer
1269
1270         add                     ebx, output_col         ;ebx = row 3 destination
1271         punpckhdq       mm3, mm7        ;out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
1272         movq            [ecx], mm1                      ;store the 8 output bytes of row 2
1273
1274
1275         movq            [ebx], mm3                      ;store the 8 output bytes of row 3
1276
1277
1278
1279 /*******************************************************************/
1280
1281
1282         add                     esi, 64
1283         add                     eax, 4
1284
1285 /*******************************************************************/
1286
1287 //    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
1288 //    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
1289 //    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
1290 //    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
1291         movq            mm0, [esi+8*0]          ;wsptr[0,0],[0,1],[0,2],[0,3]
1292
1293         movq            mm1, [esi+8*1]          ;wsptr[0,4],[0,5],[0,6],[0,7]
1294         movq            mm2, mm0
1295
1296         movq            mm3, [esi+8*2]          ;wsptr[1,0],[1,1],[1,2],[1,3]
1297         paddw           mm0, mm1                        ;wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
1298
1299         movq            mm4, [esi+8*3]          ;wsptr[1,4],[1,5],[1,6],[1,7]
1300         psubw           mm2, mm1                        ;wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
1301
1302         movq            mm6, mm0
1303         movq            mm5, mm3
1304
1305         paddw           mm3, mm4                        ;wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
1306         movq            mm1, mm2
1307
1308         psubw           mm5, mm4                        ;wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
1309         punpcklwd       mm0, mm3                        ;wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
1310
1311         movq            mm7, [esi+8*7]          ;wsptr[3,4],[3,5],[3,6],[3,7]
1312         punpckhwd       mm6, mm3                        ;wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
1313
1314         movq            mm3, [esi+8*4]          ;wsptr[2,0],[2,1],[2,2],[2,3]
1315         punpckldq       mm0, mm6        ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1316
1317         punpcklwd       mm1, mm5                        ;wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
1318         movq            mm4, mm3
1319
1320         movq            mm6, [esi+8*6]          ;wsptr[3,0],[3,1],[3,2],[3,3]
1321         punpckhwd       mm2, mm5                        ;wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
1322
1323         movq            mm5, [esi+8*5]          ;wsptr[2,4],[2,5],[2,6],[2,7]
1324         punpckldq       mm1, mm2        ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1325
1326
1327         paddw           mm3, mm5                        ;wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
1328         movq            mm2, mm6
1329
1330         psubw           mm4, mm5                        ;wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
1331         paddw           mm6, mm7                        ;wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
1332
1333         movq            mm5, mm3
1334         punpcklwd       mm3, mm6                        ;wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
1335
1336         psubw           mm2, mm7                        ;wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
1337         punpckhwd       mm5, mm6                        ;wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
1338
1339         movq            mm7, mm4
1340         punpckldq       mm3, mm5        ;wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
1341
1342         punpcklwd       mm4, mm2                        ;wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
1343
1344         punpckhwd       mm7, mm2                        ;wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
1345
1346         punpckldq       mm4, mm7        ;wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
1347         movq            mm6, mm1
1348
1349 //      mm0 =   ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1350 //      mm1 =   ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1351
1352
1353         movq            mm2, mm0
1354         punpckhdq       mm6, mm4        ;wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
1355
1356         punpckldq       mm1, mm4        ;wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
1357         psllw           mm6, 2
1358
1359         pmulhw          mm6, fix_141
1360         punpckldq       mm0, mm3        ;wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
1361
1362         punpckhdq       mm2, mm3        ;wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
1363         movq            mm7, mm0
1364
1365 //    tmp0 = tmp10 + tmp13;
1366 //    tmp3 = tmp10 - tmp13;
1367         paddw           mm0, mm2        ;[0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
1368         psubw           mm7, mm2        ;[0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
1369
1370 //    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
1371         psubw           mm6, mm2        ;wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
1372 //    tmp1 = tmp11 + tmp12;
1373 //    tmp2 = tmp11 - tmp12;
1374         movq            mm5, mm1
1375
1376
1377
1378     /* Odd part */
1379
1380 //    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
1381 //    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
1382 //    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
1383 //    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
1384         movq            mm3, [esi+8*0]          ;wsptr[0,0],[0,1],[0,2],[0,3]
1385         paddw           mm1, mm6        ;[0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
1386
1387         movq            mm4, [esi+8*1]          ;wsptr[0,4],[0,5],[0,6],[0,7]
1388         psubw           mm5, mm6        ;[0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
1389
1390         movq            mm6, mm3
1391         punpckldq       mm3, mm4                        ;wsptr[0,0],[0,1],[0,4],[0,5]
1392
1393         punpckhdq       mm4, mm6                        ;wsptr[0,6],[0,7],[0,2],[0,3]
1394         movq            mm2, mm3
1395
1396 //Save tmp0 and tmp1 in wsptr
1397         movq            [esi+8*0], mm0          ;save tmp0
1398         paddw           mm2, mm4                        ;wsptr[xxx],[0,z11],[xxx],[0,z13]
1399
1400
1401 //Continue with z10 --- z13
1402         movq            mm6, [esi+8*2]          ;wsptr[1,0],[1,1],[1,2],[1,3]
1403         psubw           mm3, mm4                        ;wsptr[xxx],[0,z12],[xxx],[0,z10]
1404
1405         movq            mm0, [esi+8*3]          ;wsptr[1,4],[1,5],[1,6],[1,7]
1406         movq            mm4, mm6
1407
1408         movq            [esi+8*1], mm1          ;save tmp1
1409         punpckldq       mm6, mm0                        ;wsptr[1,0],[1,1],[1,4],[1,5]
1410
1411         punpckhdq       mm0, mm4                        ;wsptr[1,6],[1,7],[1,2],[1,3]
1412         movq            mm1, mm6
1413
1414 //Save tmp2 and tmp3 in wsptr
1415         paddw           mm6, mm0                ;wsptr[xxx],[1,z11],[xxx],[1,z13]
1416         movq            mm4, mm2
1417
1418 //Continue with z10 --- z13
1419         movq            [esi+8*2], mm5          ;save tmp2
1420         punpcklwd       mm2, mm6                ;wsptr[xxx],[xxx],[0,z11],[1,z11]
1421
1422         psubw           mm1, mm0                ;wsptr[xxx],[1,z12],[xxx],[1,z10]
1423         punpckhwd       mm4, mm6                ;wsptr[xxx],[xxx],[0,z13],[1,z13]
1424
1425         movq            mm0, mm3
1426         punpcklwd       mm3, mm1                ;wsptr[xxx],[xxx],[0,z12],[1,z12]
1427
1428         movq            [esi+8*3], mm7          ;save tmp3
1429         punpckhwd       mm0, mm1                ;wsptr[xxx],[xxx],[0,z10],[1,z10]
1430
1431         movq            mm6, [esi+8*4]          ;wsptr[2,0],[2,1],[2,2],[2,3]
1432         punpckhdq       mm0, mm2                ;wsptr[0,z10],[1,z10],[0,z11],[1,z11]
1433
1434         movq            mm7, [esi+8*5]          ;wsptr[2,4],[2,5],[2,6],[2,7]
1435         punpckhdq       mm3, mm4                ;wsptr[0,z12],[1,z12],[0,z13],[1,z13]
1436
1437         movq            mm1, [esi+8*6]          ;wsptr[3,0],[3,1],[3,2],[3,3]
1438         movq            mm4, mm6
1439
1440         punpckldq       mm6, mm7                        ;wsptr[2,0],[2,1],[2,4],[2,5]
1441         movq            mm5, mm1
1442
1443         punpckhdq       mm7, mm4                        ;wsptr[2,6],[2,7],[2,2],[2,3]
1444         movq            mm2, mm6
1445
1446         movq            mm4, [esi+8*7]          ;wsptr[3,4],[3,5],[3,6],[3,7]
1447         paddw           mm6, mm7                ;wsptr[xxx],[2,z11],[xxx],[2,z13]
1448
1449         psubw           mm2, mm7                ;wsptr[xxx],[2,z12],[xxx],[2,z10]
1450         punpckldq       mm1, mm4                        ;wsptr[3,0],[3,1],[3,4],[3,5]
1451
1452         punpckhdq       mm4, mm5                        ;wsptr[3,6],[3,7],[3,2],[3,3]
1453         movq            mm7, mm1
1454
1455         paddw           mm1, mm4                ;wsptr[xxx],[3,z11],[xxx],[3,z13]
1456         psubw           mm7, mm4                ;wsptr[xxx],[3,z12],[xxx],[3,z10]
1457
1458         movq            mm5, mm6
1459         punpcklwd       mm6, mm1                ;wsptr[xxx],[xxx],[2,z11],[3,z11]
1460
1461         punpckhwd       mm5, mm1                ;wsptr[xxx],[xxx],[2,z13],[3,z13]
1462         movq            mm4, mm2
1463
1464         punpcklwd       mm2, mm7                ;wsptr[xxx],[xxx],[2,z12],[3,z12]
1465
1466         punpckhwd       mm4, mm7                ;wsptr[xxx],[xxx],[2,z10],[3,z10]
1467
1468         punpckhdq       mm4, mm6                ;wsptr[2,z10],[3,z10],[2,z11],[3,z11]
1469
1470         punpckhdq       mm2, mm5                ;wsptr[2,z12],[3,z12],[2,z13],[3,z13]
1471         movq            mm5, mm0
1472
1473         punpckldq       mm0, mm4                ;wsptr[0,z10],[1,z10],[2,z10],[3,z10]
1474
1475         punpckhdq       mm5, mm4                ;wsptr[0,z11],[1,z11],[2,z11],[3,z11]
1476         movq            mm4, mm3
1477
1478         punpckhdq       mm4, mm2                ;wsptr[0,z13],[1,z13],[2,z13],[3,z13]
1479         movq            mm1, mm5
1480
1481         punpckldq       mm3, mm2                ;wsptr[0,z12],[1,z12],[2,z12],[3,z12]
1482 //    tmp7 = z11 + z13;         /* phase 5 */
1483 //    tmp8 = z11 - z13;         /* phase 5 */
1484         psubw           mm1, mm4                ;tmp8
1485
1486         paddw           mm5, mm4                ;tmp7
1487 //    tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
1488         psllw           mm1, 2
1489
1490         psllw           mm0, 2
1491
1492         pmulhw          mm1, fix_141    ;tmp21
1493 //    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065))  /* 2*(c2-c6) */
1494 //                      + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
1495         psllw           mm3, 2
1496         movq            mm7, mm0
1497
1498         pmulhw          mm7, fix_n184
1499         movq            mm6, mm3
1500
1501         movq            mm2, [esi+8*0]  ;tmp0,final1
1502
1503         pmulhw          mm6, fix_108n184
1504 //       tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
1505 //                      + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
1506         movq            mm4, mm2                ;final1
1507
1508         pmulhw          mm0, fix_184n261
1509         paddw           mm2, mm5                ;tmp0+tmp7,final1
1510
1511         pmulhw          mm3, fix_184
1512         psubw           mm4, mm5                ;tmp0-tmp7,final1
1513
1514 //    tmp6 = tmp22 - tmp7;      /* phase 2 */
1515         psraw           mm2, 5                  ;outptr[0,0],[1,0],[2,0],[3,0],final1
1516
1517         paddsw          mm2, const_0x0080       ;final1
1518         paddw           mm7, mm6                        ;tmp20
1519         psraw           mm4, 5                  ;outptr[0,7],[1,7],[2,7],[3,7],final1
1520
1521         paddsw          mm4, const_0x0080       ;final1
1522         paddw           mm3, mm0                        ;tmp22
1523
1524 //    tmp5 = tmp21 - tmp6;
1525         psubw           mm3, mm5                ;tmp6
1526
1527 //    tmp4 = tmp20 + tmp5;
1528         movq            mm0, [esi+8*1]          ;tmp1,final2
1529         psubw           mm1, mm3                ;tmp5
1530
1531         movq            mm6, mm0                        ;final2
1532         paddw           mm0, mm3                ;tmp1+tmp6,final2
1533
1534     /* Final output stage: scale down by a factor of 8 and range-limit */
1535
1536
1537 //    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
1538 //                          & RANGE_MASK];
1539 //    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
1540 //                          & RANGE_MASK];      final1
1541
1542
1543 //    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
1544 //                          & RANGE_MASK];
1545 //    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
1546 //                          & RANGE_MASK];      final2
1547         psubw           mm6, mm3                ;tmp1-tmp6,final2
1548         psraw           mm0, 5                  ;outptr[0,1],[1,1],[2,1],[3,1]
1549
1550         paddsw          mm0, const_0x0080
1551         psraw           mm6, 5                  ;outptr[0,6],[1,6],[2,6],[3,6]
1552
1553         paddsw          mm6, const_0x0080               ;need to check this value
1554         packuswb        mm0, mm4        ;out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
1555
1556         movq            mm5, [esi+8*2]          ;tmp2,final3
1557         packuswb        mm2, mm6        ;out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
1558
1559 //    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
1560 //                          & RANGE_MASK];
1561 //    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
1562 //                          & RANGE_MASK];      final3
1563         paddw           mm7, mm1                ;tmp4
1564         movq            mm3, mm5
1565
1566         paddw           mm5, mm1                ;tmp2+tmp5
1567         psubw           mm3, mm1                ;tmp2-tmp5
1568
1569         psraw           mm5, 5                  ;outptr[0,2],[1,2],[2,2],[3,2]
1570
1571         paddsw          mm5, const_0x0080
1572         movq            mm4, [esi+8*3]          ;tmp3,final4
1573         psraw           mm3, 5                  ;outptr[0,5],[1,5],[2,5],[3,5]
1574
1575         paddsw          mm3, const_0x0080
1576
1577
1578 //    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
1579 //                          & RANGE_MASK];
1580 //    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
1581 //                          & RANGE_MASK];      final4
1582         movq            mm6, mm4
1583         paddw           mm4, mm7                ;tmp3+tmp4
1584
1585         psubw           mm6, mm7                ;tmp3-tmp4
1586         psraw           mm4, 5                  ;outptr[0,4],[1,4],[2,4],[3,4]
1587         mov                     ecx, [eax]
1588
1589         paddsw          mm4, const_0x0080
1590         psraw           mm6, 5                  ;outptr[0,3],[1,3],[2,3],[3,3]
1591
1592         paddsw          mm6, const_0x0080
1593         packuswb        mm5, mm4        ;out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
1594
1595         packuswb        mm6, mm3        ;out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
1596         movq            mm4, mm2
1597
1598         movq            mm7, mm5
1599         punpcklbw       mm2, mm0        ;out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
1600
1601         punpckhbw       mm4, mm0        ;out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
1602         movq            mm1, mm2
1603
1604         punpcklbw       mm5, mm6        ;out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
1605         add                     eax, 4
1606
1607         punpckhbw       mm7, mm6        ;out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
1608
1609         punpcklwd       mm2, mm5        ;out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
1610         add                     ecx, output_col
1611
1612         movq            mm6, mm7
1613         punpckhwd       mm1, mm5        ;out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
1614
1615         movq            mm0, mm2
1616         punpcklwd       mm6, mm4        ;out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
1617
1618         mov                     ebx, [eax]
1619         punpckldq       mm2, mm6        ;out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
1620
1621         add                     eax, 4
1622         movq            mm3, mm1
1623
1624         add                     ebx, output_col
1625         punpckhwd       mm7, mm4        ;out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
1626
1627         movq            [ecx], mm2
1628         punpckhdq       mm0, mm6        ;out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
1629
1630         mov                     ecx, [eax]
1631         add                     eax, 4
1632         add                     ecx, output_col
1633
1634         movq            [ebx], mm0
1635         punpckldq       mm1, mm7        ;out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
1636
1637         mov                     ebx, [eax]
1638
1639         add                     ebx, output_col
1640         punpckhdq       mm3, mm7        ;out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
1641         movq            [ecx], mm1
1642
1643         movq            [ebx], mm3
1644
1645         emms
1646         }
1647 }
1648 #endif
1649
1650 #endif /* DCT_IFAST_SUPPORTED */