/*
 * jidctint.c
 *
 * Copyright (C) 1991-1998, Thomas G. Lane.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains a slow-but-accurate integer implementation of the
 * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
 * must also perform dequantization of the input coefficients.
 *
 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
 * on each row (or vice versa, but it's more convenient to emit a row at
 * a time).  Direct algorithms are also available, but they are much more
 * complex and seem not to be any faster when reduced to code.
 *
 * This implementation is based on an algorithm described in
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
 * The primary algorithm described there uses 11 multiplies and 29 adds.
 * We use their alternate method with 12 multiplies and 32 adds.
 * The advantage of this method is that no data path contains more than one
 * multiplication; this allows a very simple and accurate implementation in
 * scaled fixed-point arithmetic, with a minimal number of shifts.
 */
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"		/* Private declarations for DCT subsystem */
#ifdef DCT_ISLOW_SUPPORTED


/*
 * This module is specialized to the case DCTSIZE = 8.
 */

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif
/*
 * The poop on this scaling stuff is as follows:
 *
 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
 * larger than the true IDCT outputs.  The final outputs are therefore
 * a factor of N larger than desired; since N=8 this can be cured by
 * a simple right shift at the end of the algorithm.  The advantage of
 * this arrangement is that we save two multiplications per 1-D IDCT,
 * because the y0 and y4 inputs need not be divided by sqrt(N).
 *
 * We have to do addition and subtraction of the integer inputs, which
 * is no problem, and multiplication by fractional constants, which is
 * a problem to do in integer arithmetic.  We multiply all the constants
 * by CONST_SCALE and convert them to integer constants (thus retaining
 * CONST_BITS bits of precision in the constants).  After doing a
 * multiplication we have to divide the product by CONST_SCALE, with proper
 * rounding, to produce the correct output.  This division can be done
 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
 * as long as possible so that partial sums can be added together with
 * full fractional precision.
 *
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
 * they are represented to better-than-integral precision.  These outputs
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
 * with the recommended scaling.  (To scale up 12-bit sample data further, an
 * intermediate INT32 array would be needed.)
 *
 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
 * shows that the values given below are the most effective.
 */
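/* As a quick check of the bound above: with 8-bit samples and the values
 * chosen below, BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS = 8 + 13 + 2 = 23,
 * comfortably within the 26-bit limit; with 12-bit samples, 12 + 13 + 1 = 26
 * just meets it, which is why PASS1_BITS drops to 1 in that configuration.
 */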
#if BITS_IN_JSAMPLE == 8
#define CONST_BITS  13
#define PASS1_BITS  2
#else
#define CONST_BITS  13
#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
#endif
/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
 * causing a lot of useless floating-point operations at run time.
 * To get around this we use the following pre-calculated constants.
 * If you change CONST_BITS you may want to add appropriate values.
 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
 */
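/* Illustration of how the pre-calculated values below were obtained, assuming
 * the usual jdct.h definition FIX(x) = (INT32) ((x) * CONST_SCALE + 0.5) with
 * CONST_SCALE = (1 << CONST_BITS):
 *   FIX(0.541196100) = (INT32) (0.541196100 * 8192 + 0.5) = 4433
 * which matches FIX_0_541196100 below.
 */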
#if CONST_BITS == 13
#define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
#define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
#define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
#define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
#define FIX_1_501321110  ((INT32) 12299)	/* FIX(1.501321110) */
#define FIX_1_847759065  ((INT32) 15137)	/* FIX(1.847759065) */
#define FIX_1_961570560  ((INT32) 16069)	/* FIX(1.961570560) */
#define FIX_2_053119869  ((INT32) 16819)	/* FIX(2.053119869) */
#define FIX_2_562915447  ((INT32) 20995)	/* FIX(2.562915447) */
#define FIX_3_072711026  ((INT32) 25172)	/* FIX(3.072711026) */
#else
#define FIX_0_298631336  FIX(0.298631336)
#define FIX_0_390180644  FIX(0.390180644)
#define FIX_0_541196100  FIX(0.541196100)
#define FIX_0_765366865  FIX(0.765366865)
#define FIX_0_899976223  FIX(0.899976223)
#define FIX_1_175875602  FIX(1.175875602)
#define FIX_1_501321110  FIX(1.501321110)
#define FIX_1_847759065  FIX(1.847759065)
#define FIX_1_961570560  FIX(1.961570560)
#define FIX_2_053119869  FIX(2.053119869)
#define FIX_2_562915447  FIX(2.562915447)
#define FIX_3_072711026  FIX(3.072711026)
#endif
/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
 * For 8-bit samples with the recommended scaling, all the variable
 * and constant values involved are no more than 16 bits wide, so a
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
 * For 12-bit samples, a full 32-bit multiplication will be needed.
 */

#if BITS_IN_JSAMPLE == 8
#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
#else
#define MULTIPLY(var,const)  ((var) * (const))
#endif
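/* Worked example of the fixed-point pattern used throughout this module,
 * assuming the usual jdct.h rounding definition
 * DESCALE(x,n) = (x + (1 << (n-1))) >> n:
 *   MULTIPLY(100, FIX_0_541196100) = 100 * 4433 = 443300
 *   DESCALE(443300, CONST_BITS)    = (443300 + 4096) >> 13 = 54
 * i.e. approximately 100 * 0.541196100 = 54.12, rounded to an integer.
 */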
/* Dequantize a coefficient by multiplying it by the multiplier-table
 * entry; produce an int result.  In this module, both inputs and result
 * are 16 bits or less, so either int or short multiply will work.
 */

#define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
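/* For example, a stored coefficient of -3 with a quantization-table entry of
 * 16 dequantizes to DEQUANTIZE(-3, 16) = -48.
 */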
/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 */

GLOBAL(void)
jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                 JCOEFPTR coef_block,
                 JSAMPARRAY output_buf, JDIMENSION output_col)
{
  INT32 tmp0, tmp1, tmp2, tmp3;
  INT32 tmp10, tmp11, tmp12, tmp13;
  INT32 z1, z2, z3, z4, z5;
  JCOEFPTR inptr;
  ISLOW_MULT_TYPE * quantptr;
  int * wsptr;
  JSAMPROW outptr;
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  int ctr;
  int workspace[DCTSIZE2];	/* buffers data between passes */
  SHIFT_TEMPS
  /* Pass 1: process columns from input, store into work array. */
  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
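  /* Concretely, the DESCALE calls in this pass shift right by
   * CONST_BITS-PASS1_BITS (13 - 2 = 11 bits for 8-bit samples), removing the
   * constant scaling while leaving the extra factor of 2**PASS1_BITS in the
   * work array.
   */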
  inptr = coef_block;
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
  wsptr = workspace;
  for (ctr = DCTSIZE; ctr > 0; ctr--) {
    /* Due to quantization, we will usually find that many of the input
     * coefficients are zero, especially the AC terms.  We can exploit this
     * by short-circuiting the IDCT calculation for any column in which all
     * the AC terms are zero.  In that case each output is equal to the
     * DC coefficient (with scale factor as needed).
     * With typical images and quantization tables, half or more of the
     * column DCT calculations can be simplified this way.
     */
    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
        inptr[DCTSIZE*7] == 0) {
      /* AC terms all zero */
      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;

      wsptr[DCTSIZE*0] = dcval;
      wsptr[DCTSIZE*1] = dcval;
      wsptr[DCTSIZE*2] = dcval;
      wsptr[DCTSIZE*3] = dcval;
      wsptr[DCTSIZE*4] = dcval;
      wsptr[DCTSIZE*5] = dcval;
      wsptr[DCTSIZE*6] = dcval;
      wsptr[DCTSIZE*7] = dcval;

      inptr++;			/* advance pointers to next column */
      quantptr++;
      wsptr++;
      continue;
    }
    /* Even part: reverse the even part of the forward DCT. */
    /* The rotator is sqrt(2)*c(-6). */

    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);

    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
    tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);

    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);

    tmp0 = (z2 + z3) << CONST_BITS;
    tmp1 = (z2 - z3) << CONST_BITS;

    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);

    z1 = tmp0 + tmp3;
    z2 = tmp1 + tmp2;
    z3 = tmp0 + tmp2;
    z4 = tmp1 + tmp3;
    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */

    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */

    z3 += z5;
    z4 += z5;

    tmp0 += z1 + z3;
    tmp1 += z2 + z4;
    tmp2 += z2 + z3;
    tmp3 += z1 + z4;
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
    wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
    wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
    wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
    wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
    wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
    wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
    wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);

    inptr++;			/* advance pointers to next column */
    quantptr++;
    wsptr++;
  }
  /* Pass 2: process rows from work array, store into output array. */
  /* Note that we must descale the results by a factor of 8 == 2**3, */
  /* and also undo the PASS1_BITS scaling. */
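  /* The total descale applied below is therefore CONST_BITS+PASS1_BITS+3
   * (13 + 2 + 3 = 18 bits for 8-bit samples): it removes the constant
   * scaling, the pass-1 scaling and the factor of 8 in one shift before
   * range limiting.
   */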
  wsptr = workspace;
  for (ctr = 0; ctr < DCTSIZE; ctr++) {
    outptr = output_buf[ctr] + output_col;
    /* Rows of zeroes can be exploited in the same way as we did with columns.
     * However, the column calculation has created many nonzero AC terms, so
     * the simplification applies less often (typically 5% to 10% of the time).
     * On machines with very fast multiplication, it's possible that the
     * test takes more time than it's worth.  In that case this section
     * may be commented out.
     */
#ifndef NO_ZERO_ROW_TEST
    if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
        wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
      /* AC terms all zero */
      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
                                  & RANGE_MASK];

      outptr[0] = dcval;
      outptr[1] = dcval;
      outptr[2] = dcval;
      outptr[3] = dcval;
      outptr[4] = dcval;
      outptr[5] = dcval;
      outptr[6] = dcval;
      outptr[7] = dcval;

      wsptr += DCTSIZE;		/* advance pointer to next row */
      continue;
    }
#endif
    /* Even part: reverse the even part of the forward DCT. */
    /* The rotator is sqrt(2)*c(-6). */

    z2 = (INT32) wsptr[2];
    z3 = (INT32) wsptr[6];

    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
    tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);

    tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
    tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;

    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    tmp0 = (INT32) wsptr[7];
    tmp1 = (INT32) wsptr[5];
    tmp2 = (INT32) wsptr[3];
    tmp3 = (INT32) wsptr[1];

    z1 = tmp0 + tmp3;
    z2 = tmp1 + tmp2;
    z3 = tmp0 + tmp2;
    z4 = tmp1 + tmp3;
    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */

    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */

    z3 += z5;
    z4 += z5;

    tmp0 += z1 + z3;
    tmp1 += z2 + z4;
    tmp2 += z2 + z3;
    tmp3 += z1 + z4;
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
                                          CONST_BITS+PASS1_BITS+3)
                            & RANGE_MASK];
    outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
                                          CONST_BITS+PASS1_BITS+3)
                            & RANGE_MASK];
    outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
                                          CONST_BITS+PASS1_BITS+3)
                            & RANGE_MASK];
    outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
                                          CONST_BITS+PASS1_BITS+3)
                            & RANGE_MASK];
    outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
                                          CONST_BITS+PASS1_BITS+3)
                            & RANGE_MASK];
    outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
                                          CONST_BITS+PASS1_BITS+3)
                            & RANGE_MASK];
    outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
                                          CONST_BITS+PASS1_BITS+3)
                            & RANGE_MASK];
    outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
                                          CONST_BITS+PASS1_BITS+3)
                            & RANGE_MASK];

    wsptr += DCTSIZE;		/* advance pointer to next row */
  }
}
#ifdef HAVE_SSE2_INTEL_MNEMONICS

/*
 * Intel SSE2 optimized Inverse Discrete Cosine Transform
 *
 * Copyright (c) 2001-2002 Intel Corporation
 * All Rights Reserved
 *
 * Authors:
 *   Danilov G.
 *
 *-----------------------------------------------------------------------------
 *
 * References:
 *   K.R. Rao and P. Yip
 *   Discrete Cosine Transform.
 *   Algorithms, Advantages, Applications.
 *   Academic Press, Inc, London, 1990.
 *
 *   JPEG Group's software.
 *
 * This implementation is based on Appendix A.2 of the book (R&Y) ...
 *
 *-----------------------------------------------------------------------------
 */
typedef unsigned char   Ipp8u;
typedef unsigned short  Ipp16u;
typedef unsigned int    Ipp32u;

typedef signed char     Ipp8s;
typedef signed short    Ipp16s;
typedef signed int      Ipp32s;
#define BITS_INV_ACC   4
#define SHIFT_INV_ROW  16 - BITS_INV_ACC
#define SHIFT_INV_COL  1 + BITS_INV_ACC

#define RND_INV_ROW    1024 * (6 - BITS_INV_ACC)  /* 1 << (SHIFT_INV_ROW-1) */
#define RND_INV_COL    16 * (BITS_INV_ACC - 3)    /* 1 << (SHIFT_INV_COL-1) */
#define RND_INV_CORR   RND_INV_COL - 1            /* correction -1.0 and round */
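/* With BITS_INV_ACC = 4 these evaluate to SHIFT_INV_ROW = 12, SHIFT_INV_COL = 5,
 * RND_INV_ROW = 2048 = 1 << 11 and RND_INV_COL = 16 = 1 << 4, matching the
 * rounding comments above.
 */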
#define c_inv_corr_0   -1024 * (6 - BITS_INV_ACC) + 65536  /* -0.5 + (16.0 or 32.0) */
#define c_inv_corr_1    1877 * (6 - BITS_INV_ACC)          /*  0.9167 */
#define c_inv_corr_2    1236 * (6 - BITS_INV_ACC)          /*  0.6035 */
#define c_inv_corr_3     680 * (6 - BITS_INV_ACC)          /*  0.3322 */
#define c_inv_corr_4       0 * (6 - BITS_INV_ACC)          /*  0.0 */
#define c_inv_corr_5    -569 * (6 - BITS_INV_ACC)          /* -0.278 */
#define c_inv_corr_6    -512 * (6 - BITS_INV_ACC)          /* -0.25 */
#define c_inv_corr_7    -651 * (6 - BITS_INV_ACC)          /* -0.3176 */

#define RND_INV_ROW_0  RND_INV_ROW + c_inv_corr_0
#define RND_INV_ROW_1  RND_INV_ROW + c_inv_corr_1
#define RND_INV_ROW_2  RND_INV_ROW + c_inv_corr_2
#define RND_INV_ROW_3  RND_INV_ROW + c_inv_corr_3
#define RND_INV_ROW_4  RND_INV_ROW + c_inv_corr_4
#define RND_INV_ROW_5  RND_INV_ROW + c_inv_corr_5
#define RND_INV_ROW_6  RND_INV_ROW + c_inv_corr_6
#define RND_INV_ROW_7  RND_INV_ROW + c_inv_corr_7
/* Table for rows 0,4 - constants are multiplied by cos_4_16 */

__declspec(align(16)) short tab_i_04[] = {
  16384, 21407, 16384, 8867,
  -16384, 21407, 16384, -8867,
  16384, -8867, 16384, -21407,
  16384, 8867, -16384, -21407,
  22725, 19266, 19266, -4520,
  4520, 19266, 19266, -22725,
  12873, -22725, 4520, -12873,
  12873, 4520, -22725, -12873};
/* Table for rows 1,7 - constants are multiplied by cos_1_16 */

__declspec(align(16)) short tab_i_17[] = {
  22725, 29692, 22725, 12299,
  -22725, 29692, 22725, -12299,
  22725, -12299, 22725, -29692,
  22725, 12299, -22725, -29692,
  31521, 26722, 26722, -6270,
  6270, 26722, 26722, -31521,
  17855, -31521, 6270, -17855,
  17855, 6270, -31521, -17855};
/* Table for rows 2,6 - constants are multiplied by cos_2_16 */

__declspec(align(16)) short tab_i_26[] = {
  21407, 27969, 21407, 11585,
  -21407, 27969, 21407, -11585,
  21407, -11585, 21407, -27969,
  21407, 11585, -21407, -27969,
  29692, 25172, 25172, -5906,
  5906, 25172, 25172, -29692,
  16819, -29692, 5906, -16819,
  16819, 5906, -29692, -16819};
/* Table for rows 3,5 - constants are multiplied by cos_3_16 */

__declspec(align(16)) short tab_i_35[] = {
  19266, 25172, 19266, 10426,
  -19266, 25172, 19266, -10426,
  19266, -10426, 19266, -25172,
  19266, 10426, -19266, -25172,
  26722, 22654, 22654, -5315,
  5315, 22654, 22654, -26722,
  15137, -26722, 5315, -15137,
  15137, 5315, -26722, -15137};
__declspec(align(16)) long round_i_0[] = {RND_INV_ROW_0,RND_INV_ROW_0,
                                          RND_INV_ROW_0,RND_INV_ROW_0};
__declspec(align(16)) long round_i_1[] = {RND_INV_ROW_1,RND_INV_ROW_1,
                                          RND_INV_ROW_1,RND_INV_ROW_1};
__declspec(align(16)) long round_i_2[] = {RND_INV_ROW_2,RND_INV_ROW_2,
                                          RND_INV_ROW_2,RND_INV_ROW_2};
__declspec(align(16)) long round_i_3[] = {RND_INV_ROW_3,RND_INV_ROW_3,
                                          RND_INV_ROW_3,RND_INV_ROW_3};
__declspec(align(16)) long round_i_4[] = {RND_INV_ROW_4,RND_INV_ROW_4,
                                          RND_INV_ROW_4,RND_INV_ROW_4};
__declspec(align(16)) long round_i_5[] = {RND_INV_ROW_5,RND_INV_ROW_5,
                                          RND_INV_ROW_5,RND_INV_ROW_5};
__declspec(align(16)) long round_i_6[] = {RND_INV_ROW_6,RND_INV_ROW_6,
                                          RND_INV_ROW_6,RND_INV_ROW_6};
__declspec(align(16)) long round_i_7[] = {RND_INV_ROW_7,RND_INV_ROW_7,
                                          RND_INV_ROW_7,RND_INV_ROW_7};
__declspec(align(16)) short tg_1_16[] = {
  13036, 13036, 13036, 13036,	/* tg * (2<<16) + 0.5 */
  13036, 13036, 13036, 13036};
__declspec(align(16)) short tg_2_16[] = {
  27146, 27146, 27146, 27146,	/* tg * (2<<16) + 0.5 */
  27146, 27146, 27146, 27146};
__declspec(align(16)) short tg_3_16[] = {
  -21746, -21746, -21746, -21746,	/* tg * (2<<16) + 0.5 */
  -21746, -21746, -21746, -21746};
__declspec(align(16)) short cos_4_16[] = {
  -19195, -19195, -19195, -19195,	/* cos * (2<<16) + 0.5 */
  -19195, -19195, -19195, -19195};
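/* How these constants appear to be encoded (inferred from the pmulhw usage
 * below, which keeps only the high 16 bits of each 16x16 product, i.e. it
 * effectively multiplies by value/2^16):
 *   tg_1_16  =  13036 ~ tan(1*pi/16) * 2^16
 *   tg_2_16  =  27146 ~ tan(2*pi/16) * 2^16
 *   tg_3_16  = -21746 ~ (tan(3*pi/16) - 1) * 2^16
 *   cos_4_16 = -19195 ~ (cos(4*pi/16) - 1) * 2^16
 * The last two store (x - 1) * 2^16 because a value >= 0.5 would overflow a
 * signed 16-bit word at this scale; the column macro compensates by adding
 * the unscaled operand back in with paddsw after the pmulhw.
 */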
/*
 * In this implementation the outputs of the iDCT-1D are multiplied
 *   for rows 0,4 - by cos_4_16,
 *   for rows 1,7 - by cos_1_16,
 *   for rows 2,6 - by cos_2_16,
 *   for rows 3,5 - by cos_3_16
 * and are shifted left to increase accuracy.
 *
 * For the constants used:
 *   FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
 *
 *-----------------------------------------------------------------------------
 *
 * On the first stage the calculation is executed at once for two rows.
 * The permutation for each output row is done on the second stage:
 *   t7 t6 t5 t4 t3 t2 t1 t0  ->  t4 t5 t6 t7 t3 t2 t1 t0
 *
 *-----------------------------------------------------------------------------
 */
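/* A note on the row macro below: the pshuflw/pshufhw pairs with immediates
 * 10001000b and 11011101b gather the even-indexed words (x0,x2,x4,x6) of a
 * row into one register and the odd-indexed words (x1,x3,x5,x7) into another,
 * so that the pmaddwd operations against the constant tables accumulate the
 * even-part and odd-part partial sums of the 1-D iDCT; the results are then
 * rounded (paddd with the ROUND table), shifted right by SHIFT_INV_ROW and
 * repacked into 16-bit values with packssdw.
 */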
#define DCT_8_INV_ROW_2R(TABLE, ROUND1, ROUND2) __asm { \
  __asm pshuflw  xmm1, xmm0, 10001000b \
  __asm pshuflw  xmm0, xmm0, 11011101b \
  __asm pshufhw  xmm1, xmm1, 10001000b \
  __asm pshufhw  xmm0, xmm0, 11011101b \
  __asm movdqa   xmm2, XMMWORD PTR [TABLE] \
  __asm pmaddwd  xmm2, xmm1 \
  __asm movdqa   xmm3, XMMWORD PTR [TABLE + 32] \
  __asm pmaddwd  xmm3, xmm0 \
  __asm pmaddwd  xmm1, XMMWORD PTR [TABLE + 16] \
  __asm pmaddwd  xmm0, XMMWORD PTR [TABLE + 48] \
  __asm pshuflw  xmm5, xmm4, 10001000b \
  __asm pshuflw  xmm4, xmm4, 11011101b \
  __asm pshufhw  xmm5, xmm5, 10001000b \
  __asm pshufhw  xmm4, xmm4, 11011101b \
  __asm movdqa   xmm6, XMMWORD PTR [TABLE] \
  __asm pmaddwd  xmm6, xmm5 \
  __asm movdqa   xmm7, XMMWORD PTR [TABLE + 32] \
  __asm pmaddwd  xmm7, xmm4 \
  __asm pmaddwd  xmm5, XMMWORD PTR [TABLE + 16] \
  __asm pmaddwd  xmm4, XMMWORD PTR [TABLE + 48] \
  __asm pshufd   xmm1, xmm1, 01001110b \
  __asm pshufd   xmm0, xmm0, 01001110b \
  __asm paddd    xmm2, XMMWORD PTR [ROUND1] \
  __asm paddd    xmm3, xmm0 \
  __asm paddd    xmm1, xmm2 \
  __asm pshufd   xmm5, xmm5, 01001110b \
  __asm pshufd   xmm4, xmm4, 01001110b \
  __asm movdqa   xmm2, xmm1 \
  __asm psubd    xmm2, xmm3 \
  __asm psrad    xmm2, SHIFT_INV_ROW \
  __asm paddd    xmm1, xmm3 \
  __asm psrad    xmm1, SHIFT_INV_ROW \
  __asm packssdw xmm1, xmm2 \
  __asm paddd    xmm6, XMMWORD PTR [ROUND2] \
  __asm paddd    xmm7, xmm4 \
  __asm paddd    xmm5, xmm6 \
  __asm movdqa   xmm6, xmm5 \
  __asm psubd    xmm6, xmm7 \
  __asm psrad    xmm6, SHIFT_INV_ROW \
  __asm paddd    xmm5, xmm7 \
  __asm psrad    xmm5, SHIFT_INV_ROW \
  __asm packssdw xmm5, xmm6 \
  }
/*
 * The second stage - inverse DCTs of columns.
 *
 * The inputs are multiplied
 *   for rows 0,4 - by cos_4_16,
 *   for rows 1,7 - by cos_1_16,
 *   for rows 2,6 - by cos_2_16,
 *   for rows 3,5 - by cos_3_16
 * and are shifted left to increase accuracy.
 */
#define DCT_8_INV_COL_8R(INP, OUTP) __asm { \
  __asm movdqa  xmm0, [INP + 5*16] \
  __asm movdqa  xmm1, XMMWORD PTR tg_3_16 \
  __asm movdqa  xmm2, xmm0 \
  __asm movdqa  xmm3, [INP + 3*16] \
  __asm pmulhw  xmm0, xmm1 \
  __asm movdqa  xmm4, [INP + 7*16] \
  __asm pmulhw  xmm1, xmm3 \
  __asm movdqa  xmm5, XMMWORD PTR tg_1_16 \
  __asm movdqa  xmm6, xmm4 \
  __asm pmulhw  xmm4, xmm5 \
  __asm paddsw  xmm0, xmm2 \
  __asm pmulhw  xmm5, [INP + 1*16] \
  __asm paddsw  xmm1, xmm3 \
  __asm movdqa  xmm7, [INP + 6*16] \
  __asm paddsw  xmm0, xmm3 \
  __asm movdqa  xmm3, XMMWORD PTR tg_2_16 \
  __asm psubsw  xmm2, xmm1 \
  __asm pmulhw  xmm7, xmm3 \
  __asm movdqa  xmm1, xmm0 \
  __asm pmulhw  xmm3, [INP + 2*16] \
  __asm psubsw  xmm5, xmm6 \
  __asm paddsw  xmm4, [INP + 1*16] \
  __asm paddsw  xmm0, xmm4 \
  __asm psubsw  xmm4, xmm1 \
  __asm pshufhw xmm0, xmm0, 00011011b \
  __asm paddsw  xmm7, [INP + 2*16] \
  __asm movdqa  xmm6, xmm5 \
  __asm psubsw  xmm3, [INP + 6*16] \
  __asm psubsw  xmm5, xmm2 \
  __asm paddsw  xmm6, xmm2 \
  __asm movdqa  [OUTP + 7*16], xmm0 \
  __asm movdqa  xmm1, xmm4 \
  __asm movdqa  xmm2, XMMWORD PTR cos_4_16 \
  __asm paddsw  xmm4, xmm5 \
  __asm movdqa  xmm0, XMMWORD PTR cos_4_16 \
  __asm pmulhw  xmm2, xmm4 \
  __asm pshufhw xmm6, xmm6, 00011011b \
  __asm movdqa  [OUTP + 3*16], xmm6 \
  __asm psubsw  xmm1, xmm5 \
  __asm movdqa  xmm6, [INP + 0*16] \
  __asm pmulhw  xmm0, xmm1 \
  __asm movdqa  xmm5, [INP + 4*16] \
  __asm paddsw  xmm4, xmm2 \
  __asm paddsw  xmm5, xmm6 \
  __asm psubsw  xmm6, [INP + 4*16] \
  __asm paddsw  xmm0, xmm1 \
  __asm pshufhw xmm4, xmm4, 00011011b \
  __asm movdqa  xmm2, xmm5 \
  __asm paddsw  xmm5, xmm7 \
  __asm movdqa  xmm1, xmm6 \
  __asm psubsw  xmm2, xmm7 \
  __asm movdqa  xmm7, [OUTP + 7*16] \
  __asm paddsw  xmm6, xmm3 \
  __asm pshufhw xmm5, xmm5, 00011011b \
  __asm paddsw  xmm7, xmm5 \
  __asm psubsw  xmm1, xmm3 \
  __asm pshufhw xmm6, xmm6, 00011011b \
  __asm movdqa  xmm3, xmm6 \
  __asm paddsw  xmm6, xmm4 \
  __asm pshufhw xmm2, xmm2, 00011011b \
  __asm psraw   xmm7, SHIFT_INV_COL \
  __asm movdqa  [OUTP + 0*16], xmm7 \
  __asm movdqa  xmm7, xmm1 \
  __asm paddsw  xmm1, xmm0 \
  __asm psraw   xmm6, SHIFT_INV_COL \
  __asm movdqa  [OUTP + 1*16], xmm6 \
  __asm pshufhw xmm1, xmm1, 00011011b \
  __asm movdqa  xmm6, [OUTP + 3*16] \
  __asm psubsw  xmm7, xmm0 \
  __asm psraw   xmm1, SHIFT_INV_COL \
  __asm movdqa  [OUTP + 2*16], xmm1 \
  __asm psubsw  xmm5, [OUTP + 7*16] \
  __asm paddsw  xmm6, xmm2 \
  __asm psubsw  xmm2, [OUTP + 3*16] \
  __asm psubsw  xmm3, xmm4 \
  __asm psraw   xmm7, SHIFT_INV_COL \
  __asm pshufhw xmm7, xmm7, 00011011b \
  __asm movdqa  [OUTP + 5*16], xmm7 \
  __asm psraw   xmm5, SHIFT_INV_COL \
  __asm movdqa  [OUTP + 7*16], xmm5 \
  __asm psraw   xmm6, SHIFT_INV_COL \
  __asm movdqa  [OUTP + 3*16], xmm6 \
  __asm psraw   xmm2, SHIFT_INV_COL \
  __asm movdqa  [OUTP + 4*16], xmm2 \
  __asm psraw   xmm3, SHIFT_INV_COL \
  __asm movdqa  [OUTP + 6*16], xmm3 \
  }
/*
 * Name:       dct_8x8_inv_16s
 * Purpose:    Inverse Discrete Cosine Transform 8x8 with
 *             2D buffer of short int data
 * Context:
 *   void dct_8x8_inv_16s ( short *src, short *dst )
 * Parameters:
 *   src - Pointer to the source buffer
 *   dst - Pointer to the destination buffer
 */
GLOBAL(void)
dct_8x8_inv_16s ( short *src, short *dst ) {

  __asm {

    mov ecx, src
    mov edx, dst

    movdqa xmm0, [ecx+0*16]
    movdqa xmm4, [ecx+4*16]
    DCT_8_INV_ROW_2R(tab_i_04, round_i_0, round_i_4)
    movdqa [edx+0*16], xmm1
    movdqa [edx+4*16], xmm5

    movdqa xmm0, [ecx+1*16]
    movdqa xmm4, [ecx+7*16]
    DCT_8_INV_ROW_2R(tab_i_17, round_i_1, round_i_7)
    movdqa [edx+1*16], xmm1
    movdqa [edx+7*16], xmm5

    movdqa xmm0, [ecx+3*16]
    movdqa xmm4, [ecx+5*16]
    DCT_8_INV_ROW_2R(tab_i_35, round_i_3, round_i_5);
    movdqa [edx+3*16], xmm1
    movdqa [edx+5*16], xmm5

    movdqa xmm0, [ecx+2*16]
    movdqa xmm4, [ecx+6*16]
    DCT_8_INV_ROW_2R(tab_i_26, round_i_2, round_i_6);
    movdqa [edx+2*16], xmm1
    movdqa [edx+6*16], xmm5

    DCT_8_INV_COL_8R(edx+0, edx+0);
  }
}
/*
 * Name:
 *   ownpj_QuantInv_8x8_16s
 *
 * Purpose:
 *   Dequantize 8x8 block of DCT coefficients
 *
 * Context:
 *   void ownpj_QuantInv_8x8_16s(
 *          Ipp16s* pSrc,
 *          Ipp16s* pDst,
 *          const Ipp16u* pQTbl)
 */
GLOBAL(void)
ownpj_QuantInv_8x8_16s(short * pSrc, short * pDst, const unsigned short * pQTbl)
{
  __asm {

    push ebx
    push ecx
    push edx
    push esi
    push edi

    mov esi, pSrc
    mov edi, pDst
    mov edx, pQTbl
    mov ecx, 4
    mov ebx, 32

again:

    movq mm0, QWORD PTR [esi+0]
    movq mm1, QWORD PTR [esi+8]
    movq mm2, QWORD PTR [esi+16]
    movq mm3, QWORD PTR [esi+24]

    prefetcht0 [esi+ebx]	; fetch next cache line

    pmullw mm0, QWORD PTR [edx+0]
    pmullw mm1, QWORD PTR [edx+8]
    pmullw mm2, QWORD PTR [edx+16]
    pmullw mm3, QWORD PTR [edx+24]

    movq QWORD PTR [edi+0], mm0
    movq QWORD PTR [edi+8], mm1
    movq QWORD PTR [edi+16], mm2
    movq QWORD PTR [edi+24], mm3

    add esi, ebx
    add edi, ebx
    add edx, ebx
    dec ecx
    jnz again

    emms

    pop edi
    pop esi
    pop edx
    pop ecx
    pop ebx
  }
}
/*
 * Name:
 *   ownpj_Add128_8x8_16s8u
 *
 * Purpose:
 *   signed to unsigned conversion (level shift)
 *   for 8x8 block of DCT coefficients
 *
 * Context:
 *   void ownpj_Add128_8x8_16s8u(
 *          const Ipp16s* pSrc,
 *          Ipp8u* pDst,
 *          int DstStep);
 */
__declspec(align(16)) long const_128[] = {0x00800080, 0x00800080, 0x00800080, 0x00800080};
GLOBAL(void)
ownpj_Add128_8x8_16s8u(const short * pSrc, unsigned char * pDst, int DstStep)
{
  __asm {

    push eax
    push ebx
    push ecx
    push edx
    push esi
    push edi

    mov esi, pSrc
    mov edi, pDst
    mov edx, DstStep
    mov ecx, 2
    mov ebx, edx
    mov eax, edx
    sal ebx, 1
    add eax, ebx
    movdqa xmm7, XMMWORD PTR const_128

again:

    movdqa xmm0, XMMWORD PTR [esi+0]	; line 0
    movdqa xmm1, XMMWORD PTR [esi+16]	; line 1
    movdqa xmm2, XMMWORD PTR [esi+32]	; line 2
    movdqa xmm3, XMMWORD PTR [esi+48]	; line 3

    paddw xmm0, xmm7
    paddw xmm1, xmm7
    paddw xmm2, xmm7
    paddw xmm3, xmm7

    packuswb xmm0, xmm1
    packuswb xmm2, xmm3

    movq QWORD PTR [edi], xmm0		; 0*DstStep
    movq QWORD PTR [edi+ebx], xmm2	; 2*DstStep

    psrldq xmm0, 8
    psrldq xmm2, 8

    movq QWORD PTR [edi+edx], xmm0	; 1*DstStep
    movq QWORD PTR [edi+eax], xmm2	; 3*DstStep

    add edi, ebx
    add esi, 64
    add edi, ebx
    dec ecx
    jnz again

    pop edi
    pop esi
    pop edx
    pop ecx
    pop ebx
    pop eax
  }
}
/*
 * Name:
 *   ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R
 *
 * Purpose:
 *   Inverse DCT transform, de-quantization and level shift
 *
 * Parameters:
 *   pSrc               - pointer to source
 *   pDst               - pointer to output array
 *   DstStep            - line offset for output data
 *   pEncoderQuantTable - pointer to Quantization table
 */
GLOBAL(void)
ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(
  short * pSrc,
  unsigned char * pDst,
  int DstStep,
  const unsigned short * pQuantInvTable)
{
  __declspec(align(16)) Ipp8u buf[DCTSIZE2*sizeof(Ipp16s)];
  Ipp16s * workbuf = (Ipp16s *)buf;

  ownpj_QuantInv_8x8_16s(pSrc,workbuf,pQuantInvTable);
  dct_8x8_inv_16s(workbuf,workbuf);
  ownpj_Add128_8x8_16s8u(workbuf,pDst,DstStep);
}
GLOBAL(void)
jpeg_idct_islow_sse2 (
  j_decompress_ptr cinfo,
  jpeg_component_info * compptr,
  JCOEFPTR coef_block,
  JSAMPARRAY output_buf,
  JDIMENSION output_col)
{
  int ctr;
  JCOEFPTR inptr;
  Ipp16u* quantptr;
  Ipp8u* wsptr;
  __declspec(align(16)) Ipp8u workspace[DCTSIZE2];
  JSAMPROW outptr;

  inptr = coef_block;
  quantptr = (Ipp16u*)compptr->dct_table;
  wsptr = workspace;

  ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(inptr, workspace, 8, quantptr);

  for(ctr = 0; ctr < DCTSIZE; ctr++)
  {
    outptr = output_buf[ctr] + output_col;

    outptr[0] = wsptr[0];
    outptr[1] = wsptr[1];
    outptr[2] = wsptr[2];
    outptr[3] = wsptr[3];
    outptr[4] = wsptr[4];
    outptr[5] = wsptr[5];
    outptr[6] = wsptr[6];
    outptr[7] = wsptr[7];

    wsptr += DCTSIZE;
  }
}
#endif /* HAVE_SSE2_INTEL_MNEMONICS */

#endif /* DCT_ISLOW_SUPPORTED */