libmpeg2/idct_alpha.c

   1 /*
   2  * idct_alpha.c
   3  * Copyright (C) 2002-2003 Falk Hueffner <falk@debian.org>
   4  * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
   5  * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
   6  *
   7  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
   8  * See http://libmpeg2.sourceforge.net/ for updates.
   9  *
  10  * mpeg2dec is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * mpeg2dec is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  23  */
  24
  25 #include "config.h"
  26
  27 #if ARCH_ALPHA
  28
  29 #include <stdlib.h>
  30 #include <inttypes.h>
  31
  32 #include "mpeg2.h"
  33 #include "attributes.h"
  34 #include "mpeg2_internal.h"
  35 #include "alpha_asm.h"
  36
  37 #define W1 2841 /* 2048 * sqrt (2) * cos (1 * pi / 16) */
  38 #define W2 2676 /* 2048 * sqrt (2) * cos (2 * pi / 16) */
  39 #define W3 2408 /* 2048 * sqrt (2) * cos (3 * pi / 16) */
  40 #define W5 1609 /* 2048 * sqrt (2) * cos (5 * pi / 16) */
  41 #define W6 1108 /* 2048 * sqrt (2) * cos (6 * pi / 16) */
  42 #define W7 565  /* 2048 * sqrt (2) * cos (7 * pi / 16) */
  43
  44 extern uint8_t mpeg2_clip[3840 * 2 + 256];
  45 #define CLIP(i) ((mpeg2_clip + 3840)[i])
  46
  47 #if 0
  48 #define BUTTERFLY(t0,t1,W0,W1,d0,d1)    \
  49 do {                                    \
  50     t0 = W0 * d0 + W1 * d1;                     \
  51     t1 = W0 * d1 - W1 * d0;                     \
  52 } while (0)
  53 #else
  54 #define BUTTERFLY(t0,t1,W0,W1,d0,d1)    \
  55 do {                                    \
  56     int_fast32_t tmp = W0 * (d0 + d1);  \
  57     t0 = tmp + (W1 - W0) * d1;          \
  58     t1 = tmp - (W1 + W0) * d0;          \
  59 } while (0)
  60 #endif
  61
  62 static inline void idct_row (int16_t * const block)
  63 {
  64     uint64_t l, r;
  65     int_fast32_t d0, d1, d2, d3;
  66     int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
  67     int_fast32_t t0, t1, t2, t3;
  68
  69     l = ldq (block);
  70     r = ldq (block + 4);
  71
  72     /* shortcut */
  73     if (likely (!((l & ~0xffffUL) | r))) {
  74         uint64_t tmp = (uint16_t) (l >> 1);
  75         tmp |= tmp << 16;
  76         tmp |= tmp << 32;
  77         ((int32_t *)block)[0] = tmp;
  78         ((int32_t *)block)[1] = tmp;
  79         ((int32_t *)block)[2] = tmp;
  80         ((int32_t *)block)[3] = tmp;
  81         return;
  82     }
  83
  84     d0 = (sextw (l) << 11) + 2048;
  85     d1 = sextw (extwl (l, 2));
  86     d2 = sextw (extwl (l, 4)) << 11;
  87     d3 = sextw (extwl (l, 6));
  88     t0 = d0 + d2;
  89     t1 = d0 - d2;
  90     BUTTERFLY (t2, t3, W6, W2, d3, d1);
  91     a0 = t0 + t2;
  92     a1 = t1 + t3;
  93     a2 = t1 - t3;
  94     a3 = t0 - t2;
  95
  96     d0 = sextw (r);
  97     d1 = sextw (extwl (r, 2));
  98     d2 = sextw (extwl (r, 4));
  99     d3 = sextw (extwl (r, 6));
 100     BUTTERFLY (t0, t1, W7, W1, d3, d0);
 101     BUTTERFLY (t2, t3, W3, W5, d1, d2);
 102     b0 = t0 + t2;
 103     b3 = t1 + t3;
 104     t0 -= t2;
 105     t1 -= t3;
 106     b1 = ((t0 + t1) >> 8) * 181;
 107     b2 = ((t0 - t1) >> 8) * 181;
 108
 109     block[0] = (a0 + b0) >> 12;
 110     block[1] = (a1 + b1) >> 12;
 111     block[2] = (a2 + b2) >> 12;
 112     block[3] = (a3 + b3) >> 12;
 113     block[4] = (a3 - b3) >> 12;
 114     block[5] = (a2 - b2) >> 12;
 115     block[6] = (a1 - b1) >> 12;
 116     block[7] = (a0 - b0) >> 12;
 117 }
 118
 119 static inline void idct_col (int16_t * const block)
 120 {
 121     int_fast32_t d0, d1, d2, d3;
 122     int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
 123     int_fast32_t t0, t1, t2, t3;
 124
 125     d0 = (block[8*0] << 11) + 65536;
 126     d1 = block[8*1];
 127     d2 = block[8*2] << 11;
 128     d3 = block[8*3];
 129     t0 = d0 + d2;
 130     t1 = d0 - d2;
 131     BUTTERFLY (t2, t3, W6, W2, d3, d1);
 132     a0 = t0 + t2;
 133     a1 = t1 + t3;
 134     a2 = t1 - t3;
 135     a3 = t0 - t2;
 136
 137     d0 = block[8*4];
 138     d1 = block[8*5];
 139     d2 = block[8*6];
 140     d3 = block[8*7];
 141     BUTTERFLY (t0, t1, W7, W1, d3, d0);
 142     BUTTERFLY (t2, t3, W3, W5, d1, d2);
 143     b0 = t0 + t2;
 144     b3 = t1 + t3;
 145     t0 -= t2;
 146     t1 -= t3;
 147     b1 = ((t0 + t1) >> 8) * 181;
 148     b2 = ((t0 - t1) >> 8) * 181;
 149
 150     block[8*0] = (a0 + b0) >> 17;
 151     block[8*1] = (a1 + b1) >> 17;
 152     block[8*2] = (a2 + b2) >> 17;
 153     block[8*3] = (a3 + b3) >> 17;
 154     block[8*4] = (a3 - b3) >> 17;
 155     block[8*5] = (a2 - b2) >> 17;
 156     block[8*6] = (a1 - b1) >> 17;
 157     block[8*7] = (a0 - b0) >> 17;
 158 }
 159
 160 void mpeg2_idct_copy_mvi (int16_t * block, uint8_t * dest, const int stride)
 161 {
 162     uint64_t clampmask;
 163     int i;
 164
 165     for (i = 0; i < 8; i++)
 166         idct_row (block + 8 * i);
 167
 168     for (i = 0; i < 8; i++)
 169         idct_col (block + i);
 170
 171     clampmask = zap (-1, 0xaa); /* 0x00ff00ff00ff00ff */
 172     do {
 173         uint64_t shorts0, shorts1;
 174
 175         shorts0 = ldq (block);
 176         shorts0 = maxsw4 (shorts0, 0);
 177         shorts0 = minsw4 (shorts0, clampmask);
 178         stl (pkwb (shorts0), dest);
 179
 180         shorts1 = ldq (block + 4);
 181         shorts1 = maxsw4 (shorts1, 0);
 182         shorts1 = minsw4 (shorts1, clampmask);
 183         stl (pkwb (shorts1), dest + 4);
 184
 185         stq (0, block);
 186         stq (0, block + 4);
 187
 188         dest += stride;
 189         block += 8;
 190     } while (--i);
 191 }
 192
 193 void mpeg2_idct_add_mvi (const int last, int16_t * block,
 194                          uint8_t * dest, const int stride)
 195 {
 196     uint64_t clampmask;
 197     uint64_t signmask;
 198     int i;
 199
 200     if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
 201         for (i = 0; i < 8; i++)
 202             idct_row (block + 8 * i);
 203         for (i = 0; i < 8; i++)
 204             idct_col (block + i);
 205         clampmask = zap (-1, 0xaa);     /* 0x00ff00ff00ff00ff */
 206         signmask = zap (-1, 0x33);
 207         signmask ^= signmask >> 1;      /* 0x8000800080008000 */
 208
 209         do {
 210             uint64_t shorts0, pix0, signs0;
 211             uint64_t shorts1, pix1, signs1;
 212
 213             shorts0 = ldq (block);
 214             shorts1 = ldq (block + 4);
 215
 216             pix0 = unpkbw (ldl (dest));
 217             /* signed subword add (MMX paddw).  */
 218             signs0 = shorts0 & signmask;
 219             shorts0 &= ~signmask;
 220             shorts0 += pix0;
 221             shorts0 ^= signs0;
 222             /* clamp. */
 223             shorts0 = maxsw4 (shorts0, 0);
 224             shorts0 = minsw4 (shorts0, clampmask);
 225
 226             /* next 4.  */
 227             pix1 = unpkbw (ldl (dest + 4));
 228             signs1 = shorts1 & signmask;
 229             shorts1 &= ~signmask;
 230             shorts1 += pix1;
 231             shorts1 ^= signs1;
 232             shorts1 = maxsw4 (shorts1, 0);
 233             shorts1 = minsw4 (shorts1, clampmask);
 234
 235             stl (pkwb (shorts0), dest);
 236             stl (pkwb (shorts1), dest + 4);
 237             stq (0, block);
 238             stq (0, block + 4);
 239
 240             dest += stride;
 241             block += 8;
 242         } while (--i);
 243     } else {
 244         int DC;
 245         uint64_t p0, p1, p2, p3, p4, p5, p6, p7;
 246         uint64_t DCs;
 247
 248         DC = (block[0] + 64) >> 7;
 249         block[0] = block[63] = 0;
 250
 251         p0 = ldq (dest + 0 * stride);
 252         p1 = ldq (dest + 1 * stride);
 253         p2 = ldq (dest + 2 * stride);
 254         p3 = ldq (dest + 3 * stride);
 255         p4 = ldq (dest + 4 * stride);
 256         p5 = ldq (dest + 5 * stride);
 257         p6 = ldq (dest + 6 * stride);
 258         p7 = ldq (dest + 7 * stride);
 259
 260         if (DC > 0) {
 261             DCs = BYTE_VEC (likely (DC <= 255) ? DC : 255);
 262             p0 += minub8 (DCs, ~p0);
 263             p1 += minub8 (DCs, ~p1);
 264             p2 += minub8 (DCs, ~p2);
 265             p3 += minub8 (DCs, ~p3);
 266             p4 += minub8 (DCs, ~p4);
 267             p5 += minub8 (DCs, ~p5);
 268             p6 += minub8 (DCs, ~p6);
 269             p7 += minub8 (DCs, ~p7);
 270         } else {
 271             DCs = BYTE_VEC (likely (-DC <= 255) ? -DC : 255);
 272             p0 -= minub8 (DCs, p0);
 273             p1 -= minub8 (DCs, p1);
 274             p2 -= minub8 (DCs, p2);
 275             p3 -= minub8 (DCs, p3);
 276             p4 -= minub8 (DCs, p4);
 277             p5 -= minub8 (DCs, p5);
 278             p6 -= minub8 (DCs, p6);
 279             p7 -= minub8 (DCs, p7);
 280         }
 281
 282         stq (p0, dest + 0 * stride);
 283         stq (p1, dest + 1 * stride);
 284         stq (p2, dest + 2 * stride);
 285         stq (p3, dest + 3 * stride);
 286         stq (p4, dest + 4 * stride);
 287         stq (p5, dest + 5 * stride);
 288         stq (p6, dest + 6 * stride);
 289         stq (p7, dest + 7 * stride);
 290     }
 291 }
 292
 293 void mpeg2_idct_copy_alpha (int16_t * block, uint8_t * dest, const int stride)
 294 {
 295     int i;
 296
 297     for (i = 0; i < 8; i++)
 298         idct_row (block + 8 * i);
 299     for (i = 0; i < 8; i++)
 300         idct_col (block + i);
 301     do {
 302         dest[0] = CLIP (block[0]);
 303         dest[1] = CLIP (block[1]);
 304         dest[2] = CLIP (block[2]);
 305         dest[3] = CLIP (block[3]);
 306         dest[4] = CLIP (block[4]);
 307         dest[5] = CLIP (block[5]);
 308         dest[6] = CLIP (block[6]);
 309         dest[7] = CLIP (block[7]);
 310
 311         stq(0, block);
 312         stq(0, block + 4);
 313
 314         dest += stride;
 315         block += 8;
 316     } while (--i);
 317 }
 318
 319 void mpeg2_idct_add_alpha (const int last, int16_t * block,
 320                            uint8_t * dest, const int stride)
 321 {
 322     int i;
 323
 324     if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
 325         for (i = 0; i < 8; i++)
 326             idct_row (block + 8 * i);
 327         for (i = 0; i < 8; i++)
 328             idct_col (block + i);
 329         do {
 330             dest[0] = CLIP (block[0] + dest[0]);
 331             dest[1] = CLIP (block[1] + dest[1]);
 332             dest[2] = CLIP (block[2] + dest[2]);
 333             dest[3] = CLIP (block[3] + dest[3]);
 334             dest[4] = CLIP (block[4] + dest[4]);
 335             dest[5] = CLIP (block[5] + dest[5]);
 336             dest[6] = CLIP (block[6] + dest[6]);
 337             dest[7] = CLIP (block[7] + dest[7]);
 338
 339             stq(0, block);
 340             stq(0, block + 4);
 341
 342             dest += stride;
 343             block += 8;
 344         } while (--i);
 345     } else {
 346         int DC;
 347
 348         DC = (block[0] + 64) >> 7;
 349         block[0] = block[63] = 0;
 350         i = 8;
 351         do {
 352             dest[0] = CLIP (DC + dest[0]);
 353             dest[1] = CLIP (DC + dest[1]);
 354             dest[2] = CLIP (DC + dest[2]);
 355             dest[3] = CLIP (DC + dest[3]);
 356             dest[4] = CLIP (DC + dest[4]);
 357             dest[5] = CLIP (DC + dest[5]);
 358             dest[6] = CLIP (DC + dest[6]);
 359             dest[7] = CLIP (DC + dest[7]);
 360             dest += stride;
 361         } while (--i);
 362     }
 363 }
 364
 365 void mpeg2_idct_alpha_init (void)
 366 {
 367     int i, j;
 368
 369     for (i = 0; i < 64; i++) {
 370         j = mpeg2_scan_norm[i];
 371         mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
 372         j = mpeg2_scan_alt[i];
 373         mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
 374     }
 375 }
 376
 377 #endif /* ARCH_ALPHA */