libmpeg2/motion_comp_alpha.c

   1 /*
   2  * motion_comp_alpha.c
   3  * Copyright (C) 2002-2003 Falk Hueffner <falk@debian.org>
   4  *
   5  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
   6  * See http://libmpeg2.sourceforge.net/ for updates.
   7  *
   8  * mpeg2dec is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * mpeg2dec is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21  */
  22
  23 #include "config.h"
  24
  25 #ifdef ARCH_ALPHA
  26
  27 #include <inttypes.h>
  28
  29 #include "mpeg2.h"
  30 #include "attributes.h"
  31 #include "mpeg2_internal.h"
  32 #include "alpha_asm.h"
  33
  34 static inline uint64_t avg2 (uint64_t a, uint64_t b)
  35 {
  36     return (a | b) - (((a ^ b) & BYTE_VEC (0xfe)) >> 1);
  37 }
  38
  39 // Load two unaligned quadwords from addr. This macro only works if
  40 // addr is actually unaligned.
  41 #define ULOAD16(ret_l,ret_r,addr)                       \
  42     do {                                                \
  43         uint64_t _l = ldq_u (addr +  0);                \
  44         uint64_t _m = ldq_u (addr +  8);                \
  45         uint64_t _r = ldq_u (addr + 16);                \
  46         ret_l = extql (_l, addr) | extqh (_m, addr);    \
  47         ret_r = extql (_m, addr) | extqh (_r, addr);    \
  48     } while (0)
  49
  50 // Load two aligned quadwords from addr.
  51 #define ALOAD16(ret_l,ret_r,addr)                       \
  52     do {                                                \
  53         ret_l = ldq (addr);                             \
  54         ret_r = ldq (addr + 8);                         \
  55     } while (0)
  56
  57 #define OP8(LOAD,LOAD16,STORE)                  \
  58     do {                                        \
  59         STORE (LOAD (pixels), block);           \
  60         pixels += line_size;                    \
  61         block += line_size;                     \
  62     } while (--h)
  63
  64 #define OP16(LOAD,LOAD16,STORE)                 \
  65     do {                                        \
  66         uint64_t l, r;                          \
  67         LOAD16 (l, r, pixels);                  \
  68         STORE (l, block);                       \
  69         STORE (r, block + 8);                   \
  70         pixels += line_size;                    \
  71         block += line_size;                     \
  72     } while (--h)
  73
  74 #define OP8_X2(LOAD,LOAD16,STORE)                       \
  75     do {                                                \
  76         uint64_t p0, p1;                                \
  77                                                         \
  78         p0 = LOAD (pixels);                             \
  79         p1 = p0 >> 8 | ((uint64_t) pixels[8] << 56);    \
  80         STORE (avg2 (p0, p1), block);                   \
  81         pixels += line_size;                            \
  82         block += line_size;                             \
  83     } while (--h)
  84
  85 #define OP16_X2(LOAD,LOAD16,STORE)                              \
  86     do {                                                        \
  87         uint64_t p0, p1;                                        \
  88                                                                 \
  89         LOAD16 (p0, p1, pixels);                                \
  90         STORE (avg2(p0, p0 >> 8 | p1 << 56), block);            \
  91         STORE (avg2(p1, p1 >> 8 | (uint64_t) pixels[16] << 56), \
  92                block + 8);                                      \
  93         pixels += line_size;                                    \
  94         block += line_size;                                     \
  95     } while (--h)
  96
  97 #define OP8_Y2(LOAD,LOAD16,STORE)               \
  98     do {                                        \
  99         uint64_t p0, p1;                        \
 100         p0 = LOAD (pixels);                     \
 101         pixels += line_size;                    \
 102         p1 = LOAD (pixels);                     \
 103         do {                                    \
 104             uint64_t av = avg2 (p0, p1);        \
 105             if (--h == 0) line_size = 0;        \
 106             pixels += line_size;                \
 107             p0 = p1;                            \
 108             p1 = LOAD (pixels);                 \
 109             STORE (av, block);                  \
 110             block += line_size;                 \
 111         } while (h);                            \
 112     } while (0)
 113
 114 #define OP16_Y2(LOAD,LOAD16,STORE)              \
 115     do {                                        \
 116         uint64_t p0l, p0r, p1l, p1r;            \
 117         LOAD16 (p0l, p0r, pixels);              \
 118         pixels += line_size;                    \
 119         LOAD16 (p1l, p1r, pixels);              \
 120         do {                                    \
 121             uint64_t avl, avr;                  \
 122             if (--h == 0) line_size = 0;        \
 123             avl = avg2 (p0l, p1l);              \
 124             avr = avg2 (p0r, p1r);              \
 125             p0l = p1l;                          \
 126             p0r = p1r;                          \
 127             pixels += line_size;                \
 128             LOAD16 (p1l, p1r, pixels);          \
 129             STORE (avl, block);                 \
 130             STORE (avr, block + 8);             \
 131             block += line_size;                 \
 132         } while (h);                            \
 133     } while (0)
 134
 135 #define OP8_XY2(LOAD,LOAD16,STORE)                              \
 136     do {                                                        \
 137         uint64_t pl, ph;                                        \
 138         uint64_t p1 = LOAD (pixels);                            \
 139         uint64_t p2 = p1 >> 8 | ((uint64_t) pixels[8] << 56);   \
 140                                                                 \
 141         ph = (((p1 & ~BYTE_VEC (0x03)) >> 2) +                  \
 142               ((p2 & ~BYTE_VEC (0x03)) >> 2));                  \
 143         pl = ((p1 & BYTE_VEC (0x03)) +                          \
 144               (p2 & BYTE_VEC (0x03)));                          \
 145                                                                 \
 146         do {                                                    \
 147             uint64_t npl, nph;                                  \
 148                                                                 \
 149             pixels += line_size;                                \
 150             p1 = LOAD (pixels);                                 \
 151             p2 = (p1 >> 8) | ((uint64_t) pixels[8] << 56);      \
 152             nph = (((p1 & ~BYTE_VEC (0x03)) >> 2) +             \
 153                    ((p2 & ~BYTE_VEC (0x03)) >> 2));             \
 154             npl = ((p1 & BYTE_VEC (0x03)) +                     \
 155                    (p2 & BYTE_VEC (0x03)));                     \
 156                                                                 \
 157             STORE (ph + nph +                                   \
 158                    (((pl + npl + BYTE_VEC (0x02)) >> 2) &       \
 159                     BYTE_VEC (0x03)), block);                   \
 160                                                                 \
 161             block += line_size;                                 \
 162             pl = npl;                                           \
 163             ph = nph;                                           \
 164         } while (--h);                                          \
 165     } while (0)
 166
 167 #define OP16_XY2(LOAD,LOAD16,STORE)                             \
 168     do {                                                        \
 169         uint64_t p0, p1, p2, p3, pl_l, ph_l, pl_r, ph_r;        \
 170         LOAD16 (p0, p2, pixels);                                \
 171         p1 = p0 >> 8 | (p2 << 56);                              \
 172         p3 = p2 >> 8 | ((uint64_t)pixels[16] << 56);            \
 173                                                                 \
 174         ph_l = (((p0 & ~BYTE_VEC (0x03)) >> 2) +                \
 175                 ((p1 & ~BYTE_VEC (0x03)) >> 2));                \
 176         pl_l = ((p0 & BYTE_VEC (0x03)) +                        \
 177                 (p1 & BYTE_VEC(0x03)));                         \
 178         ph_r = (((p2 & ~BYTE_VEC (0x03)) >> 2) +                \
 179                 ((p3 & ~BYTE_VEC (0x03)) >> 2));                \
 180         pl_r = ((p2 & BYTE_VEC (0x03)) +                        \
 181                 (p3 & BYTE_VEC (0x03)));                        \
 182                                                                 \
 183         do {                                                    \
 184             uint64_t npl_l, nph_l, npl_r, nph_r;                \
 185                                                                 \
 186             pixels += line_size;                                \
 187             LOAD16 (p0, p2, pixels);                            \
 188             p1 = p0 >> 8 | (p2 << 56);                          \
 189             p3 = p2 >> 8 | ((uint64_t)pixels[16] << 56);        \
 190             nph_l = (((p0 & ~BYTE_VEC (0x03)) >> 2) +           \
 191                      ((p1 & ~BYTE_VEC (0x03)) >> 2));           \
 192             npl_l = ((p0 & BYTE_VEC (0x03)) +                   \
 193                      (p1 & BYTE_VEC (0x03)));                   \
 194             nph_r = (((p2 & ~BYTE_VEC (0x03)) >> 2) +           \
 195                      ((p3 & ~BYTE_VEC (0x03)) >> 2));           \
 196             npl_r = ((p2 & BYTE_VEC (0x03)) +                   \
 197                      (p3 & BYTE_VEC (0x03)));                   \
 198                                                                 \
 199             STORE (ph_l + nph_l +                               \
 200                    (((pl_l + npl_l + BYTE_VEC (0x02)) >> 2) &   \
 201                     BYTE_VEC(0x03)), block);                    \
 202             STORE (ph_r + nph_r +                               \
 203                    (((pl_r + npl_r + BYTE_VEC (0x02)) >> 2) &   \
 204                     BYTE_VEC(0x03)), block + 8);                \
 205                                                                 \
 206             block += line_size;                                 \
 207             pl_l = npl_l;                                       \
 208             ph_l = nph_l;                                       \
 209             pl_r = npl_r;                                       \
 210             ph_r = nph_r;                                       \
 211         } while (--h);                                          \
 212     } while (0)
 213
 214 #define MAKE_OP(OPNAME,SIZE,SUFF,OPKIND,STORE)                          \
 215 static void MC_ ## OPNAME ## _ ## SUFF ## _ ## SIZE ## _alpha           \
 216         (uint8_t *restrict block, const uint8_t *restrict pixels,       \
 217          int line_size, int h)                                          \
 218 {                                                                       \
 219     if ((uint64_t) pixels & 0x7) {                                      \
 220         OPKIND (uldq, ULOAD16, STORE);                                  \
 221     } else {                                                            \
 222         OPKIND (ldq, ALOAD16, STORE);                                   \
 223     }                                                                   \
 224 }
 225
 226 #define PIXOP(OPNAME,STORE)                     \
 227     MAKE_OP (OPNAME, 8,  o,  OP8,      STORE);  \
 228     MAKE_OP (OPNAME, 8,  x,  OP8_X2,   STORE);  \
 229     MAKE_OP (OPNAME, 8,  y,  OP8_Y2,   STORE);  \
 230     MAKE_OP (OPNAME, 8,  xy, OP8_XY2,  STORE);  \
 231     MAKE_OP (OPNAME, 16, o,  OP16,     STORE);  \
 232     MAKE_OP (OPNAME, 16, x,  OP16_X2,  STORE);  \
 233     MAKE_OP (OPNAME, 16, y,  OP16_Y2,  STORE);  \
 234     MAKE_OP (OPNAME, 16, xy, OP16_XY2, STORE);
 235
 236 #define STORE(l,b) stq (l, b)
 237 PIXOP (put, STORE);
 238 #undef STORE
 239 #define STORE(l,b) stq (avg2 (l, ldq (b)), b);
 240 PIXOP (avg, STORE);
 241
 242 mpeg2_mc_t mpeg2_mc_alpha = {
 243     { MC_put_o_16_alpha, MC_put_x_16_alpha,
 244       MC_put_y_16_alpha, MC_put_xy_16_alpha,
 245       MC_put_o_8_alpha, MC_put_x_8_alpha,
 246       MC_put_y_8_alpha, MC_put_xy_8_alpha },
 247     { MC_avg_o_16_alpha, MC_avg_x_16_alpha,
 248       MC_avg_y_16_alpha, MC_avg_xy_16_alpha,
 249       MC_avg_o_8_alpha, MC_avg_x_8_alpha,
 250       MC_avg_y_8_alpha, MC_avg_xy_8_alpha }
 251 };
 252
 253 #endif