libswscale/yuv2rgb_altivec.c

   1 /*
   2   marc.hoffman@analog.com    March 8, 2004
   3
   4   AltiVec acceleration for colorspace conversion revision 0.2
   5
   6   convert I420 YV12 to RGB in various formats,
   7     it rejects images that are not in 420 formats
   8     it rejects images that don't have widths of multiples of 16
   9     it rejects images that don't have heights of multiples of 2
  10   reject defers to C simulation codes.
  11
  12   lots of optimizations to be done here
  13
  14   1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
  15      so we currently use max min to clip
  16
  17   2. the inefficient use of chroma loading needs a bit of brushing up
  18
  19   3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
  20
  21
  22   MODIFIED to calculate coeffs from currently selected color space.
  23   MODIFIED core to be a macro which you spec the output format.
  24   ADDED UYVY conversion which is never called due to something in SWSCALE.
  25   CORRECTED algorithm selection to be strict on input formats.
  26   ADDED runtime detection of altivec.
  27
  28   ADDED altivec_yuv2packedX vertical scl + RGB converter
  29
  30   March 27,2004
  31   PERFORMANCE ANALYSIS
  32
  33   The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
  34   The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
  35
  36   720*480*30  ~10MPS
  37
  38   so we have roughly 10clocks per pixel this is too high something has to be wrong.
  39
  40   OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
  41
  42   OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
  43   guaranteed to have the input video frame it was just decompressed so
  44   it probably resides in L1 caches.  However we are creating the
  45   output video stream this needs to use the DSTST instruction to
  46   optimize for the cache.  We couple this with the fact that we are
  47   not going to be visiting the input buffer again so we mark it Least
  48   Recently Used.  This shaves 25% of the processor cycles off.
  49
  50   Now MEMCPY is the largest mips consumer in the system, probably due
  51   to the inefficient X11 stuff.
  52
  53   GL libraries seem to be very slow on this machine 1.33Ghz PB running
  54   Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
  55   a versioning issue; however, I have libGL.1.2.dylib for both
  56   machines. ((We need to figure this out now))
  57
  58   GL2 libraries work now with patch for RGB32
  59
  60   NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
  61
  62   Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
  63 */
  64
  65 /*
  66  * This file is part of FFmpeg.
  67  *
  68  * FFmpeg is free software; you can redistribute it and/or modify
  69  * it under the terms of the GNU General Public License as published by
  70  * the Free Software Foundation; either version 2 of the License, or
  71  * (at your option) any later version.
  72  *
  73  * FFmpeg is distributed in the hope that it will be useful,
  74  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  75  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  76  * GNU General Public License for more details.
  77  *
  78  * You should have received a copy of the GNU General Public License
  79  * along with FFmpeg; if not, write to the Free Software
  80  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  81  */
  82
  83 #include <stdio.h>
  84 #include <stdlib.h>
  85 #include <string.h>
  86 #include <inttypes.h>
  87 #include <assert.h>
  88 #include "config.h"
  89 #ifdef HAVE_MALLOC_H
  90 #include <malloc.h>
  91 #endif
  92 #include "rgb2rgb.h"
  93 #include "swscale.h"
  94 #include "swscale_internal.h"
  95
  96 #undef PROFILE_THE_BEAST
  97 #undef INC_SCALING
  98
  99 typedef unsigned char ubyte;
 100 typedef signed char   sbyte;
 101
 102
 103 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
 104    homogeneous vector registers x0,x1,x2 are interleaved with the
 105    following technique:
 106
 107       o0 = vec_mergeh (x0,x1);
 108       o1 = vec_perm (o0, x2, perm_rgb_0);
 109       o2 = vec_perm (o0, x2, perm_rgb_1);
 110       o3 = vec_mergel (x0,x1);
 111       o4 = vec_perm (o3,o2,perm_rgb_2);
 112       o5 = vec_perm (o3,o2,perm_rgb_3);
 113
 114   perm_rgb_0:   o0(RG).h v1(B) --> o1*
 115               0   1  2   3   4
 116              rgbr|gbrg|brgb|rgbr
 117              0010 0100 1001 0010
 118              0102 3145 2673 894A
 119
 120   perm_rgb_1:   o0(RG).h v1(B) --> o2
 121               0   1  2   3   4
 122              gbrg|brgb|bbbb|bbbb
 123              0100 1001 1111 1111
 124              B5CD 6EF7 89AB CDEF
 125
 126   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
 127               0   1  2   3   4
 128              gbrg|brgb|rgbr|gbrg
 129              1111 1111 0010 0100
 130              89AB CDEF 0182 3945
 131
 132   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
 133               0   1  2   3   4
 134              brgb|rgbr|gbrg|brgb
 135              1001 0010 0100 1001
 136              a67b 89cA BdCD eEFf
 137
        Reading the tables below: in a vec_perm control vector an index
        0x00-0x0f selects a byte of the first operand and 0x10-0x1f a byte
        of the second operand.  The starred results (o1, o4, o5) are the
        three output vectors that together hold the 48 interleaved bytes;
        o2 is only an intermediate that collects the tail of the first half
        plus the upper eight bytes of x2 for the last two permutes.
 138 */
 139 static
 140 const vector unsigned char
 141   perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
 142                                                0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
 143   perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
 144                                                0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
 145   perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
 146                                                0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
 147   perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
 148                                                0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
 149
     /* vec_merge3(x2,x1,x0,y0,y1,y2)
        Interleave three 16-byte planar vectors into three 16-byte packed
        vectors y0,y1,y2 (48 bytes, 16 three-byte pixels) using the
        perm_rgb_0..3 tables.

        Mind the parameter order: the macro's THIRD argument (x0) supplies the
        first byte of every triplet, the second argument (x1) the middle byte
        and the FIRST argument (x2) the last byte.

        y0,y1,y2 must be assignable lvalues of the same vector type as the
        inputs; the temporaries are declared with typeof(x0).  Arguments are
        expanded more than once, so they should be free of side effects.
     */
 150 #define vec_merge3(x2,x1,x0,y0,y1,y2)       \
 151 do {                                        \
 152     typeof(x0) o0,o2,o3;                    \
 153         o0 = vec_mergeh (x0,x1);            \
 154         y0 = vec_perm (o0, x2, perm_rgb_0); \
 155         o2 = vec_perm (o0, x2, perm_rgb_1); \
 156         o3 = vec_mergel (x0,x1);            \
 157         y1 = vec_perm (o3,o2,perm_rgb_2);   \
 158         y2 = vec_perm (o3,o2,perm_rgb_3);   \
 159 } while(0)
 160
     /* vec_mstbgr24(x0,x1,x2,ptr)
        Interleave the three planar byte vectors x0,x1,x2 with vec_merge3 and
        store the resulting 48 bytes (16 pixels) through ptr with three
        vec_st calls, post-incrementing ptr after each one (ptr ends up three
        elements further on).

        Byte order in memory for each pixel is x2,x1,x0, because vec_merge3
        emits its third argument first.  Called as (R,G,B,ptr) this writes
        B,G,R.

        ptr must be a modifiable lvalue.  The expansion deliberately has no
        trailing semicolon, so "vec_mstbgr24(...);" is exactly one statement
        and is safe inside an unbraced if/else.
     */
 161 #define vec_mstbgr24(x0,x1,x2,ptr)      \
 162 do {                                    \
 163     typeof(x0) _0,_1,_2;                \
 164     vec_merge3 (x0,x1,x2,_0,_1,_2);     \
 165     vec_st (_0, 0, ptr++);              \
 166     vec_st (_1, 0, ptr++);              \
 167     vec_st (_2, 0, ptr++);              \
 168 }  while (0)
 169
     /* vec_mstrgb24(x0,x1,x2,ptr)
        Interleave the three planar byte vectors x0,x1,x2 with vec_merge3 and
        store the resulting 48 bytes (16 pixels) through ptr with three
        vec_st calls, post-incrementing ptr after each one (ptr ends up three
        elements further on).

        The arguments are handed to vec_merge3 in reverse, so the byte order
        in memory for each pixel is x0,x1,x2.  Called as (R,G,B,ptr) this
        writes R,G,B.

        ptr must be a modifiable lvalue.  The expansion deliberately has no
        trailing semicolon, so "vec_mstrgb24(...);" is exactly one statement
        and is safe inside an unbraced if/else.
     */
 170 #define vec_mstrgb24(x0,x1,x2,ptr)      \
 171 do {                                    \
 172     typeof(x0) _0,_1,_2;                \
 173     vec_merge3 (x2,x1,x0,_0,_1,_2);     \
 174     vec_st (_0, 0, ptr++);              \
 175     vec_st (_1, 0, ptr++);              \
 176     vec_st (_2, 0, ptr++);              \
 177 }  while (0)
 178
 179 /* pack the pixels in rgb0 format
 180    msb R
 181    lsb 0

        vec_mstrgb32(T,x0,x1,x2,x3,ptr)
        Generic 4-channel store: interleaves the four planar byte vectors
        x0..x3 (all of vector type T) into 16 four-byte pixels and writes them
        with four vec_st calls at byte offsets 0, 16, 32 and 48 from ptr.
        Within each pixel the bytes appear in memory in argument order
        x0,x1,x2,x3.

        The byte pairs (x0,x1) and (x2,x3) are merged first, then the two
        results are merged again as 16-bit lanes; the "h" merges yield pixels
        0-7 and the "l" merges pixels 8-15.

        ptr is advanced by 4 elements afterwards, so it must be a modifiable
        lvalue.  The expansion deliberately has no trailing semicolon, so
        "vec_mstrgb32(...);" is exactly one statement and is safe inside an
        unbraced if/else.
 182 */
 183 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
 184 do {                                                                          \
 185     T _0,_1,_2,_3;                                                            \
 186     _0 = vec_mergeh (x0,x1);                                                  \
 187     _1 = vec_mergeh (x2,x3);                                                  \
 188     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
 189     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
 190     vec_st (_2, 0*16, (T *)ptr);                                              \
 191     vec_st (_3, 1*16, (T *)ptr);                                              \
 192     _0 = vec_mergel (x0,x1);                                                  \
 193     _1 = vec_mergel (x2,x3);                                                  \
 194     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
 195     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
 196     vec_st (_2, 2*16, (T *)ptr);                                              \
 197     vec_st (_3, 3*16, (T *)ptr);                                              \
 198     ptr += 4;                                                                 \
 199 }  while (0)
 200
 201 /*
 202
 203   | 1     0       1.4021   | | Y |
 204   | 1    -0.3441 -0.7142   |x| Cb|
 205   | 1     1.7718  0        | | Cr|
 206
 207
 208   Y:      [-128 127]
 209   Cb/Cr : [-128 127]
 210
 211   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
 212
 213 */
 214
 215
 216
 217
     /* vec_unh(x) / vec_unl(x)
        Widen eight bytes of x to a vector of eight 16-bit values by pairing
        every selected byte with a zero byte (index 0x10 picks byte 0 of the
        all-zero second operand).  vec_unh takes bytes 0-7 of x, vec_unl
        takes bytes 8-15.  The zero byte is placed first in each pair, i.e.
        in the high-order half on a big-endian target, so the bytes are
        zero-extended rather than sign-extended.
        NOTE(review): relies on big-endian element order - confirm before
        reusing on a little-endian target.
     */
 218 #define vec_unh(x) \
 219     (vector signed short) \
 220         vec_perm(x,(typeof(x))AVV(0),\
 221                  (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
 222                                            0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
 223 #define vec_unl(x) \
 224     (vector signed short) \
 225         vec_perm(x,(typeof(x))AVV(0),\
 226                  (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
 227                                            0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
 228
     /* vec_clip_s16(x)
        Clamp every signed 16-bit element of x to the range [16, 235]:
        vec_min caps the value at 235, then vec_max raises it to at least 16.
     */
 229 #define vec_clip_s16(x) \
 230     vec_max (vec_min (x, (vector signed short)AVV(235,235,235,235,235,235,235,235)),\
 231                          (vector signed short)AVV( 16, 16, 16, 16, 16, 16, 16, 16))
 232
     /* vec_packclp(x,y)
        Pack two vectors of eight signed 16-bit values into one vector of
        sixteen unsigned bytes (x supplies the first eight, y the last eight).
        Negative inputs are first raised to 0 with vec_max; the result is then
        reinterpreted as unsigned and narrowed with vec_packs, which takes
        care of the upper bound (see the file header: "utilize vec_max and
        vec_packs removing the need for vec_min").
     */
 233 #define vec_packclp(x,y) \
 234     (vector unsigned char)vec_packs \
 235         ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \
 236          (vector unsigned short)vec_max (y,(vector signed short) AVV(0)))
 237
 238 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
 239
 240
     /* Convert eight pixels from Y/U/V to R/G/B using the coefficients stored
        in the context.

        c       supplies the vectors CY, OY, CBU, CRV, CGU, CGV and the shift
                count CSHIFT.
        Y,U,V   eight 16-bit samples each; U and V are expected to still
                carry the +128 bias, which is removed here.
        R,G,B   receive the eight 16-bit results; they are NOT clipped to
                0..255 (the caller does that with vec_packclp).

        Every vec_mradds(a,b,c) step follows the pattern spelled out in the
        inline comments: ((a*b + 0x4000) >> 15) + c.
     */
 241 static inline void cvtyuvtoRGB (SwsContext *c,
 242                                 vector signed short Y, vector signed short U, vector signed short V,
 243                                 vector signed short *R, vector signed short *G, vector signed short *B)
 244 {
 245     vector signed   short vx,ux,uvx;
 246
         /* scale luma by CY and add the offset OY */
 247     Y = vec_mradds (Y, c->CY, c->OY);
         /* re-centre chroma around zero */
 248     U  = vec_sub (U,(vector signed short)
 249                     vec_splat((vector signed short)AVV(128),0));
 250     V  = vec_sub (V,(vector signed short)
 251                     vec_splat((vector signed short)AVV(128),0));
 252
 253     //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
 254     ux = vec_sl (U, c->CSHIFT);
         /* blue = scaled luma + U contribution */
 255     *B = vec_mradds (ux, c->CBU, Y);
 256
 257     // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
 258     vx = vec_sl (V, c->CSHIFT);
         /* red = scaled luma + V contribution */
 259     *R = vec_mradds (vx, c->CRV, Y);
 260
 261     // uvx = ((CGU*u) + (CGV*v))>>15;
         /* green = scaled luma + U term + V term; U and V are not
            pre-shifted by CSHIFT here, unlike the blue and red terms */
 262     uvx = vec_mradds (U, c->CGU, Y);
 263     *G  = vec_mradds (V, c->CGV, uvx);
 264 }
 265
 266
 267 /*
 268   ------------------------------------------------------------------------------
 269   CS converters
 270   ------------------------------------------------------------------------------
 271 */
 272
 273
     /* DEFCSP420_CVT(name, out_pixels)
        Expands to

            static int altivec_<name> (SwsContext *c, unsigned char **in,
                                       int *instrides, int srcSliceY,
                                       int srcSliceH, unsigned char **oplanes,
                                       int *outstrides)

        which converts planar Y/U/V input (in[0], in[1], in[2]) into packed
        pixels written to oplanes[0], starting at row srcSliceY.  out_pixels
        is the store macro (out_rgba, out_bgr24, ...) invoked as
        out_pixels(R,G,B,ptr) for every group of 16 pixels; it is expected to
        advance ptr itself.  The function returns srcSliceH.

        Structure:
          - The outer loop runs h/2 times and handles two luma rows per
            iteration (y1i, and y2i one input stride below it); both rows
            share the same chroma samples.  oute/outo are the matching
            output rows.
          - At the top of each outer iteration vec_dstst issues a
            data-stream-touch-for-store hint for each output row (see the
            cache notes in the file header).
          - The inner loop runs w/16 times.  Each pass loads 16 luma bytes
            per row and 16 bytes each of U and V, of which only the first
            eight are used (vec_unpackh).  Every load is done as two
            consecutive vectors combined through vec_lvsl + vec_perm, so the
            source pointers need not be 16-byte aligned, but the vector
            following the current one is always read.
          - Chroma is re-centred by subtracting 128 in 8-bit arithmetic, then
            sign-extended to 16 bits.  Luma is zero-extended (vec_unh/vec_unl)
            and scaled with CY/OY.
          - The per-chroma-sample terms ux, vx, uvx are computed once and
            duplicated with vec_mergeh/vec_mergel so that each chroma sample
            covers two horizontally adjacent pixels; the same terms are then
            added to both luma rows.
          - Results are clipped and packed to bytes with vec_packclp before
            being handed to out_pixels.

        NOTE(review): w/16 and h/2 truncate, so trailing columns or an odd
        final row would be left unconverted; the file header states that
        such images are rejected before this code is reached - confirm.
        NOTE(review): after each row pair oute/outo have already been moved
        forward by out_pixels and are then advanced by outstrides[0]>>4
        vectors.  That lands on the next row pair only if outstrides[0]
        equals the number of bytes written per row and is a multiple of 16 -
        confirm against the callers.
     */
 274 #define DEFCSP420_CVT(name,out_pixels)                                  \
 275 static int altivec_##name (SwsContext *c,                               \
 276                            unsigned char **in, int *instrides,          \
 277                            int srcSliceY,        int srcSliceH,         \
 278                            unsigned char **oplanes, int *outstrides)    \
 279 {                                                                       \
 280     int w = c->srcW;                                                    \
 281     int h = srcSliceH;                                                  \
 282     int i,j;                                                            \
 283     int instrides_scl[3];                                               \
 284     vector unsigned char y0,y1;                                         \
 285                                                                         \
 286     vector signed char  u,v;                                            \
 287                                                                         \
 288     vector signed short Y0,Y1,Y2,Y3;                                    \
 289     vector signed short U,V;                                            \
 290     vector signed short vx,ux,uvx;                                      \
 291     vector signed short vx0,ux0,uvx0;                                   \
 292     vector signed short vx1,ux1,uvx1;                                   \
 293     vector signed short R0,G0,B0;                                       \
 294     vector signed short R1,G1,B1;                                       \
 295     vector unsigned char R,G,B;                                         \
 296                                                                         \
 297     vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
 298     vector unsigned char align_perm;                                    \
 299                                                                         \
 300     vector signed short                                                 \
 301         lCY  = c->CY,                                                   \
 302         lOY  = c->OY,                                                   \
 303         lCRV = c->CRV,                                                  \
 304         lCBU = c->CBU,                                                  \
 305         lCGU = c->CGU,                                                  \
 306         lCGV = c->CGV;                                                  \
 307                                                                         \
 308     vector unsigned short lCSHIFT = c->CSHIFT;                          \
 309                                                                         \
 310     ubyte *y1i   = in[0];                                               \
 311     ubyte *y2i   = in[0]+instrides[0];                                  \
 312     ubyte *ui    = in[1];                                               \
 313     ubyte *vi    = in[2];                                               \
 314                                                                         \
 315     vector unsigned char *oute                                          \
 316         = (vector unsigned char *)                                      \
 317             (oplanes[0]+srcSliceY*outstrides[0]);                       \
 318     vector unsigned char *outo                                          \
 319         = (vector unsigned char *)                                      \
 320             (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
 321                                                                         \
 322                                                                         \
 323     instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
 324     instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
 325     instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
 326                                                                         \
 327                                                                         \
 328     for (i=0;i<h/2;i++) {                                               \
 329         vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
 330         vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
 331                                                                         \
 332         for (j=0;j<w/16;j++) {                                          \
 333                                                                         \
 334             y1ivP = (vector unsigned char *)y1i;                        \
 335             y2ivP = (vector unsigned char *)y2i;                        \
 336             uivP  = (vector unsigned char *)ui;                         \
 337             vivP  = (vector unsigned char *)vi;                         \
 338                                                                         \
 339             align_perm = vec_lvsl (0, y1i);                             \
 340             y0 = (vector unsigned char)                                 \
 341                  vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
 342                                                                         \
 343             align_perm = vec_lvsl (0, y2i);                             \
 344             y1 = (vector unsigned char)                                 \
 345                  vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
 346                                                                         \
 347             align_perm = vec_lvsl (0, ui);                              \
 348             u = (vector signed char)                                    \
 349                 vec_perm (uivP[0], uivP[1], align_perm);                \
 350                                                                         \
 351             align_perm = vec_lvsl (0, vi);                              \
 352             v = (vector signed char)                                    \
 353                 vec_perm (vivP[0], vivP[1], align_perm);                \
 354                                                                         \
 355             u  = (vector signed char)                                   \
 356                  vec_sub (u,(vector signed char)                        \
 357                           vec_splat((vector signed char)AVV(128),0));   \
 358             v  = (vector signed char)                                   \
 359                  vec_sub (v,(vector signed char)                        \
 360                           vec_splat((vector signed char)AVV(128),0));   \
 361                                                                         \
 362             U  = vec_unpackh (u);                                       \
 363             V  = vec_unpackh (v);                                       \
 364                                                                         \
 365                                                                         \
 366             Y0 = vec_unh (y0);                                          \
 367             Y1 = vec_unl (y0);                                          \
 368             Y2 = vec_unh (y1);                                          \
 369             Y3 = vec_unl (y1);                                          \
 370                                                                         \
 371             Y0 = vec_mradds (Y0, lCY, lOY);                             \
 372             Y1 = vec_mradds (Y1, lCY, lOY);                             \
 373             Y2 = vec_mradds (Y2, lCY, lOY);                             \
 374             Y3 = vec_mradds (Y3, lCY, lOY);                             \
 375                                                                         \
 376             /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
 377             ux = vec_sl (U, lCSHIFT);                                   \
 378             ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));    \
 379             ux0  = vec_mergeh (ux,ux);                                  \
 380             ux1  = vec_mergel (ux,ux);                                  \
 381                                                                         \
 382             /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
 383             vx = vec_sl (V, lCSHIFT);                                   \
 384             vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));    \
 385             vx0  = vec_mergeh (vx,vx);                                  \
 386             vx1  = vec_mergel (vx,vx);                                  \
 387                                                                         \
 388             /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
 389             uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));    \
 390             uvx = vec_mradds (V, lCGV, uvx);                            \
 391             uvx0 = vec_mergeh (uvx,uvx);                                \
 392             uvx1 = vec_mergel (uvx,uvx);                                \
 393                                                                         \
 394             R0 = vec_add (Y0,vx0);                                      \
 395             G0 = vec_add (Y0,uvx0);                                     \
 396             B0 = vec_add (Y0,ux0);                                      \
 397             R1 = vec_add (Y1,vx1);                                      \
 398             G1 = vec_add (Y1,uvx1);                                     \
 399             B1 = vec_add (Y1,ux1);                                      \
 400                                                                         \
 401             R  = vec_packclp (R0,R1);                                   \
 402             G  = vec_packclp (G0,G1);                                   \
 403             B  = vec_packclp (B0,B1);                                   \
 404                                                                         \
 405             out_pixels(R,G,B,oute);                                     \
 406                                                                         \
 407             R0 = vec_add (Y2,vx0);                                      \
 408             G0 = vec_add (Y2,uvx0);                                     \
 409             B0 = vec_add (Y2,ux0);                                      \
 410             R1 = vec_add (Y3,vx1);                                      \
 411             G1 = vec_add (Y3,uvx1);                                     \
 412             B1 = vec_add (Y3,ux1);                                      \
 413             R  = vec_packclp (R0,R1);                                   \
 414             G  = vec_packclp (G0,G1);                                   \
 415             B  = vec_packclp (B0,B1);                                   \
 416                                                                         \
 417                                                                         \
 418             out_pixels(R,G,B,outo);                                     \
 419                                                                         \
 420             y1i  += 16;                                                 \
 421             y2i  += 16;                                                 \
 422             ui   += 8;                                                  \
 423             vi   += 8;                                                  \
 424                                                                         \
 425         }                                                               \
 426                                                                         \
 427         outo  += (outstrides[0])>>4;                                    \
 428         oute  += (outstrides[0])>>4;                                    \
 429                                                                         \
 430         ui    += instrides_scl[1];                                      \
 431         vi    += instrides_scl[2];                                      \
 432         y1i   += instrides_scl[0];                                      \
 433         y2i   += instrides_scl[0];                                      \
 434     }                                                                   \
 435     return srcSliceH;                                                   \
 436 }
 437
 438
     /* Pixel store adapters.  Each takes the packed channel vectors in the
        fixed order (a,b,c) = (R,G,B) plus the output pointer and forwards
        them to the matching store macro.  Resulting byte order in memory,
        per pixel:

            out_abgr   0,B,G,R        out_argb   0,R,G,B
            out_bgra   B,G,R,0        out_rgba   R,G,B,0
            out_rgb24  R,G,B          out_bgr24  B,G,R

        In the 32-bit variants the fourth channel is always written as 0.
     */
 439 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr)
 440 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr)
 441 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr)
 442 #define out_argb(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr)
 443 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
 444 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
 445
     /* Instantiate one planar-YUV converter per packed output format:
        altivec_yuv2_abgr, altivec_yuv2_bgra, altivec_yuv2_rgba,
        altivec_yuv2_argb, altivec_yuv2_rgb24 and altivec_yuv2_bgr24.

        A hand-expanded alternative for the bgra case used to sit here behind
        "#if 1 ... #else".  It was never compiled, defined a differently
        named function (altivec_yuv2_bgra32), stored pixels with out_argb and
        located the second luma row with in[0]+w instead of the input
        stride, so it has been dropped in favour of the macro instance.
     */
 446 DEFCSP420_CVT (yuv2_abgr, out_abgr)
 448 DEFCSP420_CVT (yuv2_bgra, out_bgra)
 600 DEFCSP420_CVT (yuv2_rgba, out_rgba)
 601 DEFCSP420_CVT (yuv2_argb, out_argb)
 602 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
 603 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
 604
 605
 606 // uyvy|uyvy|uyvy|uyvy
 607 // 0123 4567 89ab cdef
     // vec_perm control vectors that split one 16-byte vector of packed UYVY
     // (8 pixels) into 16-bit lanes.  Index 0x10 selects byte 0 of the second
     // vec_perm operand, which the caller passes as an all-zero vector, so
     // every selected sample byte is paired with a leading zero byte.
     //   demux_u: U bytes 0,4,8,c, each used for two adjacent lanes
     //   demux_v: V bytes 2,6,a,e, each used for two adjacent lanes
     //   demux_y: Y bytes 1,3,5,...,f, one per lane
 608 static
 609 const vector unsigned char
 610     demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
 611                                               0x10,0x04,0x10,0x04,
 612                                               0x10,0x08,0x10,0x08,
 613                                               0x10,0x0c,0x10,0x0c),
 614     demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
 615                                               0x10,0x06,0x10,0x06,
 616                                               0x10,0x0A,0x10,0x0A,
 617                                               0x10,0x0E,0x10,0x0E),
 618     demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
 619                                               0x10,0x05,0x10,0x07,
 620                                               0x10,0x09,0x10,0x0B,
 621                                               0x10,0x0D,0x10,0x0F);
 622
 623 /*
 624   this is so I can play live CCIR raw video

       Converts packed UYVY in in[0] to 32-bit pixels in oplanes[0], starting
       at output row srcSliceY, and returns srcSliceH.

       Each inner iteration consumes 32 input bytes (two vec_ld loads of 8
       pixels each), splits them into Y/U/V lanes with demux_y/u/v, converts
       both halves with cvtyuvtoRGB, clips and packs the 16 pixels with
       vec_packclp and stores them with out_rgba (bytes R,G,B,0 per pixel).

       NOTE(review): instrides is never read and neither img nor out is
       adjusted at the end of a row, so rows are treated as contiguous in
       both buffers - confirm that callers guarantee this.
       NOTE(review): pixels beyond the last multiple of 16 in a row are not
       converted (w/16 truncates).
 625 */
 626 static int altivec_uyvy_rgb32 (SwsContext *c,
 627                                unsigned char **in, int *instrides,
 628                                int srcSliceY,        int srcSliceH,
 629                                unsigned char **oplanes, int *outstrides)
 630 {
 631     int w = c->srcW;
 632     int h = srcSliceH;
 633     int i,j;
 634     vector unsigned char uyvy;
 635     vector signed   short Y,U,V;
 636     vector signed   short R0,G0,B0,R1,G1,B1;
 637     vector unsigned char  R,G,B;
 638     vector unsigned char *out;
 639     ubyte *img;
 640
 641     img = in[0];
 642     out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
 643
 644     for (i=0;i<h;i++) {
 645         for (j=0;j<w/16;j++) {
                 /* first 8 pixels of the group */
 646             uyvy = vec_ld (0, img);
 647             U = (vector signed short)
 648                 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
 649
 650             V = (vector signed short)
 651                 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
 652
 653             Y = (vector signed short)
 654                 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
 655
 656             cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
 657
                 /* second 8 pixels of the group */
 658             uyvy = vec_ld (16, img);
 659             U = (vector signed short)
 660                 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
 661
 662             V = (vector signed short)
 663                 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
 664
 665             Y = (vector signed short)
 666                 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
 667
 668             cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
 669
                 /* clip to bytes and combine both halves into 16 pixels */
 670             R  = vec_packclp (R0,R1);
 671             G  = vec_packclp (G0,G1);
 672             B  = vec_packclp (B0,B1);
 673
 674             //      vec_mstbgr24 (R,G,B, out);
                 /* writes 64 bytes and advances out by four vectors */
 675             out_rgba (R,G,B,out);
 676
 677             img += 32;
 678         }
 679     }
 680     return srcSliceH;
 681 }
 682
 683
 684
 685 /* Ok currently the acceleration routine only supports
 686    inputs of widths a multiple of 16
 687    and heights a multiple 2
 688
 689    So we just fall back to the C codes for this.
 690 */
 691 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
 692 {
 693     if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
 694         return NULL;
 695
 696     /*
 697       and this seems not to matter too much I tried a bunch of
 698       videos with abnormal widths and mplayer crashes else where.
 699       mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
 700       boom with X11 bad match.
 701
 702     */
 703     if ((c->srcW & 0xf) != 0)    return NULL;
 704
 705     switch (c->srcFormat) {
 706     case PIX_FMT_YUV410P:
 707     case PIX_FMT_YUV420P:
 708     /*case IMGFMT_CLPL:        ??? */
 709     case PIX_FMT_GRAY8:
 710     case PIX_FMT_NV12:
 711     case PIX_FMT_NV21:
 712         if ((c->srcH & 0x1) != 0)
 713             return NULL;
 714
 715         switch(c->dstFormat){
 716         case PIX_FMT_RGB24:
 717             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
 718             return altivec_yuv2_rgb24;
 719         case PIX_FMT_BGR24:
 720             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
 721             return altivec_yuv2_bgr24;
 722         case PIX_FMT_ARGB:
 723             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
 724             return altivec_yuv2_argb;
 725         case PIX_FMT_ABGR:
 726             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
 727             return altivec_yuv2_abgr;
 728         case PIX_FMT_RGBA:
 729             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
 730             return altivec_yuv2_rgba;
 731         case PIX_FMT_BGRA:
 732             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
 733             return altivec_yuv2_bgra;
 734         default: return NULL;
 735         }
 736         break;
 737
 738     case PIX_FMT_UYVY422:
 739         switch(c->dstFormat){
 740         case PIX_FMT_BGR32:
 741             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
 742             return altivec_uyvy_rgb32;
 743         default: return NULL;
 744         }
 745         break;
 746
 747     }
 748     return NULL;
 749 }
 750
 751 static uint16_t roundToInt16(int64_t f){
 752     int r= (f + (1<<15))>>16;
 753          if (r<-0x7FFF) return 0x8000;
 754     else if (r> 0x7FFF) return 0x7FFF;
 755     else                return r;
 756 }
 757
 758 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
 759 {
 760     union {
 761         signed short tmp[8] __attribute__ ((aligned(16)));
 762         vector signed short vec;
 763     } buf;
 764
 765     buf.tmp[0] =  ( (0xffffLL) * contrast>>8 )>>9;                      //cy
 766     buf.tmp[1] =  -256*brightness;                                      //oy
 767     buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
 768     buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
 769     buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
 770     buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
 771
 772
 773     c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
 774     c->CY   = vec_splat ((vector signed short)buf.vec, 0);
 775     c->OY   = vec_splat ((vector signed short)buf.vec, 1);
 776     c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
 777     c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
 778     c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
 779     c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
 780 #if 0
 781     {
 782     int i;
 783     char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
 784     for (i=0; i<6; i++)
 785         printf("%s %d ", v[i],buf.tmp[i] );
 786         printf("\n");
 787     }
 788 #endif
 789     return;
 790 }
 791
 792
 793 void
 794 altivec_yuv2packedX (SwsContext *c,
 795                      int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
 796                      int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 797                      uint8_t *dest, int dstW, int dstY)
 798 {
 799     int i,j;
 800     vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
 801     vector signed short R0,G0,B0,R1,G1,B1;
 802
 803     vector unsigned char R,G,B;
 804     vector unsigned char *out,*nout;
 805
 806     vector signed short   RND = vec_splat_s16(1<<3);
 807     vector unsigned short SCL = vec_splat_u16(4);
 808     unsigned long scratch[16] __attribute__ ((aligned (16)));
 809
 810     vector signed short *YCoeffs, *CCoeffs;
 811
 812     YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
 813     CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
 814
 815     out = (vector unsigned char *)dest;
 816
 817     for (i=0; i<dstW; i+=16){
 818         Y0 = RND;
 819         Y1 = RND;
 820         /* extract 16 coeffs from lumSrc */
 821         for (j=0; j<lumFilterSize; j++) {
 822             X0 = vec_ld (0,  &lumSrc[j][i]);
 823             X1 = vec_ld (16, &lumSrc[j][i]);
 824             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
 825             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
 826         }
 827
 828         U = RND;
 829         V = RND;
 830         /* extract 8 coeffs from U,V */
 831         for (j=0; j<chrFilterSize; j++) {
 832             X  = vec_ld (0, &chrSrc[j][i/2]);
 833             U  = vec_mradds (X, CCoeffs[j], U);
 834             X  = vec_ld (0, &chrSrc[j][i/2+2048]);
 835             V  = vec_mradds (X, CCoeffs[j], V);
 836         }
 837
 838         /* scale and clip signals */
 839         Y0 = vec_sra (Y0, SCL);
 840         Y1 = vec_sra (Y1, SCL);
 841         U  = vec_sra (U,  SCL);
 842         V  = vec_sra (V,  SCL);
 843
 844         Y0 = vec_clip_s16 (Y0);
 845         Y1 = vec_clip_s16 (Y1);
 846         U  = vec_clip_s16 (U);
 847         V  = vec_clip_s16 (V);
 848
 849         /* now we have
 850           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
 851           U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
 852
 853           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
 854           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
 855           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
 856         */
 857
 858         U0 = vec_mergeh (U,U);
 859         V0 = vec_mergeh (V,V);
 860
 861         U1 = vec_mergel (U,U);
 862         V1 = vec_mergel (V,V);
 863
 864         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
 865         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
 866
 867         R  = vec_packclp (R0,R1);
 868         G  = vec_packclp (G0,G1);
 869         B  = vec_packclp (B0,B1);
 870
 871         switch(c->dstFormat) {
 872             case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
 873             case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
 874             case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
 875             case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
 876             case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
 877             case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
 878             default:
 879             {
 880                 /* If this is reached, the caller should have called yuv2packedXinC
 881                    instead. */
 882                 static int printed_error_message;
 883                 if (!printed_error_message) {
 884                     av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
 885                            sws_format_name(c->dstFormat));
 886                     printed_error_message=1;
 887                 }
 888                 return;
 889             }
 890         }
 891     }
 892
 893     if (i < dstW) {
 894         i -= 16;
 895
 896         Y0 = RND;
 897         Y1 = RND;
 898         /* extract 16 coeffs from lumSrc */
 899         for (j=0; j<lumFilterSize; j++) {
 900             X0 = vec_ld (0,  &lumSrc[j][i]);
 901             X1 = vec_ld (16, &lumSrc[j][i]);
 902             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
 903             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
 904         }
 905
 906         U = RND;
 907         V = RND;
 908         /* extract 8 coeffs from U,V */
 909         for (j=0; j<chrFilterSize; j++) {
 910             X  = vec_ld (0, &chrSrc[j][i/2]);
 911             U  = vec_mradds (X, CCoeffs[j], U);
 912             X  = vec_ld (0, &chrSrc[j][i/2+2048]);
 913             V  = vec_mradds (X, CCoeffs[j], V);
 914         }
 915
 916         /* scale and clip signals */
 917         Y0 = vec_sra (Y0, SCL);
 918         Y1 = vec_sra (Y1, SCL);
 919         U  = vec_sra (U,  SCL);
 920         V  = vec_sra (V,  SCL);
 921
 922         Y0 = vec_clip_s16 (Y0);
 923         Y1 = vec_clip_s16 (Y1);
 924         U  = vec_clip_s16 (U);
 925         V  = vec_clip_s16 (V);
 926
 927         /* now we have
 928            Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
 929            U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
 930
 931            Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
 932            U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
 933            V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
 934         */
 935
 936         U0 = vec_mergeh (U,U);
 937         V0 = vec_mergeh (V,V);
 938
 939         U1 = vec_mergel (U,U);
 940         V1 = vec_mergel (V,V);
 941
 942         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
 943         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
 944
 945         R  = vec_packclp (R0,R1);
 946         G  = vec_packclp (G0,G1);
 947         B  = vec_packclp (B0,B1);
 948
 949         nout = (vector unsigned char *)scratch;
 950         switch(c->dstFormat) {
 951             case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
 952             case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
 953             case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
 954             case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
 955             case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
 956             case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
 957             default:
 958                 /* Unreachable, I think. */
 959                 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
 960                        sws_format_name(c->dstFormat));
 961                 return;
 962         }
 963
 964         memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
 965     }
 966
 967 }