libswscale/yuv2rgb_altivec.c

   1 /*
   2   marc.hoffman@analog.com    March 8, 2004
   3
   4   Altivec Acceleration for Color Space Conversion revision 0.2
   5
   6   convert I420 YV12 to RGB in various formats,
   7     it rejects images that are not in 420 formats
   8     it rejects images that don't have widths of multiples of 16
   9     it rejects images that don't have heights of multiples of 2
  10   reject defers to C simulation codes.
  11
  12   lots of optimizations to be done here
  13
  14   1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
  15      so we currently use max min to clip
  16
  17   2. the inefficient use of chroma loading needs a bit of brushing up
  18
  19   3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
  20
  21
  22   MODIFIED to calculate coeffs from currently selected color space.
  23   MODIFIED core to be a macro which you spec the output format.
  24   ADDED UYVY conversion which is never called due to some thing in SWSCALE.
   25   CORRECTED algorithm selection to be strict on input formats.
  26   ADDED runtime detection of altivec.
  27
  28   ADDED altivec_yuv2packedX vertical scl + RGB converter
  29
  30   March 27,2004
  31   PERFORMANCE ANALYSIS
  32
  33   The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
  34   The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
  35
  36   720*480*30  ~10MPS
  37
  38   so we have roughly 10clocks per pixel this is too high something has to be wrong.
  39
  40   OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
  41
  42   OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
  43   guaranteed to have the input video frame it was just decompressed so
  44   it probably resides in L1 caches.  However we are creating the
  45   output video stream this needs to use the DSTST instruction to
  46   optimize for the cache.  We couple this with the fact that we are
  47   not going to be visiting the input buffer again so we mark it Least
  48   Recently Used.  This shaves 25% of the processor cycles off.
  49
  50   Now MEMCPY is the largest mips consumer in the system, probably due
  51   to the inefficient X11 stuff.
  52
  53   GL libraries seem to be very slow on this machine 1.33Ghz PB running
  54   Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
  55   a versioning issues, however i have libGL.1.2.dylib for both
  56   machines. ((We need to figure this out now))
  57
  58   GL2 libraries work now with patch for RGB32
  59
  60   NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
  61
  62   Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
  63 */
  64
  65 /*
  66  * This file is part of FFmpeg.
  67  *
  68  * FFmpeg is free software; you can redistribute it and/or modify
  69  * it under the terms of the GNU General Public License as published by
  70  * the Free Software Foundation; either version 2 of the License, or
  71  * (at your option) any later version.
  72  *
  73  * FFmpeg is distributed in the hope that it will be useful,
  74  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  75  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  76  * GNU General Public License for more details.
  77  *
  78  * You should have received a copy of the GNU General Public License
  79  * along with FFmpeg; if not, write to the Free Software
  80  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  81  */
  82
  83 #include <stdio.h>
  84 #include <stdlib.h>
  85 #include <string.h>
  86 #include <inttypes.h>
  87 #include <assert.h>
  88 #include "config.h"
  89 #ifdef HAVE_MALLOC_H
  90 #include <malloc.h>
  91 #endif
  92 #include "rgb2rgb.h"
  93 #include "swscale.h"
  94 #include "swscale_internal.h"
  95
  96 #undef PROFILE_THE_BEAST
  97 #undef INC_SCALING
  98
   99 typedef unsigned char ubyte;   /* unsigned 8-bit byte */
  100 typedef signed char   sbyte;   /* signed 8-bit byte */
 101
 102
  103 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
  104    homogeneous vector registers x0,x1,x2 are interleaved with the
  105    following technique:
  106
  107       o0 = vec_mergeh (x0,x1);
  108       o1 = vec_perm (o0, x2, perm_rgb_0);
  109       o2 = vec_perm (o0, x2, perm_rgb_1);
  110       o3 = vec_mergel (x0,x1);
  111       o4 = vec_perm (o3,o2,perm_rgb_2);
  112       o5 = vec_perm (o3,o2,perm_rgb_3);
  113
  114   perm_rgb_0:   o0(RG).h v1(B) --> o1*
  115               0   1  2   3   4
  116              rgbr|gbrg|brgb|rgbr
  117              0010 0100 1001 0010
  118              0102 3145 2673 894A
  119
  120   perm_rgb_1:   o0(RG).h v1(B) --> o2
  121               0   1  2   3   4
  122              gbrg|brgb|bbbb|bbbb
  123              0100 1001 1111 1111
  124              B5CD 6EF7 89AB CDEF
  125
  126   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
  127               0   1  2   3   4
  128              gbrg|brgb|rgbr|gbrg
  129              1111 1111 0010 0100
  130              89AB CDEF 0182 3945
  131
  132   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
  133               0   1  2   3   4
  134              brgb|rgbr|gbrg|brgb
  135              1001 0010 0100 1001
  136              a67b 89cA BdCD eEFf
  137   In each diagram the 0/1 row names the vec_perm operand (first/second) and
        the hex row the byte taken from it; the starred results o1, o4 and o5
        are the three output vectors.
  138 */
  139 static
  140 const vector unsigned char
  141   perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
  142                                       0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
  143   perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
  144                                       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
  145   perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
  146                                       0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
  147   perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
  148                                       0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
  149 /* vec_merge3: interleave three 16-byte planes into 48 packed bytes.
         y0,y1,y2 receive, in order, the triplets (x0[i],x1[i],x2[i]) for
         i = 0..15.  Note that the parameter list names the inputs in the
         order x2,x1,x0, so the FIRST argument ends up as the third byte of
         each triplet.  The temporaries o0,o2,o3 must not clash with the
         names passed in by the caller. */
  150 #define vec_merge3(x2,x1,x0,y0,y1,y2)    \
  151 do {                                     \
  152   typeof(x0) o0,o2,o3;                   \
  153       o0 = vec_mergeh (x0,x1);           \
  154       y0 = vec_perm (o0, x2, perm_rgb_0);\
  155       o2 = vec_perm (o0, x2, perm_rgb_1);\
  156       o3 = vec_mergel (x0,x1);           \
  157       y1 = vec_perm (o3,o2,perm_rgb_2);  \
  158       y2 = vec_perm (o3,o2,perm_rgb_3);  \
  159 } while(0)
 160
 161 #define vec_mstbgr24(x0,x1,x2,ptr)        \
 162 do {                                     \
 163   typeof(x0) _0,_1,_2;                   \
 164   vec_merge3 (x0,x1,x2,_0,_1,_2);        \
 165   vec_st (_0, 0, ptr++);                 \
 166   vec_st (_1, 0, ptr++);                 \
 167   vec_st (_2, 0, ptr++);                 \
 168 }  while (0);
 169
 170 #define vec_mstrgb24(x0,x1,x2,ptr)       \
 171 do {                                     \
 172   typeof(x0) _0,_1,_2;                   \
 173   vec_merge3 (x2,x1,x0,_0,_1,_2);        \
 174   vec_st (_0, 0, ptr++);                 \
 175   vec_st (_1, 0, ptr++);                 \
 176   vec_st (_2, 0, ptr++);                 \
 177 }  while (0);
 178
 179 /* pack the pixels in rgb0 format
 180    msb R
 181    lsb 0
 182 */
 183 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                                \
 184 do {                                                                                   \
 185   T _0,_1,_2,_3;                                                                       \
 186   _0 = vec_mergeh (x0,x1);                                                             \
 187   _1 = vec_mergeh (x2,x3);                                                             \
 188   _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
 189   _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
 190   vec_st (_2, 0*16, (T *)ptr);                                                         \
 191   vec_st (_3, 1*16, (T *)ptr);                                                         \
 192   _0 = vec_mergel (x0,x1);                                                             \
 193   _1 = vec_mergel (x2,x3);                                                             \
 194   _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
 195   _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
 196   vec_st (_2, 2*16, (T *)ptr);                                                         \
 197   vec_st (_3, 3*16, (T *)ptr);                                                         \
 198   ptr += 4;                                                                            \
 199 }  while (0);
 200
 201 /*
 202
 203   | 1     0       1.4021   | | Y |
 204   | 1    -0.3441 -0.7142   |x| Cb|
 205   | 1     1.7718  0        | | Cr|
 206
 207
 208   Y:      [-128 127]
 209   Cb/Cr : [-128 127]
 210
 211   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
 212
 213 */
 214
 215
 216
  217 /* vec_unh / vec_unl: widen the first / last 8 bytes of x to 16-bit lanes.
         Permute index 0x10 picks byte 0 of the all-zero second operand, so every
         source byte is paired with a leading zero byte (zero extension). */
  218 #define vec_unh(x) \
  219   (vector signed short) \
  220     vec_perm(x,(typeof(x))AVV(0),\
  221              (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
  222                                     0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
  223 #define vec_unl(x) \
  224   (vector signed short) \
  225     vec_perm(x,(typeof(x))AVV(0),\
  226              (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
  227                                     0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
  228 /* vec_clip_s16: clamp every signed 16-bit lane of x to the range [16,235]. */
  229 #define vec_clip_s16(x) \
  230   vec_max (vec_min (x, (vector signed short)AVV(235,235,235,235,235,235,235,235)),\
  231                        (vector signed short)AVV(16, 16, 16, 16, 16, 16, 16, 16 ))
  232 /* vec_packclp: clamp the signed 16-bit lanes of x and y below at 0, then
         pack both vectors into 16 unsigned bytes; vec_packs saturates the upper
         bound, so no explicit vec_min is needed. */
  233 #define vec_packclp(x,y) \
  234   (vector unsigned char)vec_packs \
  235       ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \
  236        (vector unsigned short)vec_max (y,(vector signed short) AVV(0)))
 237
 238 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
 239
 240
 241 static inline void cvtyuvtoRGB (SwsContext *c,
 242                            vector signed short Y, vector signed short U, vector signed short V,
 243                            vector signed short *R, vector signed short *G, vector signed short *B)
 244 {
 245   vector signed   short vx,ux,uvx;
 246
 247   Y = vec_mradds (Y, c->CY, c->OY);
 248   U  = vec_sub (U,(vector signed short)
 249                         vec_splat((vector signed short)AVV(128),0));
 250   V  = vec_sub (V,(vector signed short)
 251                         vec_splat((vector signed short)AVV(128),0));
 252
 253   //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
 254   ux = vec_sl (U, c->CSHIFT);
 255   *B = vec_mradds (ux, c->CBU, Y);
 256
 257   // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
 258   vx = vec_sl (V, c->CSHIFT);
 259   *R = vec_mradds (vx, c->CRV, Y);
 260
 261   // uvx = ((CGU*u) + (CGV*v))>>15;
 262   uvx = vec_mradds (U, c->CGU, Y);
 263   *G = vec_mradds (V, c->CGV, uvx);
 264 }
 265
 266
 267 /*
 268   ------------------------------------------------------------------------------
 269   CS converters
 270   ------------------------------------------------------------------------------
 271 */
  272 /* DEFCSP420_CVT(name,out_pixels) defines altivec_<name>(), which converts
  273    two luma rows plus the chroma row they share into packed pixels per pass.
         out_pixels(R,G,B,ptr) must store 16 pixels and advance ptr itself.
         The function processes srcSliceH/2 row pairs of c->srcW/16 groups of
         16 pixels and returns srcSliceH.
         NOTE(review): after a row, oute/outo have already been advanced by
         out_pixels and are then bumped by one more stride (in 16-byte units),
         which lands two rows down only if a row of output fills the whole
         stride -- confirm against callers that pass padded output buffers. */
  274 #define DEFCSP420_CVT(name,out_pixels)                                     \
  275 static int altivec_##name (SwsContext *c,                                  \
  276                                 unsigned char **in, int *instrides,        \
  277                                 int srcSliceY,  int srcSliceH,             \
  278                                 unsigned char **oplanes, int *outstrides)  \
  279 {                                                                          \
  280   int w = c->srcW;                                                         \
  281   int h = srcSliceH;                                                       \
  282   int i,j;                                                                 \
  283   int instrides_scl[3];                                                    \
  284   vector unsigned char y0,y1;                                              \
  285                                                                            \
  286   vector signed char  u,v;                                                 \
  287                                                                            \
  288   vector signed short Y0,Y1,Y2,Y3;                                         \
  289   vector signed short U,V;                                                 \
  290   vector signed short vx,ux,uvx;                                           \
  291   vector signed short vx0,ux0,uvx0;                                        \
  292   vector signed short vx1,ux1,uvx1;                                        \
  293   vector signed short R0,G0,B0;                                            \
  294   vector signed short R1,G1,B1;                                            \
  295   vector unsigned char R,G,B;                                              \
  296                                                                            \
  297   vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                       \
  298   vector unsigned char align_perm;                                         \
  299   /* conversion coefficients, copied from the context */                   \
  300   vector signed short                                                      \
  301     lCY  = c->CY,                                                          \
  302     lOY  = c->OY,                                                          \
  303     lCRV = c->CRV,                                                         \
  304     lCBU = c->CBU,                                                         \
  305     lCGU = c->CGU,                                                         \
  306     lCGV = c->CGV;                                                         \
  307                                                                            \
  308   vector unsigned short lCSHIFT = c->CSHIFT;                               \
  309   /* y1i/y2i: two consecutive luma rows; ui/vi: the chroma planes */       \
  310   ubyte *y1i   = in[0];                                                    \
  311   ubyte *y2i   = in[0]+instrides[0];                                       \
  312   ubyte *ui    = in[1];                                                    \
  313   ubyte *vi    = in[2];                                                    \
  314   /* oute/outo: output rows matching y1i/y2i */                            \
  315   vector unsigned char *oute                                               \
  316     = (vector unsigned char *)                                             \
  317         (oplanes[0]+srcSliceY*outstrides[0]);                              \
  318   vector unsigned char *outo                                               \
  319     = (vector unsigned char *)                                             \
  320         (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                \
  321                                                                            \
  322                                                                            \
  323   instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */  \
  324   instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */     \
  325   instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */     \
  326                                                                            \
  327                                                                            \
  328   for (i=0;i<h/2;i++) {                                                    \
  329     vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
  330     vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
  331                                                                            \
  332     for (j=0;j<w/16;j++) {                                                 \
  333       /* unaligned 16-byte loads: permute two aligned vectors */           \
  334       y1ivP = (vector unsigned char *)y1i;                                 \
  335       y2ivP = (vector unsigned char *)y2i;                                 \
  336       uivP = (vector unsigned char *)ui;                                   \
  337       vivP = (vector unsigned char *)vi;                                   \
  338                                                                            \
  339       align_perm = vec_lvsl (0, y1i);                                      \
  340       y0 = (vector unsigned char)vec_perm (y1ivP[0], y1ivP[1], align_perm);\
  341                                                                            \
  342       align_perm = vec_lvsl (0, y2i);                                      \
  343       y1 = (vector unsigned char)vec_perm (y2ivP[0], y2ivP[1], align_perm);\
  344                                                                            \
  345       align_perm = vec_lvsl (0, ui);                                       \
  346       u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     \
  347                                                                            \
  348       align_perm = vec_lvsl (0, vi);                                       \
  349       v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);     \
  350       /* subtract 128 to centre the chroma bytes on zero */                \
  351       u  = (vector signed char)                                            \
  352                 vec_sub (u,(vector signed char)                            \
  353                                 vec_splat((vector signed char)AVV(128),0));\
  354       v  = (vector signed char)                                            \
  355                 vec_sub (v,(vector signed char)                            \
  356                                 vec_splat((vector signed char)AVV(128),0));\
  357       /* widen the first 8 chroma bytes to signed 16-bit lanes */          \
  358       U  = vec_unpackh (u);                                                \
  359       V  = vec_unpackh (v);                                                \
  360                                                                            \
  361         /* zero-extend the 2x16 luma bytes to 16-bit lanes */              \
  362         Y0 = vec_unh (y0);                                                 \
  363         Y1 = vec_unl (y0);                                                 \
  364         Y2 = vec_unh (y1);                                                 \
  365         Y3 = vec_unl (y1);                                                 \
  366         /* apply the luma scale (CY) and offset (OY) */                    \
  367         Y0 = vec_mradds (Y0, lCY, lOY);                                    \
  368         Y1 = vec_mradds (Y1, lCY, lOY);                                    \
  369         Y2 = vec_mradds (Y2, lCY, lOY);                                    \
  370         Y3 = vec_mradds (Y3, lCY, lOY);                                    \
  371                                                                            \
  372         /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         \
  373         ux = vec_sl (U, lCSHIFT);                                          \
  374         ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));                   \
  375         ux0  = vec_mergeh (ux,ux);                                         \
  376         ux1  = vec_mergel (ux,ux);                                         \
  377                                                                            \
  378         /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         \
  379         vx = vec_sl (V, lCSHIFT);                                          \
  380         vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));                   \
  381         vx0  = vec_mergeh (vx,vx);                                         \
  382         vx1  = vec_mergel (vx,vx);                                         \
  383                                                                            \
  384         /* uvx = ((CGU*u) + (CGV*v))>>15 */                                \
  385         uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));                   \
  386         uvx = vec_mradds (V, lCGV, uvx);                                   \
  387         uvx0 = vec_mergeh (uvx,uvx);                                       \
  388         uvx1 = vec_mergel (uvx,uvx);                                       \
  389         /* row 1: each chroma term is shared by two adjacent pixels */     \
  390         R0 = vec_add (Y0,vx0);                                             \
  391         G0 = vec_add (Y0,uvx0);                                            \
  392         B0 = vec_add (Y0,ux0);                                             \
  393         R1 = vec_add (Y1,vx1);                                             \
  394         G1 = vec_add (Y1,uvx1);                                            \
  395         B1 = vec_add (Y1,ux1);                                             \
  396                                                                            \
  397         R  = vec_packclp (R0,R1);                                          \
  398         G  = vec_packclp (G0,G1);                                          \
  399         B  = vec_packclp (B0,B1);                                          \
  400                                                                            \
  401         out_pixels(R,G,B,oute);                                            \
  402         /* row 2: same chroma terms, second luma row */                    \
  403         R0 = vec_add (Y2,vx0);                                             \
  404         G0 = vec_add (Y2,uvx0);                                            \
  405         B0 = vec_add (Y2,ux0);                                             \
  406         R1 = vec_add (Y3,vx1);                                             \
  407         G1 = vec_add (Y3,uvx1);                                            \
  408         B1 = vec_add (Y3,ux1);                                             \
  409         R  = vec_packclp (R0,R1);                                          \
  410         G  = vec_packclp (G0,G1);                                          \
  411         B  = vec_packclp (B0,B1);                                          \
  412                                                                            \
  413                                                                            \
  414         out_pixels(R,G,B,outo);                                            \
  415                                                                            \
  416       y1i  += 16;                                                          \
  417       y2i  += 16;                                                          \
  418       ui   += 8;                                                           \
  419       vi   += 8;                                                           \
  420                                                                            \
  421     }                                                                      \
  422     /* advance every pointer to the next pair of rows */                   \
  423     outo += (outstrides[0])>>4;                                            \
  424     oute += (outstrides[0])>>4;                                            \
  425                                                                            \
  426     ui    += instrides_scl[1];                                             \
  427     vi    += instrides_scl[2];                                             \
  428     y1i   += instrides_scl[0];                                             \
  429     y2i   += instrides_scl[0];                                             \
  430   }                                                                        \
  431   return srcSliceH;                                                        \
  432 }
 433
  434 /* Output adapters, each invoked as out_xxx(R,G,B,ptr).  The macro name gives
         the order in which the bytes of a pixel are written to memory; the 32-bit
         variants write a zero byte in the 'a' position.  ptr is advanced past the
         16 pixels stored. */
  435 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr)
  436 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr)
  437 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr)
  438 #define out_argb(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr)
  439 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
  440 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
 441
 442 DEFCSP420_CVT (yuv2_abgr, out_abgr)
 443 #if 1
 444 DEFCSP420_CVT (yuv2_bgra, out_bgra)
 445 #else
 446 static int altivec_yuv2_bgra32 (SwsContext *c,
 447                                 unsigned char **in, int *instrides,
 448                                 int srcSliceY,  int srcSliceH,
 449                                 unsigned char **oplanes, int *outstrides)
 450 {
 451   int w = c->srcW;
 452   int h = srcSliceH;
 453   int i,j;
 454   int instrides_scl[3];
 455   vector unsigned char y0,y1;
 456
 457   vector signed char  u,v;
 458
 459   vector signed short Y0,Y1,Y2,Y3;
 460   vector signed short U,V;
 461   vector signed short vx,ux,uvx;
 462   vector signed short vx0,ux0,uvx0;
 463   vector signed short vx1,ux1,uvx1;
 464   vector signed short R0,G0,B0;
 465   vector signed short R1,G1,B1;
 466   vector unsigned char R,G,B;
 467
 468   vector unsigned char *uivP, *vivP;
 469   vector unsigned char align_perm;
 470
 471   vector signed short
 472     lCY  = c->CY,
 473     lOY  = c->OY,
 474     lCRV = c->CRV,
 475     lCBU = c->CBU,
 476     lCGU = c->CGU,
 477     lCGV = c->CGV;
 478
 479   vector unsigned short lCSHIFT = c->CSHIFT;
 480
 481   ubyte *y1i   = in[0];
 482   ubyte *y2i   = in[0]+w;
 483   ubyte *ui    = in[1];
 484   ubyte *vi    = in[2];
 485
 486   vector unsigned char *oute
 487     = (vector unsigned char *)
 488         (oplanes[0]+srcSliceY*outstrides[0]);
 489   vector unsigned char *outo
 490     = (vector unsigned char *)
 491         (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
 492
 493
 494   instrides_scl[0] = instrides[0];
 495   instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
 496   instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
 497
 498
 499   for (i=0;i<h/2;i++) {
 500     vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
 501     vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
 502
 503     for (j=0;j<w/16;j++) {
 504
 505       y0 = vec_ldl (0,y1i);
 506       y1 = vec_ldl (0,y2i);
 507       uivP = (vector unsigned char *)ui;
 508       vivP = (vector unsigned char *)vi;
 509
 510       align_perm = vec_lvsl (0, ui);
 511       u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
 512
 513       align_perm = vec_lvsl (0, vi);
 514       v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
 515       u  = (vector signed char)
 516                 vec_sub (u,(vector signed char)
 517                                 vec_splat((vector signed char)AVV(128),0));
 518
 519       v  = (vector signed char)
 520                 vec_sub (v, (vector signed char)
 521                                 vec_splat((vector signed char)AVV(128),0));
 522
 523       U  = vec_unpackh (u);
 524       V  = vec_unpackh (v);
 525
 526
 527         Y0 = vec_unh (y0);
 528         Y1 = vec_unl (y0);
 529         Y2 = vec_unh (y1);
 530         Y3 = vec_unl (y1);
 531
 532         Y0 = vec_mradds (Y0, lCY, lOY);
 533         Y1 = vec_mradds (Y1, lCY, lOY);
 534         Y2 = vec_mradds (Y2, lCY, lOY);
 535         Y3 = vec_mradds (Y3, lCY, lOY);
 536
 537         /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
 538         ux = vec_sl (U, lCSHIFT);
 539         ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
 540         ux0  = vec_mergeh (ux,ux);
 541         ux1  = vec_mergel (ux,ux);
 542
 543         /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */
 544         vx = vec_sl (V, lCSHIFT);
 545         vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
 546         vx0  = vec_mergeh (vx,vx);
 547         vx1  = vec_mergel (vx,vx);
 548         /* uvx = ((CGU*u) + (CGV*v))>>15 */
 549         uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
 550         uvx = vec_mradds (V, lCGV, uvx);
 551         uvx0 = vec_mergeh (uvx,uvx);
 552         uvx1 = vec_mergel (uvx,uvx);
 553         R0 = vec_add (Y0,vx0);
 554         G0 = vec_add (Y0,uvx0);
 555         B0 = vec_add (Y0,ux0);
 556         R1 = vec_add (Y1,vx1);
 557         G1 = vec_add (Y1,uvx1);
 558         B1 = vec_add (Y1,ux1);
 559         R  = vec_packclp (R0,R1);
 560         G  = vec_packclp (G0,G1);
 561         B  = vec_packclp (B0,B1);
 562
 563         out_argb(R,G,B,oute);
 564         R0 = vec_add (Y2,vx0);
 565         G0 = vec_add (Y2,uvx0);
 566         B0 = vec_add (Y2,ux0);
 567         R1 = vec_add (Y3,vx1);
 568         G1 = vec_add (Y3,uvx1);
 569         B1 = vec_add (Y3,ux1);
 570         R  = vec_packclp (R0,R1);
 571         G  = vec_packclp (G0,G1);
 572         B  = vec_packclp (B0,B1);
 573
 574         out_argb(R,G,B,outo);
 575         y1i  += 16;
 576         y2i  += 16;
 577         ui   += 8;
 578         vi   += 8;
 579
 580     }
 581
 582     outo += (outstrides[0])>>4;
 583     oute += (outstrides[0])>>4;
 584
 585     ui    += instrides_scl[1];
 586     vi    += instrides_scl[2];
 587     y1i   += instrides_scl[0];
 588     y2i   += instrides_scl[0];
 589   }
 590   return srcSliceH;
 591 }
 592
 593 #endif
 594
 595
 596 DEFCSP420_CVT (yuv2_rgba, out_rgba)
 597 DEFCSP420_CVT (yuv2_argb, out_argb)
 598 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
 599 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
 600
  601 /* vec_perm control vectors that split 16 bytes of packed UYVY into 16-bit
         lanes.  Index 0x10 takes byte 0 of the second vec_perm operand (the uses
         in this file pass a zero vector), so every lane is a source byte behind
         a zero byte.  demux_u and demux_v repeat each chroma byte for the two
         pixels that share it; demux_y takes the odd bytes in order. */
  602 // uyvy|uyvy|uyvy|uyvy
  603 // 0123 4567 89ab cdef
  604 static
  605 const vector unsigned char
  606   demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
  607                                    0x10,0x04,0x10,0x04,
  608                                    0x10,0x08,0x10,0x08,
  609                                    0x10,0x0c,0x10,0x0c),
  610   demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
  611                                    0x10,0x06,0x10,0x06,
  612                                    0x10,0x0A,0x10,0x0A,
  613                                    0x10,0x0E,0x10,0x0E),
  614   demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
  615                                    0x10,0x05,0x10,0x07,
  616                                    0x10,0x09,0x10,0x0B,
  617                                    0x10,0x0D,0x10,0x0F);
 618
 619 /*
 620   this is so I can play live CCIR raw video
 621 */
 622 static int altivec_uyvy_rgb32 (SwsContext *c,
 623                                unsigned char **in, int *instrides,
 624                                int srcSliceY,   int srcSliceH,
 625                                unsigned char **oplanes, int *outstrides)
 626 {
 627   int w = c->srcW;
 628   int h = srcSliceH;
 629   int i,j;
 630   vector unsigned char uyvy;
 631   vector signed   short Y,U,V;
 632   vector signed   short R0,G0,B0,R1,G1,B1;
 633   vector unsigned char  R,G,B;
 634   vector unsigned char *out;
 635   ubyte *img;
 636
 637   img = in[0];
 638   out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
 639
 640   for (i=0;i<h;i++) {
 641     for (j=0;j<w/16;j++) {
 642       uyvy = vec_ld (0, img);
 643       U = (vector signed short)
 644         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
 645
 646       V = (vector signed short)
 647         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
 648
 649       Y = (vector signed short)
 650         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
 651
 652       cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
 653
 654       uyvy = vec_ld (16, img);
 655       U = (vector signed short)
 656         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
 657
 658       V = (vector signed short)
 659         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
 660
 661       Y = (vector signed short)
 662         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
 663
 664       cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
 665
 666       R  = vec_packclp (R0,R1);
 667       G  = vec_packclp (G0,G1);
 668       B  = vec_packclp (B0,B1);
 669
 670       //      vec_mstbgr24 (R,G,B, out);
 671       out_rgba (R,G,B,out);
 672
 673       img += 32;
 674     }
 675   }
 676   return srcSliceH;
 677 }
 678
 679
 680
 681 /* Ok currently the acceleration routine only supports
 682    inputs of widths a multiple of 16
 683    and heights a multiple 2
 684
 685    So we just fall back to the C codes for this.
 686 */
 687 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
 688 {
 689   if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
 690     return NULL;
 691
 692   /*
 693     and this seems not to matter too much I tried a bunch of
 694     videos with abnormal widths and mplayer crashes else where.
 695     mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
 696     boom with X11 bad match.
 697
 698   */
 699   if ((c->srcW & 0xf) != 0)    return NULL;
 700
 701   switch (c->srcFormat) {
 702   case PIX_FMT_YUV410P:
 703   case PIX_FMT_YUV420P:
 704   /*case IMGFMT_CLPL:   ??? */
 705   case PIX_FMT_GRAY8:
 706   case PIX_FMT_NV12:
 707   case PIX_FMT_NV21:
 708     if ((c->srcH & 0x1) != 0)
 709       return NULL;
 710
 711     switch(c->dstFormat){
 712     case PIX_FMT_RGB24:
 713       MSG_WARN("ALTIVEC: Color Space RGB24\n");
 714       return altivec_yuv2_rgb24;
 715     case PIX_FMT_BGR24:
 716       MSG_WARN("ALTIVEC: Color Space BGR24\n");
 717       return altivec_yuv2_bgr24;
 718     case PIX_FMT_ARGB:
 719       MSG_WARN("ALTIVEC: Color Space ARGB\n");
 720       return altivec_yuv2_argb;
 721     case PIX_FMT_ABGR:
 722       MSG_WARN("ALTIVEC: Color Space ABGR\n");
 723       return altivec_yuv2_abgr;
 724     case PIX_FMT_RGBA:
 725       MSG_WARN("ALTIVEC: Color Space RGBA\n");
 726       return altivec_yuv2_rgba;
 727     case PIX_FMT_BGRA:
 728       MSG_WARN("ALTIVEC: Color Space BGRA\n");
 729       return altivec_yuv2_bgra;
 730     default: return NULL;
 731     }
 732     break;
 733
 734   case PIX_FMT_UYVY422:
 735     switch(c->dstFormat){
 736     case PIX_FMT_BGR32:
 737       MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
 738       return altivec_uyvy_rgb32;
 739     default: return NULL;
 740     }
 741     break;
 742
 743   }
 744   return NULL;
 745 }
 746
 747 static uint16_t roundToInt16(int64_t f){
 748         int r= (f + (1<<15))>>16;
 749              if(r<-0x7FFF) return 0x8000;
 750         else if(r> 0x7FFF) return 0x7FFF;
 751         else               return r;
 752 }
 753
 754 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
 755 {
 756   union {
 757         signed short tmp[8] __attribute__ ((aligned(16)));
 758         vector signed short vec;
 759         } buf;
 760
 761   buf.tmp[0] =  ( (0xffffLL) * contrast>>8 )>>9;                        //cy
 762   buf.tmp[1] =  -256*brightness;                                        //oy
 763   buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);     //crv
 764   buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);     //cbu
 765   buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));    //cgu
 766   buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));    //cgv
 767
 768
 769   c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
 770   c->CY  = vec_splat ((vector signed short)buf.vec, 0);
 771   c->OY  = vec_splat ((vector signed short)buf.vec, 1);
 772   c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
 773   c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
 774   c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
 775   c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
 776 #if 0
 777 {
 778 int i;
 779 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
 780 for (i=0; i<6;i++)
 781   printf("%s %d ", v[i],buf.tmp[i] );
 782   printf("\n");
 783 }
 784 #endif
 785  return;
 786 }
 787
 788
 789 void
 790 altivec_yuv2packedX (SwsContext *c,
 791                        int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
 792                        int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 793                        uint8_t *dest, int dstW, int dstY)
 794 {
 795   int i,j;
 796   vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
 797   vector signed short R0,G0,B0,R1,G1,B1;
 798
 799   vector unsigned char R,G,B;
 800   vector unsigned char *out,*nout;
 801
 802   vector signed short   RND = vec_splat_s16(1<<3);
 803   vector unsigned short SCL = vec_splat_u16(4);
 804   unsigned long scratch[16] __attribute__ ((aligned (16)));
 805
 806   vector signed short *YCoeffs, *CCoeffs;
 807
 808   YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
 809   CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
 810
 811   out = (vector unsigned char *)dest;
 812
 813   for(i=0; i<dstW; i+=16){
 814     Y0 = RND;
 815     Y1 = RND;
 816     /* extract 16 coeffs from lumSrc */
 817     for(j=0; j<lumFilterSize; j++) {
 818       X0 = vec_ld (0,  &lumSrc[j][i]);
 819       X1 = vec_ld (16, &lumSrc[j][i]);
 820       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
 821       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
 822     }
 823
 824     U = RND;
 825     V = RND;
 826     /* extract 8 coeffs from U,V */
 827     for(j=0; j<chrFilterSize; j++) {
 828       X  = vec_ld (0, &chrSrc[j][i/2]);
 829       U  = vec_mradds (X, CCoeffs[j], U);
 830       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
 831       V  = vec_mradds (X, CCoeffs[j], V);
 832     }
 833
 834     /* scale and clip signals */
 835     Y0 = vec_sra (Y0, SCL);
 836     Y1 = vec_sra (Y1, SCL);
 837     U  = vec_sra (U,  SCL);
 838     V  = vec_sra (V,  SCL);
 839
 840     Y0 = vec_clip_s16 (Y0);
 841     Y1 = vec_clip_s16 (Y1);
 842     U  = vec_clip_s16 (U);
 843     V  = vec_clip_s16 (V);
 844
 845     /* now we have
 846       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
 847       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
 848
 849       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
 850       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
 851       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
 852     */
 853
 854     U0 = vec_mergeh (U,U);
 855     V0 = vec_mergeh (V,V);
 856
 857     U1 = vec_mergel (U,U);
 858     V1 = vec_mergel (V,V);
 859
 860     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
 861     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
 862
 863     R  = vec_packclp (R0,R1);
 864     G  = vec_packclp (G0,G1);
 865     B  = vec_packclp (B0,B1);
 866
 867     switch(c->dstFormat) {
 868       case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
 869       case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
 870       case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
 871       case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
 872       case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
 873       case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
 874       default:
 875         {
 876           /* If this is reached, the caller should have called yuv2packedXinC
 877              instead. */
 878           static int printed_error_message;
 879           if(!printed_error_message) {
 880             MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
 881                     sws_format_name(c->dstFormat));
 882             printed_error_message=1;
 883           }
 884           return;
 885         }
 886     }
 887   }
 888
 889   if (i < dstW) {
 890     i -= 16;
 891
 892     Y0 = RND;
 893     Y1 = RND;
 894     /* extract 16 coeffs from lumSrc */
 895     for(j=0; j<lumFilterSize; j++) {
 896       X0 = vec_ld (0,  &lumSrc[j][i]);
 897       X1 = vec_ld (16, &lumSrc[j][i]);
 898       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
 899       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
 900     }
 901
 902     U = RND;
 903     V = RND;
 904     /* extract 8 coeffs from U,V */
 905     for(j=0; j<chrFilterSize; j++) {
 906       X  = vec_ld (0, &chrSrc[j][i/2]);
 907       U  = vec_mradds (X, CCoeffs[j], U);
 908       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
 909       V  = vec_mradds (X, CCoeffs[j], V);
 910     }
 911
 912     /* scale and clip signals */
 913     Y0 = vec_sra (Y0, SCL);
 914     Y1 = vec_sra (Y1, SCL);
 915     U  = vec_sra (U,  SCL);
 916     V  = vec_sra (V,  SCL);
 917
 918     Y0 = vec_clip_s16 (Y0);
 919     Y1 = vec_clip_s16 (Y1);
 920     U  = vec_clip_s16 (U);
 921     V  = vec_clip_s16 (V);
 922
 923     /* now we have
 924        Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
 925        U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
 926
 927        Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
 928        U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
 929        V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
 930     */
 931
 932     U0 = vec_mergeh (U,U);
 933     V0 = vec_mergeh (V,V);
 934
 935     U1 = vec_mergel (U,U);
 936     V1 = vec_mergel (V,V);
 937
 938     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
 939     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
 940
 941     R  = vec_packclp (R0,R1);
 942     G  = vec_packclp (G0,G1);
 943     B  = vec_packclp (B0,B1);
 944
 945     nout = (vector unsigned char *)scratch;
 946     switch(c->dstFormat) {
 947       case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
 948       case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
 949       case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
 950       case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
 951       case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
 952       case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
 953       default:
 954         /* Unreachable, I think. */
 955         MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
 956                 sws_format_name(c->dstFormat));
 957         return;
 958     }
 959
 960     memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
 961   }
 962
 963 }