libswscale/yuv2rgb_altivec.c

   1 /*
   2  * AltiVec acceleration for colorspace conversion
   3  *
   4  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 /*
  24 Convert I420 YV12 to RGB in various formats,
  25   it rejects images that are not in 420 formats,
  26   it rejects images that don't have widths of multiples of 16,
  27   it rejects images that don't have heights of multiples of 2.
  28 Reject defers to C simulation code.
  29
  30 Lots of optimizations to be done here.
  31
  32 1. Need to fix saturation code. I just couldn't get it to fly with packs
  33    and adds, so we currently use max/min to clip.
  34
  35 2. The inefficient use of chroma loading needs a bit of brushing up.
  36
  37 3. Analysis of pipeline stalls needs to be done. Use shark to identify
  38    pipeline stalls.
  39
  40
  41 MODIFIED to calculate coeffs from currently selected color space.
  42 MODIFIED core to be a macro where you specify the output format.
  43 ADDED UYVY conversion which is never called due to something in swscale.
  44 CORRECTED algorithm selection to be strict on input formats.
  45 ADDED runtime detection of AltiVec.
  46
  47 ADDED altivec_yuv2packedX vertical scl + RGB converter
  48
  49 March 27,2004
  50 PERFORMANCE ANALYSIS
  51
  52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
  53 used as test.
  54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
  55 same sequence.
  56
  57 720 * 480 * 30  ~10MPS
  58
  59 so we have roughly 10 clocks per pixel. This is too high, something has
  60 to be wrong.
  61
  62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
  63 need for vec_min.
  64
  65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
  66 the input video frame, it was just decompressed so it probably resides in L1
  67 caches. However, we are creating the output video stream. This needs to use the
  68 DSTST instruction to optimize for the cache. We couple this with the fact that
  69 we are not going to be visiting the input buffer again so we mark it Least
  70 Recently Used. This shaves 25% of the processor cycles off.
  71
  72 Now memcpy is the largest mips consumer in the system, probably due
  73 to the inefficient X11 stuff.
  74
  75 GL libraries seem to be very slow on this machine 1.33Ghz PB running
  76 Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
  77 a versioning issue, however I have libGL.1.2.dylib for both
  78 machines. (We need to figure this out now.)
  79
  80 GL2 libraries work now with patch for RGB32.
  81
  82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
  83
  84 Integrated luma prescaling adjustment for saturation/contrast/brightness
  85 adjustment.
  86 */
  87
  88 #include <stdio.h>
  89 #include <stdlib.h>
  90 #include <string.h>
  91 #include <inttypes.h>
  92 #include <assert.h>
  93 #include "config.h"
  94 #ifdef HAVE_MALLOC_H
  95 #include <malloc.h>
  96 #endif
  97 #include "rgb2rgb.h"
  98 #include "swscale.h"
  99 #include "swscale_internal.h"
 100
 101 #undef PROFILE_THE_BEAST
 102 #undef INC_SCALING
 103
 104 typedef unsigned char ubyte;
 105 typedef signed char   sbyte;
 106
 107
 108 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
 109    homogeneous vector registers x0,x1,x2 are interleaved with the
 110    following technique:
 111
 112       o0 = vec_mergeh (x0,x1);
 113       o1 = vec_perm (o0, x2, perm_rgb_0);
 114       o2 = vec_perm (o0, x2, perm_rgb_1);
 115       o3 = vec_mergel (x0,x1);
 116       o4 = vec_perm (o3,o2,perm_rgb_2);
 117       o5 = vec_perm (o3,o2,perm_rgb_3);
 118
 119   perm_rgb_0:   o0(RG).h v1(B) --> o1*
 120               0   1  2   3   4
 121              rgbr|gbrg|brgb|rgbr
 122              0010 0100 1001 0010
 123              0102 3145 2673 894A
 124
 125   perm_rgb_1:   o0(RG).h v1(B) --> o2
 126               0   1  2   3   4
 127              gbrg|brgb|bbbb|bbbb
 128              0100 1001 1111 1111
 129              B5CD 6EF7 89AB CDEF
 130
 131   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
 132               0   1  2   3   4
 133              gbrg|brgb|rgbr|gbrg
 134              1111 1111 0010 0100
 135              89AB CDEF 0182 3945
 136
 137   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
 138               0   1  2   3   4
 139              brgb|rgbr|gbrg|brgb
 140              1001 0010 0100 1001
 141              a67b 89cA BdCD eEFf
 142
 143 */
 144 static
 145 const vector unsigned char
 146   perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
 147                 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
 148   perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
 149                 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
 150   perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
 151                 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
 152   perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
 153                 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
 154
 155 #define vec_merge3(x2,x1,x0,y0,y1,y2)       \
 156 do {                                        \
 157     __typeof__(x0) o0,o2,o3;                \
 158         o0 = vec_mergeh (x0,x1);            \
 159         y0 = vec_perm (o0, x2, perm_rgb_0); \
 160         o2 = vec_perm (o0, x2, perm_rgb_1); \
 161         o3 = vec_mergel (x0,x1);            \
 162         y1 = vec_perm (o3,o2,perm_rgb_2);   \
 163         y2 = vec_perm (o3,o2,perm_rgb_3);   \
 164 } while(0)
 165
 166 #define vec_mstbgr24(x0,x1,x2,ptr)      \
 167 do {                                    \
 168     __typeof__(x0) _0,_1,_2;            \
 169     vec_merge3 (x0,x1,x2,_0,_1,_2);     \
 170     vec_st (_0, 0, ptr++);              \
 171     vec_st (_1, 0, ptr++);              \
 172     vec_st (_2, 0, ptr++);              \
 173 }  while (0);
 174
 175 #define vec_mstrgb24(x0,x1,x2,ptr)      \
 176 do {                                    \
 177     __typeof__(x0) _0,_1,_2;            \
 178     vec_merge3 (x2,x1,x0,_0,_1,_2);     \
 179     vec_st (_0, 0, ptr++);              \
 180     vec_st (_1, 0, ptr++);              \
 181     vec_st (_2, 0, ptr++);              \
 182 }  while (0);
 183
 184 /* pack the pixels in rgb0 format
 185    msb R
 186    lsb 0
 187 */
 188 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
 189 do {                                                                          \
 190     T _0,_1,_2,_3;                                                            \
 191     _0 = vec_mergeh (x0,x1);                                                  \
 192     _1 = vec_mergeh (x2,x3);                                                  \
 193     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
 194     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
 195     vec_st (_2, 0*16, (T *)ptr);                                              \
 196     vec_st (_3, 1*16, (T *)ptr);                                              \
 197     _0 = vec_mergel (x0,x1);                                                  \
 198     _1 = vec_mergel (x2,x3);                                                  \
 199     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
 200     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
 201     vec_st (_2, 2*16, (T *)ptr);                                              \
 202     vec_st (_3, 3*16, (T *)ptr);                                              \
 203     ptr += 4;                                                                 \
 204 }  while (0);
 205
 206 /*
 207
 208   | 1     0       1.4021   | | Y |
 209   | 1    -0.3441 -0.7142   |x| Cb|
 210   | 1     1.7718  0        | | Cr|
 211
 212
 213   Y:      [-128 127]
 214   Cb/Cr : [-128 127]
 215
 216   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
 217
 218 */
 219
 220
 221
 222
 223 #define vec_unh(x) \
 224     (vector signed short) \
 225         vec_perm(x,(__typeof__(x)){0}, \
 226                  ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
 227                                          0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
 228 #define vec_unl(x) \
 229     (vector signed short) \
 230         vec_perm(x,(__typeof__(x)){0}, \
 231                  ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
 232                                          0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
 233
 234 #define vec_clip_s16(x) \
 235     vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
 236                          ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
 237
 238 #define vec_packclp(x,y) \
 239     (vector unsigned char)vec_packs \
 240         ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
 241          (vector unsigned short)vec_max (y,((vector signed short) {0})))
 242
 243 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){0}),a,a,a,ptr)
 244
 245
 246 static inline void cvtyuvtoRGB (SwsContext *c,
 247                                 vector signed short Y, vector signed short U, vector signed short V,
 248                                 vector signed short *R, vector signed short *G, vector signed short *B)
 249 {
 250     vector signed   short vx,ux,uvx;
 251
 252     Y = vec_mradds (Y, c->CY, c->OY);
 253     U  = vec_sub (U,(vector signed short)
 254                     vec_splat((vector signed short){128},0));
 255     V  = vec_sub (V,(vector signed short)
 256                     vec_splat((vector signed short){128},0));
 257
 258     //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
 259     ux = vec_sl (U, c->CSHIFT);
 260     *B = vec_mradds (ux, c->CBU, Y);
 261
 262     // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
 263     vx = vec_sl (V, c->CSHIFT);
 264     *R = vec_mradds (vx, c->CRV, Y);
 265
 266     // uvx = ((CGU*u) + (CGV*v))>>15;
 267     uvx = vec_mradds (U, c->CGU, Y);
 268     *G  = vec_mradds (V, c->CGV, uvx);
 269 }
 270
 271
 272 /*
 273   ------------------------------------------------------------------------------
 274   CS converters
 275   ------------------------------------------------------------------------------
 276 */
 277
 278
 279 #define DEFCSP420_CVT(name,out_pixels)                                  \
 280 static int altivec_##name (SwsContext *c,                               \
 281                            unsigned char **in, int *instrides,          \
 282                            int srcSliceY,        int srcSliceH,         \
 283                            unsigned char **oplanes, int *outstrides)    \
 284 {                                                                       \
 285     int w = c->srcW;                                                    \
 286     int h = srcSliceH;                                                  \
 287     int i,j;                                                            \
 288     int instrides_scl[3];                                               \
 289     vector unsigned char y0,y1;                                         \
 290                                                                         \
 291     vector signed char  u,v;                                            \
 292                                                                         \
 293     vector signed short Y0,Y1,Y2,Y3;                                    \
 294     vector signed short U,V;                                            \
 295     vector signed short vx,ux,uvx;                                      \
 296     vector signed short vx0,ux0,uvx0;                                   \
 297     vector signed short vx1,ux1,uvx1;                                   \
 298     vector signed short R0,G0,B0;                                       \
 299     vector signed short R1,G1,B1;                                       \
 300     vector unsigned char R,G,B;                                         \
 301                                                                         \
 302     vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
 303     vector unsigned char align_perm;                                    \
 304                                                                         \
 305     vector signed short                                                 \
 306         lCY  = c->CY,                                                   \
 307         lOY  = c->OY,                                                   \
 308         lCRV = c->CRV,                                                  \
 309         lCBU = c->CBU,                                                  \
 310         lCGU = c->CGU,                                                  \
 311         lCGV = c->CGV;                                                  \
 312                                                                         \
 313     vector unsigned short lCSHIFT = c->CSHIFT;                          \
 314                                                                         \
 315     ubyte *y1i   = in[0];                                               \
 316     ubyte *y2i   = in[0]+instrides[0];                                  \
 317     ubyte *ui    = in[1];                                               \
 318     ubyte *vi    = in[2];                                               \
 319                                                                         \
 320     vector unsigned char *oute                                          \
 321         = (vector unsigned char *)                                      \
 322             (oplanes[0]+srcSliceY*outstrides[0]);                       \
 323     vector unsigned char *outo                                          \
 324         = (vector unsigned char *)                                      \
 325             (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
 326                                                                         \
 327                                                                         \
 328     instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
 329     instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
 330     instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
 331                                                                         \
 332                                                                         \
 333     for (i=0;i<h/2;i++) {                                               \
 334         vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
 335         vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
 336                                                                         \
 337         for (j=0;j<w/16;j++) {                                          \
 338                                                                         \
 339             y1ivP = (vector unsigned char *)y1i;                        \
 340             y2ivP = (vector unsigned char *)y2i;                        \
 341             uivP  = (vector unsigned char *)ui;                         \
 342             vivP  = (vector unsigned char *)vi;                         \
 343                                                                         \
 344             align_perm = vec_lvsl (0, y1i);                             \
 345             y0 = (vector unsigned char)                                 \
 346                  vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
 347                                                                         \
 348             align_perm = vec_lvsl (0, y2i);                             \
 349             y1 = (vector unsigned char)                                 \
 350                  vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
 351                                                                         \
 352             align_perm = vec_lvsl (0, ui);                              \
 353             u = (vector signed char)                                    \
 354                 vec_perm (uivP[0], uivP[1], align_perm);                \
 355                                                                         \
 356             align_perm = vec_lvsl (0, vi);                              \
 357             v = (vector signed char)                                    \
 358                 vec_perm (vivP[0], vivP[1], align_perm);                \
 359                                                                         \
 360             u  = (vector signed char)                                   \
 361                  vec_sub (u,(vector signed char)                        \
 362                           vec_splat((vector signed char){128},0));      \
 363             v  = (vector signed char)                                   \
 364                  vec_sub (v,(vector signed char)                        \
 365                           vec_splat((vector signed char){128},0));      \
 366                                                                         \
 367             U  = vec_unpackh (u);                                       \
 368             V  = vec_unpackh (v);                                       \
 369                                                                         \
 370                                                                         \
 371             Y0 = vec_unh (y0);                                          \
 372             Y1 = vec_unl (y0);                                          \
 373             Y2 = vec_unh (y1);                                          \
 374             Y3 = vec_unl (y1);                                          \
 375                                                                         \
 376             Y0 = vec_mradds (Y0, lCY, lOY);                             \
 377             Y1 = vec_mradds (Y1, lCY, lOY);                             \
 378             Y2 = vec_mradds (Y2, lCY, lOY);                             \
 379             Y3 = vec_mradds (Y3, lCY, lOY);                             \
 380                                                                         \
 381             /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
 382             ux = vec_sl (U, lCSHIFT);                                   \
 383             ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
 384             ux0  = vec_mergeh (ux,ux);                                  \
 385             ux1  = vec_mergel (ux,ux);                                  \
 386                                                                         \
 387             /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
 388             vx = vec_sl (V, lCSHIFT);                                   \
 389             vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
 390             vx0  = vec_mergeh (vx,vx);                                  \
 391             vx1  = vec_mergel (vx,vx);                                  \
 392                                                                         \
 393             /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
 394             uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
 395             uvx = vec_mradds (V, lCGV, uvx);                            \
 396             uvx0 = vec_mergeh (uvx,uvx);                                \
 397             uvx1 = vec_mergel (uvx,uvx);                                \
 398                                                                         \
 399             R0 = vec_add (Y0,vx0);                                      \
 400             G0 = vec_add (Y0,uvx0);                                     \
 401             B0 = vec_add (Y0,ux0);                                      \
 402             R1 = vec_add (Y1,vx1);                                      \
 403             G1 = vec_add (Y1,uvx1);                                     \
 404             B1 = vec_add (Y1,ux1);                                      \
 405                                                                         \
 406             R  = vec_packclp (R0,R1);                                   \
 407             G  = vec_packclp (G0,G1);                                   \
 408             B  = vec_packclp (B0,B1);                                   \
 409                                                                         \
 410             out_pixels(R,G,B,oute);                                     \
 411                                                                         \
 412             R0 = vec_add (Y2,vx0);                                      \
 413             G0 = vec_add (Y2,uvx0);                                     \
 414             B0 = vec_add (Y2,ux0);                                      \
 415             R1 = vec_add (Y3,vx1);                                      \
 416             G1 = vec_add (Y3,uvx1);                                     \
 417             B1 = vec_add (Y3,ux1);                                      \
 418             R  = vec_packclp (R0,R1);                                   \
 419             G  = vec_packclp (G0,G1);                                   \
 420             B  = vec_packclp (B0,B1);                                   \
 421                                                                         \
 422                                                                         \
 423             out_pixels(R,G,B,outo);                                     \
 424                                                                         \
 425             y1i  += 16;                                                 \
 426             y2i  += 16;                                                 \
 427             ui   += 8;                                                  \
 428             vi   += 8;                                                  \
 429                                                                         \
 430         }                                                               \
 431                                                                         \
 432         outo  += (outstrides[0])>>4;                                    \
 433         oute  += (outstrides[0])>>4;                                    \
 434                                                                         \
 435         ui    += instrides_scl[1];                                      \
 436         vi    += instrides_scl[2];                                      \
 437         y1i   += instrides_scl[0];                                      \
 438         y2i   += instrides_scl[0];                                      \
 439     }                                                                   \
 440     return srcSliceH;                                                   \
 441 }
 442
 443
 444 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){0}),c,b,a,ptr)
 445 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){0}),ptr)
 446 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){0}),ptr)
 447 #define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){0}),a,b,c,ptr)
 448 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
 449 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
 450
 451 DEFCSP420_CVT (yuv2_abgr, out_abgr)
 452 #if 1
 453 DEFCSP420_CVT (yuv2_bgra, out_bgra)
 454 #else
 455 static int altivec_yuv2_bgra32 (SwsContext *c,
 456                                 unsigned char **in, int *instrides,
 457                                 int srcSliceY,        int srcSliceH,
 458                                 unsigned char **oplanes, int *outstrides)
 459 {
 460     int w = c->srcW;
 461     int h = srcSliceH;
 462     int i,j;
 463     int instrides_scl[3];
 464     vector unsigned char y0,y1;
 465
 466     vector signed char  u,v;
 467
 468     vector signed short Y0,Y1,Y2,Y3;
 469     vector signed short U,V;
 470     vector signed short vx,ux,uvx;
 471     vector signed short vx0,ux0,uvx0;
 472     vector signed short vx1,ux1,uvx1;
 473     vector signed short R0,G0,B0;
 474     vector signed short R1,G1,B1;
 475     vector unsigned char R,G,B;
 476
 477     vector unsigned char *uivP, *vivP;
 478     vector unsigned char align_perm;
 479
 480     vector signed short
 481         lCY  = c->CY,
 482         lOY  = c->OY,
 483         lCRV = c->CRV,
 484         lCBU = c->CBU,
 485         lCGU = c->CGU,
 486         lCGV = c->CGV;
 487
 488     vector unsigned short lCSHIFT = c->CSHIFT;
 489
 490     ubyte *y1i   = in[0];
 491     ubyte *y2i   = in[0]+w;
 492     ubyte *ui    = in[1];
 493     ubyte *vi    = in[2];
 494
 495     vector unsigned char *oute
 496         = (vector unsigned char *)
 497           (oplanes[0]+srcSliceY*outstrides[0]);
 498     vector unsigned char *outo
 499         = (vector unsigned char *)
 500           (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
 501
 502
 503     instrides_scl[0] = instrides[0];
 504     instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
 505     instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
 506
 507
 508     for (i=0;i<h/2;i++) {
 509         vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
 510         vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
 511
 512         for (j=0;j<w/16;j++) {
 513
 514             y0 = vec_ldl (0,y1i);
 515             y1 = vec_ldl (0,y2i);
 516             uivP = (vector unsigned char *)ui;
 517             vivP = (vector unsigned char *)vi;
 518
 519             align_perm = vec_lvsl (0, ui);
 520             u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
 521
 522             align_perm = vec_lvsl (0, vi);
 523             v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
 524             u  = (vector signed char)
 525                  vec_sub (u,(vector signed char)
 526                           vec_splat((vector signed char){128},0));
 527
 528             v  = (vector signed char)
 529                  vec_sub (v, (vector signed char)
 530                           vec_splat((vector signed char){128},0));
 531
 532             U  = vec_unpackh (u);
 533             V  = vec_unpackh (v);
 534
 535
 536             Y0 = vec_unh (y0);
 537             Y1 = vec_unl (y0);
 538             Y2 = vec_unh (y1);
 539             Y3 = vec_unl (y1);
 540
 541             Y0 = vec_mradds (Y0, lCY, lOY);
 542             Y1 = vec_mradds (Y1, lCY, lOY);
 543             Y2 = vec_mradds (Y2, lCY, lOY);
 544             Y3 = vec_mradds (Y3, lCY, lOY);
 545
 546             /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
 547             ux = vec_sl (U, lCSHIFT);
 548             ux = vec_mradds (ux, lCBU, (vector signed short){0});
 549             ux0  = vec_mergeh (ux,ux);
 550             ux1  = vec_mergel (ux,ux);
 551
 552             /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
 553             vx = vec_sl (V, lCSHIFT);
 554             vx = vec_mradds (vx, lCRV, (vector signed short){0});
 555             vx0  = vec_mergeh (vx,vx);
 556             vx1  = vec_mergel (vx,vx);
 557             /* uvx = ((CGU*u) + (CGV*v))>>15 */
 558             uvx = vec_mradds (U, lCGU, (vector signed short){0});
 559             uvx = vec_mradds (V, lCGV, uvx);
 560             uvx0 = vec_mergeh (uvx,uvx);
 561             uvx1 = vec_mergel (uvx,uvx);
 562             R0 = vec_add (Y0,vx0);
 563             G0 = vec_add (Y0,uvx0);
 564             B0 = vec_add (Y0,ux0);
 565             R1 = vec_add (Y1,vx1);
 566             G1 = vec_add (Y1,uvx1);
 567             B1 = vec_add (Y1,ux1);
 568             R  = vec_packclp (R0,R1);
 569             G  = vec_packclp (G0,G1);
 570             B  = vec_packclp (B0,B1);
 571
 572             out_argb(R,G,B,oute);
 573             R0 = vec_add (Y2,vx0);
 574             G0 = vec_add (Y2,uvx0);
 575             B0 = vec_add (Y2,ux0);
 576             R1 = vec_add (Y3,vx1);
 577             G1 = vec_add (Y3,uvx1);
 578             B1 = vec_add (Y3,ux1);
 579             R  = vec_packclp (R0,R1);
 580             G  = vec_packclp (G0,G1);
 581             B  = vec_packclp (B0,B1);
 582
 583             out_argb(R,G,B,outo);
 584             y1i  += 16;
 585             y2i  += 16;
 586             ui   += 8;
 587             vi   += 8;
 588
 589         }
 590
 591         outo  += (outstrides[0])>>4;
 592         oute  += (outstrides[0])>>4;
 593
 594         ui    += instrides_scl[1];
 595         vi    += instrides_scl[2];
 596         y1i   += instrides_scl[0];
 597         y2i   += instrides_scl[0];
 598     }
 599     return srcSliceH;
 600 }
 601
 602 #endif
 603
 604
 605 DEFCSP420_CVT (yuv2_rgba, out_rgba)
 606 DEFCSP420_CVT (yuv2_argb, out_argb)
 607 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
 608 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
 609
 610
 611 // uyvy|uyvy|uyvy|uyvy
 612 // 0123 4567 89ab cdef
 613 static
 614 const vector unsigned char
 615     demux_u = {0x10,0x00,0x10,0x00,
 616                0x10,0x04,0x10,0x04,
 617                0x10,0x08,0x10,0x08,
 618                0x10,0x0c,0x10,0x0c},
 619     demux_v = {0x10,0x02,0x10,0x02,
 620                0x10,0x06,0x10,0x06,
 621                0x10,0x0A,0x10,0x0A,
 622                0x10,0x0E,0x10,0x0E},
 623     demux_y = {0x10,0x01,0x10,0x03,
 624                0x10,0x05,0x10,0x07,
 625                0x10,0x09,0x10,0x0B,
 626                0x10,0x0D,0x10,0x0F};
 627
 628 /*
 629   this is so I can play live CCIR raw video
 630 */
 631 static int altivec_uyvy_rgb32 (SwsContext *c,
 632                                unsigned char **in, int *instrides,
 633                                int srcSliceY,        int srcSliceH,
 634                                unsigned char **oplanes, int *outstrides)
 635 {
 636     int w = c->srcW;
 637     int h = srcSliceH;
 638     int i,j;
 639     vector unsigned char uyvy;
 640     vector signed   short Y,U,V;
 641     vector signed   short R0,G0,B0,R1,G1,B1;
 642     vector unsigned char  R,G,B;
 643     vector unsigned char *out;
 644     ubyte *img;
 645
 646     img = in[0];
 647     out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
 648
 649     for (i=0;i<h;i++) {
 650         for (j=0;j<w/16;j++) {
 651             uyvy = vec_ld (0, img);
 652             U = (vector signed short)
 653                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
 654
 655             V = (vector signed short)
 656                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
 657
 658             Y = (vector signed short)
 659                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
 660
 661             cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
 662
 663             uyvy = vec_ld (16, img);
 664             U = (vector signed short)
 665                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
 666
 667             V = (vector signed short)
 668                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
 669
 670             Y = (vector signed short)
 671                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
 672
 673             cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
 674
 675             R  = vec_packclp (R0,R1);
 676             G  = vec_packclp (G0,G1);
 677             B  = vec_packclp (B0,B1);
 678
 679             //      vec_mstbgr24 (R,G,B, out);
 680             out_rgba (R,G,B,out);
 681
 682             img += 32;
 683         }
 684     }
 685     return srcSliceH;
 686 }
 687
 688
 689
 690 /* Ok currently the acceleration routine only supports
 691    inputs of widths a multiple of 16
 692    and heights a multiple 2
 693
 694    So we just fall back to the C codes for this.
 695 */
 696 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
 697 {
 698     if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
 699         return NULL;
 700
 701     /*
 702       and this seems not to matter too much I tried a bunch of
 703       videos with abnormal widths and MPlayer crashes elsewhere.
 704       mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
 705       boom with X11 bad match.
 706
 707     */
 708     if ((c->srcW & 0xf) != 0)    return NULL;
 709
 710     switch (c->srcFormat) {
 711     case PIX_FMT_YUV410P:
 712     case PIX_FMT_YUV420P:
 713     /*case IMGFMT_CLPL:        ??? */
 714     case PIX_FMT_GRAY8:
 715     case PIX_FMT_NV12:
 716     case PIX_FMT_NV21:
 717         if ((c->srcH & 0x1) != 0)
 718             return NULL;
 719
 720         switch(c->dstFormat){
 721         case PIX_FMT_RGB24:
 722             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
 723             return altivec_yuv2_rgb24;
 724         case PIX_FMT_BGR24:
 725             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
 726             return altivec_yuv2_bgr24;
 727         case PIX_FMT_ARGB:
 728             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
 729             return altivec_yuv2_argb;
 730         case PIX_FMT_ABGR:
 731             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
 732             return altivec_yuv2_abgr;
 733         case PIX_FMT_RGBA:
 734             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
 735             return altivec_yuv2_rgba;
 736         case PIX_FMT_BGRA:
 737             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
 738             return altivec_yuv2_bgra;
 739         default: return NULL;
 740         }
 741         break;
 742
 743     case PIX_FMT_UYVY422:
 744         switch(c->dstFormat){
 745         case PIX_FMT_BGR32:
 746             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
 747             return altivec_uyvy_rgb32;
 748         default: return NULL;
 749         }
 750         break;
 751
 752     }
 753     return NULL;
 754 }
 755
 756 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
 757 {
 758     union {
 759         signed short tmp[8] __attribute__ ((aligned(16)));
 760         vector signed short vec;
 761     } buf;
 762
 763     buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
 764     buf.tmp[1] =  -256*brightness;                                      //oy
 765     buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
 766     buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
 767     buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
 768     buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
 769
 770
 771     c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
 772     c->CY   = vec_splat ((vector signed short)buf.vec, 0);
 773     c->OY   = vec_splat ((vector signed short)buf.vec, 1);
 774     c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
 775     c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
 776     c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
 777     c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
 778 #if 0
 779     {
 780     int i;
 781     char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
 782     for (i=0; i<6; i++)
 783         printf("%s %d ", v[i],buf.tmp[i] );
 784         printf("\n");
 785     }
 786 #endif
 787     return;
 788 }
 789
 790
 791 void
 792 altivec_yuv2packedX (SwsContext *c,
 793                      int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
 794                      int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 795                      uint8_t *dest, int dstW, int dstY)
 796 {
 797     int i,j;
 798     vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
 799     vector signed short R0,G0,B0,R1,G1,B1;
 800
 801     vector unsigned char R,G,B;
 802     vector unsigned char *out,*nout;
 803
 804     vector signed short   RND = vec_splat_s16(1<<3);
 805     vector unsigned short SCL = vec_splat_u16(4);
 806     unsigned long scratch[16] __attribute__ ((aligned (16)));
 807
 808     vector signed short *YCoeffs, *CCoeffs;
 809
 810     YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
 811     CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
 812
 813     out = (vector unsigned char *)dest;
 814
 815     for (i=0; i<dstW; i+=16){
 816         Y0 = RND;
 817         Y1 = RND;
 818         /* extract 16 coeffs from lumSrc */
 819         for (j=0; j<lumFilterSize; j++) {
 820             X0 = vec_ld (0,  &lumSrc[j][i]);
 821             X1 = vec_ld (16, &lumSrc[j][i]);
 822             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
 823             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
 824         }
 825
 826         U = RND;
 827         V = RND;
 828         /* extract 8 coeffs from U,V */
 829         for (j=0; j<chrFilterSize; j++) {
 830             X  = vec_ld (0, &chrSrc[j][i/2]);
 831             U  = vec_mradds (X, CCoeffs[j], U);
 832             X  = vec_ld (0, &chrSrc[j][i/2+2048]);
 833             V  = vec_mradds (X, CCoeffs[j], V);
 834         }
 835
 836         /* scale and clip signals */
 837         Y0 = vec_sra (Y0, SCL);
 838         Y1 = vec_sra (Y1, SCL);
 839         U  = vec_sra (U,  SCL);
 840         V  = vec_sra (V,  SCL);
 841
 842         Y0 = vec_clip_s16 (Y0);
 843         Y1 = vec_clip_s16 (Y1);
 844         U  = vec_clip_s16 (U);
 845         V  = vec_clip_s16 (V);
 846
 847         /* now we have
 848           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
 849           U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
 850
 851           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
 852           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
 853           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
 854         */
 855
 856         U0 = vec_mergeh (U,U);
 857         V0 = vec_mergeh (V,V);
 858
 859         U1 = vec_mergel (U,U);
 860         V1 = vec_mergel (V,V);
 861
 862         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
 863         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
 864
 865         R  = vec_packclp (R0,R1);
 866         G  = vec_packclp (G0,G1);
 867         B  = vec_packclp (B0,B1);
 868
 869         switch(c->dstFormat) {
 870             case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
 871             case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
 872             case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
 873             case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
 874             case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
 875             case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
 876             default:
 877             {
 878                 /* If this is reached, the caller should have called yuv2packedXinC
 879                    instead. */
 880                 static int printed_error_message;
 881                 if (!printed_error_message) {
 882                     av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
 883                            sws_format_name(c->dstFormat));
 884                     printed_error_message=1;
 885                 }
 886                 return;
 887             }
 888         }
 889     }
 890
 891     if (i < dstW) {
 892         i -= 16;
 893
 894         Y0 = RND;
 895         Y1 = RND;
 896         /* extract 16 coeffs from lumSrc */
 897         for (j=0; j<lumFilterSize; j++) {
 898             X0 = vec_ld (0,  &lumSrc[j][i]);
 899             X1 = vec_ld (16, &lumSrc[j][i]);
 900             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
 901             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
 902         }
 903
 904         U = RND;
 905         V = RND;
 906         /* extract 8 coeffs from U,V */
 907         for (j=0; j<chrFilterSize; j++) {
 908             X  = vec_ld (0, &chrSrc[j][i/2]);
 909             U  = vec_mradds (X, CCoeffs[j], U);
 910             X  = vec_ld (0, &chrSrc[j][i/2+2048]);
 911             V  = vec_mradds (X, CCoeffs[j], V);
 912         }
 913
 914         /* scale and clip signals */
 915         Y0 = vec_sra (Y0, SCL);
 916         Y1 = vec_sra (Y1, SCL);
 917         U  = vec_sra (U,  SCL);
 918         V  = vec_sra (V,  SCL);
 919
 920         Y0 = vec_clip_s16 (Y0);
 921         Y1 = vec_clip_s16 (Y1);
 922         U  = vec_clip_s16 (U);
 923         V  = vec_clip_s16 (V);
 924
 925         /* now we have
 926            Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
 927            U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
 928
 929            Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
 930            U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
 931            V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
 932         */
 933
 934         U0 = vec_mergeh (U,U);
 935         V0 = vec_mergeh (V,V);
 936
 937         U1 = vec_mergel (U,U);
 938         V1 = vec_mergel (V,V);
 939
 940         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
 941         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
 942
 943         R  = vec_packclp (R0,R1);
 944         G  = vec_packclp (G0,G1);
 945         B  = vec_packclp (B0,B1);
 946
 947         nout = (vector unsigned char *)scratch;
 948         switch(c->dstFormat) {
 949             case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
 950             case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
 951             case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
 952             case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
 953             case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
 954             case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
 955             default:
 956                 /* Unreachable, I think. */
 957                 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
 958                        sws_format_name(c->dstFormat));
 959                 return;
 960         }
 961
 962         memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
 963     }
 964
 965 }