synced with r21612
[mplayer/greg.git] / libswscale / yuv2rgb_altivec.c
blobca0680a4985b2ab6b0cfebc31ec3d2394d03b77e
1 /*
2 marc.hoffman@analog.com March 8, 2004
4 Altivec Acceleration for Color Space Conversion revision 0.2
6 convert I420 YV12 to RGB in various formats,
7 it rejects images that are not in 420 formats
8 it rejects images that don't have widths of multiples of 16
9 it rejects images that don't have heights of multiples of 2
10 reject defers to C simulation codes.
12 lots of optimizations to be done here
14 1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15 so we currently use max min to clip
17 2. the inefficient use of chroma loading needs a bit of brushing up
19 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
22 MODIFIED to calculate coeffs from currently selected color space.
23 MODIFIED core to be a macro which you spec the output format.
24 ADDED UYVY conversion which is never called due to some thing in SWSCALE.
25 CORRECTED algorithm selection to be strict on input formats.
26 ADDED runtime detection of altivec.
28 ADDED altivec_yuv2packedX vertical scl + RGB converter
30 March 27,2004
31 PERFORMANCE ANALYSIS
33 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
34 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
36 720*480*30 ~10MPS
38 so we have roughly 10clocks per pixel this is too high something has to be wrong.
40 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
42 OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43 guaranteed to have the input video frame it was just decompressed so
44 it probably resides in L1 caches. However we are creating the
45 output video stream this needs to use the DSTST instruction to
46 optimize for the cache. We couple this with the fact that we are
47 not going to be visiting the input buffer again so we mark it Least
48 Recently Used. This shaves 25% of the processor cycles off.
50 Now MEMCPY is the largest mips consumer in the system, probably due
51 to the inefficient X11 stuff.
53 GL libraries seem to be very slow on this machine 1.33Ghz PB running
54 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
55 a versioning issues, however i have libGL.1.2.dylib for both
56 machines. ((We need to figure this out now))
58 GL2 libraries work now with patch for RGB32
60 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
62 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
66 * This file is part of FFmpeg.
68 * FFmpeg is free software; you can redistribute it and/or modify
69 * it under the terms of the GNU General Public License as published by
70 * the Free Software Foundation; either version 2 of the License, or
71 * (at your option) any later version.
73 * FFmpeg is distributed in the hope that it will be useful,
74 * but WITHOUT ANY WARRANTY; without even the implied warranty of
75 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
76 * GNU General Public License for more details.
78 * You should have received a copy of the GNU General Public License
79 * along with FFmpeg; if not, write to the Free Software
80 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
83 #include <stdio.h>
84 #include <stdlib.h>
85 #include <string.h>
86 #include <inttypes.h>
87 #include <assert.h>
88 #include "config.h"
89 #ifdef HAVE_MALLOC_H
90 #include <malloc.h>
91 #endif
92 #include "rgb2rgb.h"
93 #include "swscale.h"
94 #include "swscale_internal.h"
96 #undef PROFILE_THE_BEAST
97 #undef INC_SCALING
99 typedef unsigned char ubyte;
100 typedef signed char sbyte;
103 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
104 homogeneous vector registers x0,x1,x2 are interleaved with the
105 following technique:
107 o0 = vec_mergeh (x0,x1);
108 o1 = vec_perm (o0, x2, perm_rgb_0);
109 o2 = vec_perm (o0, x2, perm_rgb_1);
110 o3 = vec_mergel (x0,x1);
111 o4 = vec_perm (o3,o2,perm_rgb_2);
112 o5 = vec_perm (o3,o2,perm_rgb_3);
114 perm_rgb_0: o0(RG).h v1(B) --> o1*
115 0 1 2 3 4
116 rgbr|gbrg|brgb|rgbr
117 0010 0100 1001 0010
118 0102 3145 2673 894A
120 perm_rgb_1: o0(RG).h v1(B) --> o2
121 0 1 2 3 4
122 gbrg|brgb|bbbb|bbbb
123 0100 1001 1111 1111
124 B5CD 6EF7 89AB CDEF
126 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
127 0 1 2 3 4
128 gbrg|brgb|rgbr|gbrg
129 1111 1111 0010 0100
130 89AB CDEF 0182 3945
132 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
133 0 1 2 3 4
134 brgb|rgbr|gbrg|brgb
135 1001 0010 0100 1001
136 a67b 89cA BdCD eEFf
139 static
140 const vector unsigned char
141 perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
142 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
143 perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
144 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
145 perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
146 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
147 perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
148 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
/* Interleave three planar channel vectors into three packed output vectors
   (y0,y1,y2).  Note the parameter order: the channel passed FIRST is named
   x2 and ends up third in each output triple, the channel passed third (x0)
   ends up first. */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                      \
do {                                                       \
    typeof(x0) m3_pair_hi, m3_spill, m3_pair_lo;           \
    /* first 8 (x0,x1) byte pairs */                       \
    m3_pair_hi = vec_mergeh (x0,x1);                       \
    y0         = vec_perm (m3_pair_hi, x2, perm_rgb_0);    \
    /* leftovers of the first half plus the x2 tail */     \
    m3_spill   = vec_perm (m3_pair_hi, x2, perm_rgb_1);    \
    /* last 8 (x0,x1) byte pairs */                        \
    m3_pair_lo = vec_mergel (x0,x1);                       \
    y1         = vec_perm (m3_pair_lo, m3_spill, perm_rgb_2); \
    y2         = vec_perm (m3_pair_lo, m3_spill, perm_rgb_3); \
} while(0)
/* Interleave three planar channel vectors and store them as 48 bytes of
   packed 24-bit pixels at ptr, advancing ptr by three vectors.
   ptr must be an lvalue of vector pointer type; it is modified.
   The expansion deliberately has no trailing semicolon so the macro behaves
   as a single statement (safe in unbraced if/else). */
#define vec_mstbgr24(x0,x1,x2,ptr)        \
do {                                      \
    typeof(x0) _0,_1,_2;                  \
    vec_merge3 (x0,x1,x2,_0,_1,_2);       \
    vec_st (_0, 0, ptr++);                \
    vec_st (_1, 0, ptr++);                \
    vec_st (_2, 0, ptr++);                \
} while (0)
/* Same as the BGR24 store but with the first and third channel swapped
   before interleaving.  Stores 48 bytes at ptr and advances ptr by three
   vectors; ptr must be a modifiable vector pointer lvalue.
   No trailing semicolon: the macro expands to exactly one statement. */
#define vec_mstrgb24(x0,x1,x2,ptr)        \
do {                                      \
    typeof(x0) _0,_1,_2;                  \
    vec_merge3 (x2,x1,x0,_0,_1,_2);       \
    vec_st (_0, 0, ptr++);                \
    vec_st (_1, 0, ptr++);                \
    vec_st (_2, 0, ptr++);                \
} while (0)
179 /* pack the pixels in rgb0 format
180 msb R
181 lsb 0
/* Interleave four planar byte vectors (x0 first in memory, x3 last) into
   16 packed 32-bit pixels and store them as four vectors at ptr, then
   advance ptr by four vectors.  T is the vector type of the inputs.
   ptr must be a modifiable vector pointer lvalue.
   No trailing semicolon: the macro expands to exactly one statement. */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    /* pixels 0..7: byte pairs (x0,x1) and (x2,x3), then merged as shorts */  \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr);                                              \
    vec_st (_3, 1*16, (T *)ptr);                                              \
    /* pixels 8..15 */                                                        \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr);                                              \
    vec_st (_3, 3*16, (T *)ptr);                                              \
    ptr += 4;                                                                 \
} while (0)
203 | 1 0 1.4021 | | Y |
204 | 1 -0.3441 -0.7142 |x| Cb|
205 | 1 1.7718 0 | | Cr|
208 Y: [-128 127]
209 Cb/Cr : [-128 127]
211 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
/* Zero-extend the first 8 bytes of x to 8 16-bit lanes: each output lane is
   a zero byte (selector 0x10, taken from the all-zero second operand)
   followed by one source byte. */
#define vec_unh(x) \
  ((vector signed short) \
    vec_perm((x),(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                       0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))
/* Same as vec_unh for the last 8 bytes of x. */
#define vec_unl(x) \
  ((vector signed short) \
    vec_perm((x),(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                       0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))

/* Clamp every signed 16-bit lane of x to the range [16, 235]. */
#define vec_clip_s16(x) \
  (vec_max (vec_min ((x), (vector signed short)AVV(235,235,235,235,235,235,235,235)),\
                          (vector signed short)AVV(16, 16, 16, 16, 16, 16, 16, 16 )))

/* Pack two vectors of signed 16-bit lanes into one vector of 16 bytes:
   negative lanes are raised to 0 with vec_max, then vec_packs narrows the
   lanes (reinterpreted as unsigned) with saturation. */
#define vec_packclp(x,y) \
  ((vector unsigned char)vec_packs \
    ((vector unsigned short)vec_max ((x),(vector signed short) AVV(0)), \
     (vector unsigned short)vec_max ((y),(vector signed short) AVV(0))))
238 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
241 static inline void cvtyuvtoRGB (SwsContext *c,
242 vector signed short Y, vector signed short U, vector signed short V,
243 vector signed short *R, vector signed short *G, vector signed short *B)
245 vector signed short vx,ux,uvx;
247 Y = vec_mradds (Y, c->CY, c->OY);
248 U = vec_sub (U,(vector signed short)
249 vec_splat((vector signed short)AVV(128),0));
250 V = vec_sub (V,(vector signed short)
251 vec_splat((vector signed short)AVV(128),0));
253 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
254 ux = vec_sl (U, c->CSHIFT);
255 *B = vec_mradds (ux, c->CBU, Y);
257 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
258 vx = vec_sl (V, c->CSHIFT);
259 *R = vec_mradds (vx, c->CRV, Y);
261 // uvx = ((CGU*u) + (CGV*v))>>15;
262 uvx = vec_mradds (U, c->CGU, Y);
263 *G = vec_mradds (V, c->CGV, uvx);
268 ------------------------------------------------------------------------------
269 CS converters
270 ------------------------------------------------------------------------------
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *
 *   static int altivec_<name>(SwsContext *c, unsigned char **in,
 *                             int *instrides, int srcSliceY, int srcSliceH,
 *                             unsigned char **oplanes, int *outstrides)
 *
 * which converts a slice of planar YUV (in[0]=Y, in[1]=U, in[2]=V, one chroma
 * sample per 2x2 luma block) to packed RGB written to oplanes[0].
 * out_pixels(R,G,B,ptr) stores 16 pixels at ptr and advances ptr.
 * Two luma rows are processed per outer iteration so each chroma row is
 * loaded once; 16 pixels per inner iteration (c->srcW is used in units of 16,
 * srcSliceH in units of 2).  Returns srcSliceH.
 *
 * Output rows are addressed from outstrides[0] on every row pair, so padded
 * destination rows are handled; previously the pointers were advanced by the
 * pixels written plus one stride, which is only correct for tightly packed
 * rows.  vec_st is used for the stores (see out_pixels), so rows are expected
 * to start on 16-byte boundaries.
 */
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
                           unsigned char **in, int *instrides,             \
                           int srcSliceY, int srcSliceH,                   \
                           unsigned char **oplanes, int *outstrides)       \
{                                                                          \
    int w = c->srcW;                                                       \
    int h = srcSliceH;                                                     \
    int i,j;                                                               \
    int instrides_scl[3];                                                  \
    vector unsigned char y0,y1;                                            \
                                                                           \
    vector signed char  u,v;                                               \
                                                                           \
    vector signed short Y0,Y1,Y2,Y3;                                       \
    vector signed short U,V;                                               \
    vector signed short vx,ux,uvx;                                         \
    vector signed short vx0,ux0,uvx0;                                      \
    vector signed short vx1,ux1,uvx1;                                      \
    vector signed short R0,G0,B0;                                          \
    vector signed short R1,G1,B1;                                          \
    vector unsigned char R,G,B;                                            \
                                                                           \
    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                     \
    vector unsigned char align_perm;                                       \
                                                                           \
    vector signed short                                                    \
        lCY  = c->CY,                                                      \
        lOY  = c->OY,                                                      \
        lCRV = c->CRV,                                                     \
        lCBU = c->CBU,                                                     \
        lCGU = c->CGU,                                                     \
        lCGV = c->CGV;                                                     \
                                                                           \
    vector unsigned short lCSHIFT = c->CSHIFT;                             \
                                                                           \
    ubyte *y1i   = in[0];                                                  \
    ubyte *y2i   = in[0]+instrides[0];                                     \
    ubyte *ui    = in[1];                                                  \
    ubyte *vi    = in[2];                                                  \
                                                                           \
    /* first output row of this slice */                                   \
    ubyte *obase = oplanes[0]+srcSliceY*outstrides[0];                     \
    vector unsigned char *oute, *outo;                                     \
                                                                           \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */   \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */   \
                                                                           \
    for (i=0;i<h/2;i++) {                                                  \
        /* even and odd output row of this pair */                         \
        oute = (vector unsigned char *)(obase+(2*i  )*outstrides[0]);      \
        outo = (vector unsigned char *)(obase+(2*i+1)*outstrides[0]);      \
                                                                           \
        /* data-stream hints for the two rows about to be written */       \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);             \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);             \
                                                                           \
        for (j=0;j<w/16;j++) {                                             \
                                                                           \
            y1ivP = (vector unsigned char *)y1i;                           \
            y2ivP = (vector unsigned char *)y2i;                           \
            uivP  = (vector unsigned char *)ui;                            \
            vivP  = (vector unsigned char *)vi;                            \
                                                                           \
            /* unaligned loads: two vectors combined with an lvsl mask */  \
            align_perm = vec_lvsl (0, y1i);                                \
            y0 = (vector unsigned char)vec_perm (y1ivP[0], y1ivP[1], align_perm);\
                                                                           \
            align_perm = vec_lvsl (0, y2i);                                \
            y1 = (vector unsigned char)vec_perm (y2ivP[0], y2ivP[1], align_perm);\
                                                                           \
            align_perm = vec_lvsl (0, ui);                                 \
            u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); \
                                                                           \
            align_perm = vec_lvsl (0, vi);                                 \
            v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); \
                                                                           \
            /* remove the chroma bias, then sign-extend the 8 samples */   \
            u = (vector signed char)                                       \
                vec_sub (u,(vector signed char)                            \
                         vec_splat((vector signed char)AVV(128),0));       \
            v = (vector signed char)                                       \
                vec_sub (v,(vector signed char)                            \
                         vec_splat((vector signed char)AVV(128),0));       \
                                                                           \
            U = vec_unpackh (u);                                           \
            V = vec_unpackh (v);                                           \
                                                                           \
            /* zero-extend luma: Y0/Y1 = even row, Y2/Y3 = odd row */      \
            Y0 = vec_unh (y0);                                             \
            Y1 = vec_unl (y0);                                             \
            Y2 = vec_unh (y1);                                             \
            Y3 = vec_unl (y1);                                             \
                                                                           \
            Y0 = vec_mradds (Y0, lCY, lOY);                                \
            Y1 = vec_mradds (Y1, lCY, lOY);                                \
            Y2 = vec_mradds (Y2, lCY, lOY);                                \
            Y3 = vec_mradds (Y3, lCY, lOY);                                \
                                                                           \
            /* ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                       \
            ux = vec_sl (U, lCSHIFT);                                      \
            ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));       \
            ux0 = vec_mergeh (ux,ux);                                      \
            ux1 = vec_mergel (ux,ux);                                      \
                                                                           \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15; */                      \
            vx = vec_sl (V, lCSHIFT);                                      \
            vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));       \
            vx0 = vec_mergeh (vx,vx);                                      \
            vx1 = vec_mergel (vx,vx);                                      \
                                                                           \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                            \
            uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));       \
            uvx = vec_mradds (V, lCGV, uvx);                               \
            uvx0 = vec_mergeh (uvx,uvx);                                   \
            uvx1 = vec_mergel (uvx,uvx);                                   \
                                                                           \
            /* even row */                                                 \
            R0 = vec_add (Y0,vx0);                                         \
            G0 = vec_add (Y0,uvx0);                                        \
            B0 = vec_add (Y0,ux0);                                         \
            R1 = vec_add (Y1,vx1);                                         \
            G1 = vec_add (Y1,uvx1);                                        \
            B1 = vec_add (Y1,ux1);                                         \
                                                                           \
            R  = vec_packclp (R0,R1);                                      \
            G  = vec_packclp (G0,G1);                                      \
            B  = vec_packclp (B0,B1);                                      \
                                                                           \
            out_pixels(R,G,B,oute);                                        \
                                                                           \
            /* odd row, same chroma */                                     \
            R0 = vec_add (Y2,vx0);                                         \
            G0 = vec_add (Y2,uvx0);                                        \
            B0 = vec_add (Y2,ux0);                                         \
            R1 = vec_add (Y3,vx1);                                         \
            G1 = vec_add (Y3,uvx1);                                        \
            B1 = vec_add (Y3,ux1);                                         \
            R  = vec_packclp (R0,R1);                                      \
            G  = vec_packclp (G0,G1);                                      \
            B  = vec_packclp (B0,B1);                                      \
                                                                           \
            out_pixels(R,G,B,outo);                                        \
                                                                           \
            y1i  += 16;                                                    \
            y2i  += 16;                                                    \
            ui   += 8;                                                     \
            vi   += 8;                                                     \
        }                                                                  \
                                                                           \
        ui    += instrides_scl[1];                                         \
        vi    += instrides_scl[2];                                         \
        y1i   += instrides_scl[0];                                         \
        y2i   += instrides_scl[0];                                         \
    }                                                                      \
    return srcSliceH;                                                      \
}
/* Store adaptors: each takes the packed R, G and B byte vectors plus an
   output pointer and forwards them, in the byte order named by the macro, to
   the matching store macro.  The 32-bit variants fill the fourth byte of
   every pixel with zero. */
#define out_abgr(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),b,g,r,ptr)
#define out_bgra(r,g,b,ptr)  vec_mstrgb32(typeof(r),b,g,r,((typeof (r))AVV(0)),ptr)
#define out_rgba(r,g,b,ptr)  vec_mstrgb32(typeof(r),r,g,b,((typeof (r))AVV(0)),ptr)
#define out_argb(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),r,g,b,ptr)
#define out_rgb24(r,g,b,ptr) vec_mstrgb24(r,g,b,ptr)
#define out_bgr24(r,g,b,ptr) vec_mstbgr24(r,g,b,ptr)
442 DEFCSP420_CVT (yuv2_abgr, out_abgr)
443 #if 1
444 DEFCSP420_CVT (yuv2_bgra, out_bgra)
445 #else
446 static int altivec_yuv2_bgra32 (SwsContext *c,
447 unsigned char **in, int *instrides,
448 int srcSliceY, int srcSliceH,
449 unsigned char **oplanes, int *outstrides)
451 int w = c->srcW;
452 int h = srcSliceH;
453 int i,j;
454 int instrides_scl[3];
455 vector unsigned char y0,y1;
457 vector signed char u,v;
459 vector signed short Y0,Y1,Y2,Y3;
460 vector signed short U,V;
461 vector signed short vx,ux,uvx;
462 vector signed short vx0,ux0,uvx0;
463 vector signed short vx1,ux1,uvx1;
464 vector signed short R0,G0,B0;
465 vector signed short R1,G1,B1;
466 vector unsigned char R,G,B;
468 vector unsigned char *uivP, *vivP;
469 vector unsigned char align_perm;
471 vector signed short
472 lCY = c->CY,
473 lOY = c->OY,
474 lCRV = c->CRV,
475 lCBU = c->CBU,
476 lCGU = c->CGU,
477 lCGV = c->CGV;
479 vector unsigned short lCSHIFT = c->CSHIFT;
481 ubyte *y1i = in[0];
482 ubyte *y2i = in[0]+w;
483 ubyte *ui = in[1];
484 ubyte *vi = in[2];
486 vector unsigned char *oute
487 = (vector unsigned char *)
488 (oplanes[0]+srcSliceY*outstrides[0]);
489 vector unsigned char *outo
490 = (vector unsigned char *)
491 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
494 instrides_scl[0] = instrides[0];
495 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */
496 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */
499 for (i=0;i<h/2;i++) {
500 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
501 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
503 for (j=0;j<w/16;j++) {
505 y0 = vec_ldl (0,y1i);
506 y1 = vec_ldl (0,y2i);
507 uivP = (vector unsigned char *)ui;
508 vivP = (vector unsigned char *)vi;
510 align_perm = vec_lvsl (0, ui);
511 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
513 align_perm = vec_lvsl (0, vi);
514 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
515 u = (vector signed char)
516 vec_sub (u,(vector signed char)
517 vec_splat((vector signed char)AVV(128),0));
519 v = (vector signed char)
520 vec_sub (v, (vector signed char)
521 vec_splat((vector signed char)AVV(128),0));
523 U = vec_unpackh (u);
524 V = vec_unpackh (v);
527 Y0 = vec_unh (y0);
528 Y1 = vec_unl (y0);
529 Y2 = vec_unh (y1);
530 Y3 = vec_unl (y1);
532 Y0 = vec_mradds (Y0, lCY, lOY);
533 Y1 = vec_mradds (Y1, lCY, lOY);
534 Y2 = vec_mradds (Y2, lCY, lOY);
535 Y3 = vec_mradds (Y3, lCY, lOY);
537 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
538 ux = vec_sl (U, lCSHIFT);
539 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
540 ux0 = vec_mergeh (ux,ux);
541 ux1 = vec_mergel (ux,ux);
543 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
544 vx = vec_sl (V, lCSHIFT);
545 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
546 vx0 = vec_mergeh (vx,vx);
547 vx1 = vec_mergel (vx,vx);
548 /* uvx = ((CGU*u) + (CGV*v))>>15 */
549 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
550 uvx = vec_mradds (V, lCGV, uvx);
551 uvx0 = vec_mergeh (uvx,uvx);
552 uvx1 = vec_mergel (uvx,uvx);
553 R0 = vec_add (Y0,vx0);
554 G0 = vec_add (Y0,uvx0);
555 B0 = vec_add (Y0,ux0);
556 R1 = vec_add (Y1,vx1);
557 G1 = vec_add (Y1,uvx1);
558 B1 = vec_add (Y1,ux1);
559 R = vec_packclp (R0,R1);
560 G = vec_packclp (G0,G1);
561 B = vec_packclp (B0,B1);
563 out_argb(R,G,B,oute);
564 R0 = vec_add (Y2,vx0);
565 G0 = vec_add (Y2,uvx0);
566 B0 = vec_add (Y2,ux0);
567 R1 = vec_add (Y3,vx1);
568 G1 = vec_add (Y3,uvx1);
569 B1 = vec_add (Y3,ux1);
570 R = vec_packclp (R0,R1);
571 G = vec_packclp (G0,G1);
572 B = vec_packclp (B0,B1);
574 out_argb(R,G,B,outo);
575 y1i += 16;
576 y2i += 16;
577 ui += 8;
578 vi += 8;
582 outo += (outstrides[0])>>4;
583 oute += (outstrides[0])>>4;
585 ui += instrides_scl[1];
586 vi += instrides_scl[2];
587 y1i += instrides_scl[0];
588 y2i += instrides_scl[0];
590 return srcSliceH;
593 #endif
596 DEFCSP420_CVT (yuv2_rgba, out_rgba)
597 DEFCSP420_CVT (yuv2_argb, out_argb)
598 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
599 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
602 // uyvy|uyvy|uyvy|uyvy
603 // 0123 4567 89ab cdef
604 static
605 const vector unsigned char
606 demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
607 0x10,0x04,0x10,0x04,
608 0x10,0x08,0x10,0x08,
609 0x10,0x0c,0x10,0x0c),
610 demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
611 0x10,0x06,0x10,0x06,
612 0x10,0x0A,0x10,0x0A,
613 0x10,0x0E,0x10,0x0E),
614 demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
615 0x10,0x05,0x10,0x07,
616 0x10,0x09,0x10,0x0B,
617 0x10,0x0D,0x10,0x0F);
620 this is so I can play live CCIR raw video
622 static int altivec_uyvy_rgb32 (SwsContext *c,
623 unsigned char **in, int *instrides,
624 int srcSliceY, int srcSliceH,
625 unsigned char **oplanes, int *outstrides)
627 int w = c->srcW;
628 int h = srcSliceH;
629 int i,j;
630 vector unsigned char uyvy;
631 vector signed short Y,U,V;
632 vector signed short R0,G0,B0,R1,G1,B1;
633 vector unsigned char R,G,B;
634 vector unsigned char *out;
635 ubyte *img;
637 img = in[0];
638 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
640 for (i=0;i<h;i++) {
641 for (j=0;j<w/16;j++) {
642 uyvy = vec_ld (0, img);
643 U = (vector signed short)
644 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
646 V = (vector signed short)
647 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
649 Y = (vector signed short)
650 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
652 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
654 uyvy = vec_ld (16, img);
655 U = (vector signed short)
656 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
658 V = (vector signed short)
659 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
661 Y = (vector signed short)
662 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
664 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
666 R = vec_packclp (R0,R1);
667 G = vec_packclp (G0,G1);
668 B = vec_packclp (B0,B1);
670 // vec_mstbgr24 (R,G,B, out);
671 out_rgba (R,G,B,out);
673 img += 32;
676 return srcSliceH;
681 /* Ok currently the acceleration routine only supports
682 inputs of widths a multiple of 16
683 and heights a multiple 2
685 So we just fall back to the C codes for this.
687 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
689 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
690 return NULL;
693 and this seems not to matter too much I tried a bunch of
694 videos with abnormal widths and mplayer crashes else where.
695 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
696 boom with X11 bad match.
699 if ((c->srcW & 0xf) != 0) return NULL;
701 switch (c->srcFormat) {
702 case PIX_FMT_YUV410P:
703 case PIX_FMT_YUV420P:
704 /*case IMGFMT_CLPL: ??? */
705 case PIX_FMT_GRAY8:
706 case PIX_FMT_NV12:
707 case PIX_FMT_NV21:
708 if ((c->srcH & 0x1) != 0)
709 return NULL;
711 switch(c->dstFormat){
712 case PIX_FMT_RGB24:
713 MSG_WARN("ALTIVEC: Color Space RGB24\n");
714 return altivec_yuv2_rgb24;
715 case PIX_FMT_BGR24:
716 MSG_WARN("ALTIVEC: Color Space BGR24\n");
717 return altivec_yuv2_bgr24;
718 case PIX_FMT_ARGB:
719 MSG_WARN("ALTIVEC: Color Space ARGB\n");
720 return altivec_yuv2_argb;
721 case PIX_FMT_ABGR:
722 MSG_WARN("ALTIVEC: Color Space ABGR\n");
723 return altivec_yuv2_abgr;
724 case PIX_FMT_RGBA:
725 MSG_WARN("ALTIVEC: Color Space RGBA\n");
726 return altivec_yuv2_rgba;
727 case PIX_FMT_BGRA:
728 MSG_WARN("ALTIVEC: Color Space BGRA\n");
729 return altivec_yuv2_bgra;
730 default: return NULL;
732 break;
734 case PIX_FMT_UYVY422:
735 switch(c->dstFormat){
736 case PIX_FMT_BGR32:
737 MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
738 return altivec_uyvy_rgb32;
739 default: return NULL;
741 break;
744 return NULL;
/**
 * Convert a 16.16 fixed-point value to a rounded, saturated 16-bit integer.
 *
 * @param f  value scaled by 65536
 * @return   (f + 0.5 ulp) >> 16 as a 16-bit two's-complement pattern;
 *           results above 0x7FFF saturate to 0x7FFF and results below
 *           -0x7FFF saturate to 0x8000.
 *
 * The intermediate is kept in 64 bits: narrowing it to int before the range
 * check let large magnitudes wrap around and escape saturation.
 */
static uint16_t roundToInt16(int64_t f){
    int64_t r;

    /* adding the rounding bias would overflow int64_t; the result saturates anyway */
    if (f > INT64_MAX - (1<<15))
        return 0x7FFF;

    r = (f + (1<<15)) >> 16;
    if      (r < -0x7FFF) return 0x8000;
    else if (r >  0x7FFF) return 0x7FFF;
    else                  return (uint16_t)r;
}
754 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
756 union {
757 signed short tmp[8] __attribute__ ((aligned(16)));
758 vector signed short vec;
759 } buf;
761 buf.tmp[0] = ( (0xffffLL) * contrast>>8 )>>9; //cy
762 buf.tmp[1] = -256*brightness; //oy
763 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
764 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
765 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
766 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
769 c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
770 c->CY = vec_splat ((vector signed short)buf.vec, 0);
771 c->OY = vec_splat ((vector signed short)buf.vec, 1);
772 c->CRV = vec_splat ((vector signed short)buf.vec, 2);
773 c->CBU = vec_splat ((vector signed short)buf.vec, 3);
774 c->CGU = vec_splat ((vector signed short)buf.vec, 4);
775 c->CGV = vec_splat ((vector signed short)buf.vec, 5);
776 #if 0
778 int i;
779 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
780 for (i=0; i<6;i++)
781 printf("%s %d ", v[i],buf.tmp[i] );
782 printf("\n");
784 #endif
785 return;
789 void
790 altivec_yuv2packedX (SwsContext *c,
791 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
792 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
793 uint8_t *dest, int dstW, int dstY)
795 int i,j;
796 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
797 vector signed short R0,G0,B0,R1,G1,B1;
799 vector unsigned char R,G,B;
800 vector unsigned char *out,*nout;
802 vector signed short RND = vec_splat_s16(1<<3);
803 vector unsigned short SCL = vec_splat_u16(4);
804 unsigned long scratch[16] __attribute__ ((aligned (16)));
806 vector signed short *YCoeffs, *CCoeffs;
808 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
809 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
811 out = (vector unsigned char *)dest;
813 for(i=0; i<dstW; i+=16){
814 Y0 = RND;
815 Y1 = RND;
816 /* extract 16 coeffs from lumSrc */
817 for(j=0; j<lumFilterSize; j++) {
818 X0 = vec_ld (0, &lumSrc[j][i]);
819 X1 = vec_ld (16, &lumSrc[j][i]);
820 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
821 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
824 U = RND;
825 V = RND;
826 /* extract 8 coeffs from U,V */
827 for(j=0; j<chrFilterSize; j++) {
828 X = vec_ld (0, &chrSrc[j][i/2]);
829 U = vec_mradds (X, CCoeffs[j], U);
830 X = vec_ld (0, &chrSrc[j][i/2+2048]);
831 V = vec_mradds (X, CCoeffs[j], V);
834 /* scale and clip signals */
835 Y0 = vec_sra (Y0, SCL);
836 Y1 = vec_sra (Y1, SCL);
837 U = vec_sra (U, SCL);
838 V = vec_sra (V, SCL);
840 Y0 = vec_clip_s16 (Y0);
841 Y1 = vec_clip_s16 (Y1);
842 U = vec_clip_s16 (U);
843 V = vec_clip_s16 (V);
845 /* now we have
846 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
847 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
849 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
850 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
851 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
854 U0 = vec_mergeh (U,U);
855 V0 = vec_mergeh (V,V);
857 U1 = vec_mergel (U,U);
858 V1 = vec_mergel (V,V);
860 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
861 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
863 R = vec_packclp (R0,R1);
864 G = vec_packclp (G0,G1);
865 B = vec_packclp (B0,B1);
867 switch(c->dstFormat) {
868 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
869 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
870 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
871 case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
872 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
873 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
874 default:
876 /* If this is reached, the caller should have called yuv2packedXinC
877 instead. */
878 static int printed_error_message;
879 if(!printed_error_message) {
880 MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
881 sws_format_name(c->dstFormat));
882 printed_error_message=1;
884 return;
889 if (i < dstW) {
890 i -= 16;
892 Y0 = RND;
893 Y1 = RND;
894 /* extract 16 coeffs from lumSrc */
895 for(j=0; j<lumFilterSize; j++) {
896 X0 = vec_ld (0, &lumSrc[j][i]);
897 X1 = vec_ld (16, &lumSrc[j][i]);
898 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
899 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
902 U = RND;
903 V = RND;
904 /* extract 8 coeffs from U,V */
905 for(j=0; j<chrFilterSize; j++) {
906 X = vec_ld (0, &chrSrc[j][i/2]);
907 U = vec_mradds (X, CCoeffs[j], U);
908 X = vec_ld (0, &chrSrc[j][i/2+2048]);
909 V = vec_mradds (X, CCoeffs[j], V);
912 /* scale and clip signals */
913 Y0 = vec_sra (Y0, SCL);
914 Y1 = vec_sra (Y1, SCL);
915 U = vec_sra (U, SCL);
916 V = vec_sra (V, SCL);
918 Y0 = vec_clip_s16 (Y0);
919 Y1 = vec_clip_s16 (Y1);
920 U = vec_clip_s16 (U);
921 V = vec_clip_s16 (V);
923 /* now we have
924 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
925 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
927 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
928 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
929 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
932 U0 = vec_mergeh (U,U);
933 V0 = vec_mergeh (V,V);
935 U1 = vec_mergel (U,U);
936 V1 = vec_mergel (V,V);
938 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
939 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
941 R = vec_packclp (R0,R1);
942 G = vec_packclp (G0,G1);
943 B = vec_packclp (B0,B1);
945 nout = (vector unsigned char *)scratch;
946 switch(c->dstFormat) {
947 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
948 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
949 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
950 case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
951 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
952 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
953 default:
954 /* Unreachable, I think. */
955 MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
956 sws_format_name(c->dstFormat));
957 return;
960 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);