better ao/vo profile examples
[mplayer/greg.git] / libswscale / yuv2rgb_altivec.c
blobccb3c73b92def2e435eadb19f7ead9efd63c37b8
1 /*
2 marc.hoffman@analog.com March 8, 2004
4 AltiVec acceleration for colorspace conversion revision 0.2
6 convert I420 YV12 to RGB in various formats,
7 it rejects images that are not in 420 formats
8 it rejects images that don't have widths of multiples of 16
9 it rejects images that don't have heights of multiples of 2
10 reject defers to C simulation codes.
12 lots of optimizations to be done here
14 1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15 so we currently use max min to clip
17 2. the inefficient use of chroma loading needs a bit of brushing up
19 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
22 MODIFIED to calculate coeffs from currently selected color space.
23 MODIFIED core to be a macro which you spec the output format.
24 ADDED UYVY conversion which is never called due to some thing in SWSCALE.
25 CORRECTED algorithm selection to be strict on input formats.
26 ADDED runtime detection of altivec.
28 ADDED altivec_yuv2packedX vertical scl + RGB converter
30 March 27,2004
31 PERFORMANCE ANALYSIS
33 The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
36 720*480*30 ~10MPS
38 so we have roughly 10clocks per pixel this is too high something has to be wrong.
40 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
42 OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43 guaranteed to have the input video frame it was just decompressed so
44 it probably resides in L1 caches. However we are creating the
45 output video stream this needs to use the DSTST instruction to
46 optimize for the cache. We couple this with the fact that we are
47 not going to be visiting the input buffer again so we mark it Least
48 Recently Used. This shaves 25% of the processor cycles off.
50 Now MEMCPY is the largest mips consumer in the system, probably due
51 to the inefficient X11 stuff.
53 GL libraries seem to be very slow on this machine 1.33Ghz PB running
54 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
55 a versioning issue; however, I have libGL.1.2.dylib for both
56 machines. ((We need to figure this out now))
58 GL2 libraries work now with patch for RGB32
60 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
62 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
66 * This file is part of FFmpeg.
68 * FFmpeg is free software; you can redistribute it and/or modify
69 * it under the terms of the GNU General Public License as published by
70 * the Free Software Foundation; either version 2 of the License, or
71 * (at your option) any later version.
73 * FFmpeg is distributed in the hope that it will be useful,
74 * but WITHOUT ANY WARRANTY; without even the implied warranty of
75 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
76 * GNU General Public License for more details.
78 * You should have received a copy of the GNU General Public License
79 * along with FFmpeg; if not, write to the Free Software
80 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
83 #include <stdio.h>
84 #include <stdlib.h>
85 #include <string.h>
86 #include <inttypes.h>
87 #include <assert.h>
88 #include "config.h"
89 #ifdef HAVE_MALLOC_H
90 #include <malloc.h>
91 #endif
92 #include "rgb2rgb.h"
93 #include "swscale.h"
94 #include "swscale_internal.h"
96 #undef PROFILE_THE_BEAST
97 #undef INC_SCALING
/* Short aliases for the unsigned and signed char types. */
99 typedef unsigned char ubyte;
100 typedef signed char sbyte;
103 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
104 homogeneous vector registers x0,x1,x2 are interleaved with the
105 following technique:
107 o0 = vec_mergeh (x0,x1);
108 o1 = vec_perm (o0, x2, perm_rgb_0);
109 o2 = vec_perm (o0, x2, perm_rgb_1);
110 o3 = vec_mergel (x0,x1);
111 o4 = vec_perm (o3,o2,perm_rgb_2);
112 o5 = vec_perm (o3,o2,perm_rgb_3);
114 perm_rgb_0: o0(RG).h v1(B) --> o1*
115 0 1 2 3 4
116 rgbr|gbrg|brgb|rgbr
117 0010 0100 1001 0010
118 0102 3145 2673 894A
120 perm_rgb_1: o0(RG).h v1(B) --> o2
121 0 1 2 3 4
122 gbrg|brgb|bbbb|bbbb
123 0100 1001 1111 1111
124 B5CD 6EF7 89AB CDEF
126 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
127 0 1 2 3 4
128 gbrg|brgb|rgbr|gbrg
129 1111 1111 0010 0100
130 89AB CDEF 0182 3945
132 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
133 0 1 2 3 4
134 brgb|rgbr|gbrg|brgb
135 1001 0010 0100 1001
136 a67b 89cA BdCD eEFf
/* Selector tables passed to vec_perm by vec_merge3 when interleaving three
   channel vectors into packed 3-byte pixels.  In a vec_perm selector, values
   0x00-0x0f pick a byte from the first source operand and 0x10-0x1f pick a
   byte from the second one.
   NOTE(review): AVV is defined outside this view; it is presumably a
   vector-literal helper -- confirm. */
139 static
140 const vector unsigned char
141 perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
142 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
143 perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
144 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
145 perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
146 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
147 perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
148 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
/* vec_merge3(x2,x1,x0, y0,y1,y2): interleave three input vectors into the
   three output lvalues y0, y1, y2.
   Note the parameter order: the FIRST macro argument is named x2 and the
   THIRD is named x0.  x0 and x1 are merged pairwise (vec_mergeh for the
   high half, vec_mergel for the low half) and bytes of x2 are then woven in
   with the perm_rgb_0..3 selector tables.
   The temporaries take the type of x0 through the typeof extension. */
150 #define vec_merge3(x2,x1,x0,y0,y1,y2) \
151 do { \
152 typeof(x0) o0,o2,o3; \
153 o0 = vec_mergeh (x0,x1); \
154 y0 = vec_perm (o0, x2, perm_rgb_0); \
155 o2 = vec_perm (o0, x2, perm_rgb_1); \
156 o3 = vec_mergel (x0,x1); \
157 y1 = vec_perm (o3,o2,perm_rgb_2); \
158 y2 = vec_perm (o3,o2,perm_rgb_3); \
159 } while(0)
/* vec_mstbgr24(x0,x1,x2,ptr): interleave the three channel vectors with
   vec_merge3 (arguments passed in the order x0,x1,x2) and store the three
   resulting vectors at ptr, post-incrementing ptr after each store, so ptr
   ends up advanced by three vector elements.
   ptr is evaluated (and modified) several times: pass a plain pointer
   lvalue, never an expression with side effects.
   The macro expands to a single statement without a trailing semicolon so
   that it can be used safely in if/else; the caller supplies the ';'. */
#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    typeof(x0) _0,_1,_2;                \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
} while (0)
/* vec_mstrgb24(x0,x1,x2,ptr): same as the BGR24 store macro but with the
   first and third channel swapped: vec_merge3 receives the arguments in the
   order x2,x1,x0.  Three vectors are stored at ptr and ptr is
   post-incremented after each store.
   ptr is evaluated (and modified) several times: pass a plain pointer
   lvalue, never an expression with side effects.
   The macro expands to a single statement without a trailing semicolon so
   that it can be used safely in if/else; the caller supplies the ';'. */
#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    typeof(x0) _0,_1,_2;                \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
} while (0)
179 /* pack the pixels in rgb0 format
180 msb R
181 lsb 0
/* vec_mstrgb32(T,x0,x1,x2,x3,ptr): interleave four byte vectors of type T
   into 4-byte pixels (x0 is the first byte of each pixel, x3 the last) and
   store the four resulting vectors at byte offsets 0, 16, 32 and 48 from
   ptr; ptr is then advanced by 4 elements.
   The high halves of the inputs produce the first two output vectors, the
   low halves the last two.
   ptr is evaluated several times and modified: pass a plain pointer lvalue.
   The macro expands to a single statement without a trailing semicolon so
   that it can be used safely in if/else; the caller supplies the ';'. */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr);                                              \
    vec_st (_3, 1*16, (T *)ptr);                                              \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr);                                              \
    vec_st (_3, 3*16, (T *)ptr);                                              \
    ptr += 4;                                                                 \
} while (0)
203 | 1 0 1.4021 | | Y |
204 | 1 -0.3441 -0.7142 |x| Cb|
205 | 1 1.7718 0 | | Cr|
208 Y: [-128 127]
209 Cb/Cr : [-128 127]
211 Typical YUV conversions work on Y: 0-255; this version has been optimized for JPEG decode.
/* vec_unh(x): widen bytes 0..7 of x to eight 16-bit lanes.  Each selector
   pair (0x10, n) builds one halfword from byte 0 of the second vec_perm
   operand followed by byte n of x; the result is cast to vector signed
   short.
   NOTE(review): this is a zero-extension only if AVV(0) yields an all-zero
   vector and the byte order is big-endian -- confirm. */
218 #define vec_unh(x) \
219 (vector signed short) \
220 vec_perm(x,(typeof(x))AVV(0),\
221 (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
222 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
/* vec_unl(x): same as vec_unh but for bytes 8..15 of x. */
223 #define vec_unl(x) \
224 (vector signed short) \
225 vec_perm(x,(typeof(x))AVV(0),\
226 (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
227 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
/* vec_clip_s16(x): clamp every signed 16-bit lane of x to the range
   [16, 235] (vec_min against 235 first, then vec_max against 16). */
229 #define vec_clip_s16(x) \
230 vec_max (vec_min (x, (vector signed short)AVV(235,235,235,235,235,235,235,235)),\
231 (vector signed short)AVV( 16, 16, 16, 16, 16, 16, 16, 16))
/* vec_packclp(x,y): pack two vectors of signed 16-bit lanes into one vector
   of unsigned bytes.  Negative lanes are first raised to 0 with vec_max;
   the upper bound is left to the saturating vec_packs, so no vec_min is
   needed. */
233 #define vec_packclp(x,y) \
234 (vector unsigned char)vec_packs \
235 ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \
236 (vector unsigned short)vec_max (y,(vector signed short) AVV(0)))
238 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
/*
 * Convert one vector of 16-bit Y, U and V samples to R, G and B.
 *
 * c        context holding the coefficient vectors CY, OY, CRV, CBU, CGU,
 *          CGV and the shift vector CSHIFT read below
 * Y, U, V  input samples, one per 16-bit lane; 128 is subtracted from U
 *          and V here, Y is scaled and offset with vec_mradds(Y, CY, OY)
 * R, G, B  output pointers; the results are not clipped here
 *
 * vec_mradds(a,b,c) computes ((a*b + 0x4000) >> 15) + c with saturation,
 * which is the form spelled out in the inline comments below.
 */
241 static inline void cvtyuvtoRGB (SwsContext *c,
242 vector signed short Y, vector signed short U, vector signed short V,
243 vector signed short *R, vector signed short *G, vector signed short *B)
245 vector signed short vx,ux,uvx;
247 Y = vec_mradds (Y, c->CY, c->OY);
248 U = vec_sub (U,(vector signed short)
249 vec_splat((vector signed short)AVV(128),0));
250 V = vec_sub (V,(vector signed short)
251 vec_splat((vector signed short)AVV(128),0));
253 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
254 ux = vec_sl (U, c->CSHIFT);
255 *B = vec_mradds (ux, c->CBU, Y);
257 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
258 vx = vec_sl (V, c->CSHIFT);
259 *R = vec_mradds (vx, c->CRV, Y);
261 // uvx = ((CGU*u) + (CGV*v))>>15;
/* green accumulates both chroma terms on top of Y in two chained steps */
262 uvx = vec_mradds (U, c->CGU, Y);
263 *G = vec_mradds (V, c->CGV, uvx);
268 ------------------------------------------------------------------------------
269 CS converters
270 ------------------------------------------------------------------------------
/*
 * DEFCSP420_CVT(name, out_pixels) defines a static converter
 *
 *     int altivec_<name>(SwsContext *c, unsigned char **in, int *instrides,
 *                        int srcSliceY, int srcSliceH,
 *                        unsigned char **oplanes, int *outstrides)
 *
 * that reads three input planes (in[0] = Y, in[1] = U, in[2] = V) and
 * writes packed pixels into oplanes[0], starting at row srcSliceY.
 *
 * Each inner iteration handles a block of 16 pixels on two consecutive
 * rows: 16 Y bytes are loaded from each row, and the first 8 bytes of U and
 * V are widened and duplicated horizontally (vec_mergeh/vec_mergel) so one
 * chroma sample serves a 2x2 pixel group.  The loops run h/2 row pairs and
 * w/16 blocks, so widths that are not a multiple of 16 and odd heights are
 * not fully covered.
 *
 * out_pixels(R,G,B,ptr) is the store macro selecting the output layout; it
 * is expected to advance ptr itself.
 *
 * Returns srcSliceH.
 */
274 #define DEFCSP420_CVT(name,out_pixels) \
275 static int altivec_##name (SwsContext *c, \
276 unsigned char **in, int *instrides, \
277 int srcSliceY, int srcSliceH, \
278 unsigned char **oplanes, int *outstrides) \
280 int w = c->srcW; \
281 int h = srcSliceH; \
282 int i,j; \
283 int instrides_scl[3]; \
284 vector unsigned char y0,y1; \
286 vector signed char u,v; \
288 vector signed short Y0,Y1,Y2,Y3; \
289 vector signed short U,V; \
290 vector signed short vx,ux,uvx; \
291 vector signed short vx0,ux0,uvx0; \
292 vector signed short vx1,ux1,uvx1; \
293 vector signed short R0,G0,B0; \
294 vector signed short R1,G1,B1; \
295 vector unsigned char R,G,B; \
297 vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
298 vector unsigned char align_perm; \
/* local copies of the conversion coefficients held in the context */ \
300 vector signed short \
301 lCY = c->CY, \
302 lOY = c->OY, \
303 lCRV = c->CRV, \
304 lCBU = c->CBU, \
305 lCGU = c->CGU, \
306 lCGV = c->CGV; \
308 vector unsigned short lCSHIFT = c->CSHIFT; \
/* y1i / y2i walk the even and odd luma row of the current row pair */ \
310 ubyte *y1i = in[0]; \
311 ubyte *y2i = in[0]+instrides[0]; \
312 ubyte *ui = in[1]; \
313 ubyte *vi = in[2]; \
/* oute / outo: output pointers for the even and odd destination row */ \
315 vector unsigned char *oute \
316 = (vector unsigned char *) \
317 (oplanes[0]+srcSliceY*outstrides[0]); \
318 vector unsigned char *outo \
319 = (vector unsigned char *) \
320 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
323 instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
324 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
325 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
328 for (i=0;i<h/2;i++) { \
/* data-stream touch-for-store hints for the two output rows (streams 0, 1) */ \
329 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
330 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
332 for (j=0;j<w/16;j++) { \
334 y1ivP = (vector unsigned char *)y1i; \
335 y2ivP = (vector unsigned char *)y2i; \
336 uivP = (vector unsigned char *)ui; \
337 vivP = (vector unsigned char *)vi; \
/* unaligned loads: two neighbouring vectors combined with a vec_lvsl mask */ \
339 align_perm = vec_lvsl (0, y1i); \
340 y0 = (vector unsigned char) \
341 vec_perm (y1ivP[0], y1ivP[1], align_perm); \
343 align_perm = vec_lvsl (0, y2i); \
344 y1 = (vector unsigned char) \
345 vec_perm (y2ivP[0], y2ivP[1], align_perm); \
347 align_perm = vec_lvsl (0, ui); \
348 u = (vector signed char) \
349 vec_perm (uivP[0], uivP[1], align_perm); \
351 align_perm = vec_lvsl (0, vi); \
352 v = (vector signed char) \
353 vec_perm (vivP[0], vivP[1], align_perm); \
/* subtract 128 from the chroma bytes before sign-extending them */ \
355 u = (vector signed char) \
356 vec_sub (u,(vector signed char) \
357 vec_splat((vector signed char)AVV(128),0)); \
358 v = (vector signed char) \
359 vec_sub (v,(vector signed char) \
360 vec_splat((vector signed char)AVV(128),0)); \
/* only the first 8 chroma bytes are used per block (ui/vi advance by 8) */ \
362 U = vec_unpackh (u); \
363 V = vec_unpackh (v); \
366 Y0 = vec_unh (y0); \
367 Y1 = vec_unl (y0); \
368 Y2 = vec_unh (y1); \
369 Y3 = vec_unl (y1); \
371 Y0 = vec_mradds (Y0, lCY, lOY); \
372 Y1 = vec_mradds (Y1, lCY, lOY); \
373 Y2 = vec_mradds (Y2, lCY, lOY); \
374 Y3 = vec_mradds (Y3, lCY, lOY); \
376 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
377 ux = vec_sl (U, lCSHIFT); \
378 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); \
379 ux0 = vec_mergeh (ux,ux); \
380 ux1 = vec_mergel (ux,ux); \
382 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
383 vx = vec_sl (V, lCSHIFT); \
384 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); \
385 vx0 = vec_mergeh (vx,vx); \
386 vx1 = vec_mergel (vx,vx); \
388 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
389 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); \
390 uvx = vec_mradds (V, lCGV, uvx); \
391 uvx0 = vec_mergeh (uvx,uvx); \
392 uvx1 = vec_mergel (uvx,uvx); \
/* even row: luma Y0/Y1 plus the duplicated chroma terms */ \
394 R0 = vec_add (Y0,vx0); \
395 G0 = vec_add (Y0,uvx0); \
396 B0 = vec_add (Y0,ux0); \
397 R1 = vec_add (Y1,vx1); \
398 G1 = vec_add (Y1,uvx1); \
399 B1 = vec_add (Y1,ux1); \
401 R = vec_packclp (R0,R1); \
402 G = vec_packclp (G0,G1); \
403 B = vec_packclp (B0,B1); \
405 out_pixels(R,G,B,oute); \
/* odd row: luma Y2/Y3 with the same chroma terms */ \
407 R0 = vec_add (Y2,vx0); \
408 G0 = vec_add (Y2,uvx0); \
409 B0 = vec_add (Y2,ux0); \
410 R1 = vec_add (Y3,vx1); \
411 G1 = vec_add (Y3,uvx1); \
412 B1 = vec_add (Y3,ux1); \
413 R = vec_packclp (R0,R1); \
414 G = vec_packclp (G0,G1); \
415 B = vec_packclp (B0,B1); \
418 out_pixels(R,G,B,outo); \
420 y1i += 16; \
421 y2i += 16; \
422 ui += 8; \
423 vi += 8; \
/* out_pixels already advanced both pointers by one row of pixels; adding */ \
/* one output stride (in 16-byte units) moves each to its next row pair */ \
427 outo += (outstrides[0])>>4; \
428 oute += (outstrides[0])>>4; \
430 ui += instrides_scl[1]; \
431 vi += instrides_scl[2]; \
432 y1i += instrides_scl[0]; \
433 y2i += instrides_scl[0]; \
435 return srcSliceH; \
/* Output-layout adaptors: each takes three channel vectors (a,b,c) plus a
   destination pointer and forwards them to the matching store macro.
   The 32-bit variants insert an all-zero vector as the fourth byte of each
   pixel; the macro name spells the byte order written to memory, e.g.
   out_abgr stores 0,c,b,a and out_rgba stores a,b,c,0. */
439 #define out_abgr(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr)
440 #define out_bgra(a,b,c,ptr) vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr)
441 #define out_rgba(a,b,c,ptr) vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr)
442 #define out_argb(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr)
443 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
444 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
446 DEFCSP420_CVT (yuv2_abgr, out_abgr)
447 #if 1
448 DEFCSP420_CVT (yuv2_bgra, out_bgra)
449 #else
450 static int altivec_yuv2_bgra32 (SwsContext *c,
451 unsigned char **in, int *instrides,
452 int srcSliceY, int srcSliceH,
453 unsigned char **oplanes, int *outstrides)
455 int w = c->srcW;
456 int h = srcSliceH;
457 int i,j;
458 int instrides_scl[3];
459 vector unsigned char y0,y1;
461 vector signed char u,v;
463 vector signed short Y0,Y1,Y2,Y3;
464 vector signed short U,V;
465 vector signed short vx,ux,uvx;
466 vector signed short vx0,ux0,uvx0;
467 vector signed short vx1,ux1,uvx1;
468 vector signed short R0,G0,B0;
469 vector signed short R1,G1,B1;
470 vector unsigned char R,G,B;
472 vector unsigned char *uivP, *vivP;
473 vector unsigned char align_perm;
475 vector signed short
476 lCY = c->CY,
477 lOY = c->OY,
478 lCRV = c->CRV,
479 lCBU = c->CBU,
480 lCGU = c->CGU,
481 lCGV = c->CGV;
483 vector unsigned short lCSHIFT = c->CSHIFT;
485 ubyte *y1i = in[0];
486 ubyte *y2i = in[0]+w;
487 ubyte *ui = in[1];
488 ubyte *vi = in[2];
490 vector unsigned char *oute
491 = (vector unsigned char *)
492 (oplanes[0]+srcSliceY*outstrides[0]);
493 vector unsigned char *outo
494 = (vector unsigned char *)
495 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
498 instrides_scl[0] = instrides[0];
499 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */
500 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */
503 for (i=0;i<h/2;i++) {
504 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
505 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
507 for (j=0;j<w/16;j++) {
509 y0 = vec_ldl (0,y1i);
510 y1 = vec_ldl (0,y2i);
511 uivP = (vector unsigned char *)ui;
512 vivP = (vector unsigned char *)vi;
514 align_perm = vec_lvsl (0, ui);
515 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
517 align_perm = vec_lvsl (0, vi);
518 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
519 u = (vector signed char)
520 vec_sub (u,(vector signed char)
521 vec_splat((vector signed char)AVV(128),0));
523 v = (vector signed char)
524 vec_sub (v, (vector signed char)
525 vec_splat((vector signed char)AVV(128),0));
527 U = vec_unpackh (u);
528 V = vec_unpackh (v);
531 Y0 = vec_unh (y0);
532 Y1 = vec_unl (y0);
533 Y2 = vec_unh (y1);
534 Y3 = vec_unl (y1);
536 Y0 = vec_mradds (Y0, lCY, lOY);
537 Y1 = vec_mradds (Y1, lCY, lOY);
538 Y2 = vec_mradds (Y2, lCY, lOY);
539 Y3 = vec_mradds (Y3, lCY, lOY);
541 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
542 ux = vec_sl (U, lCSHIFT);
543 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
544 ux0 = vec_mergeh (ux,ux);
545 ux1 = vec_mergel (ux,ux);
547 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
548 vx = vec_sl (V, lCSHIFT);
549 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
550 vx0 = vec_mergeh (vx,vx);
551 vx1 = vec_mergel (vx,vx);
552 /* uvx = ((CGU*u) + (CGV*v))>>15 */
553 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
554 uvx = vec_mradds (V, lCGV, uvx);
555 uvx0 = vec_mergeh (uvx,uvx);
556 uvx1 = vec_mergel (uvx,uvx);
557 R0 = vec_add (Y0,vx0);
558 G0 = vec_add (Y0,uvx0);
559 B0 = vec_add (Y0,ux0);
560 R1 = vec_add (Y1,vx1);
561 G1 = vec_add (Y1,uvx1);
562 B1 = vec_add (Y1,ux1);
563 R = vec_packclp (R0,R1);
564 G = vec_packclp (G0,G1);
565 B = vec_packclp (B0,B1);
567 out_argb(R,G,B,oute);
568 R0 = vec_add (Y2,vx0);
569 G0 = vec_add (Y2,uvx0);
570 B0 = vec_add (Y2,ux0);
571 R1 = vec_add (Y3,vx1);
572 G1 = vec_add (Y3,uvx1);
573 B1 = vec_add (Y3,ux1);
574 R = vec_packclp (R0,R1);
575 G = vec_packclp (G0,G1);
576 B = vec_packclp (B0,B1);
578 out_argb(R,G,B,outo);
579 y1i += 16;
580 y2i += 16;
581 ui += 8;
582 vi += 8;
586 outo += (outstrides[0])>>4;
587 oute += (outstrides[0])>>4;
589 ui += instrides_scl[1];
590 vi += instrides_scl[2];
591 y1i += instrides_scl[0];
592 y2i += instrides_scl[0];
594 return srcSliceH;
597 #endif
600 DEFCSP420_CVT (yuv2_rgba, out_rgba)
601 DEFCSP420_CVT (yuv2_argb, out_argb)
602 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
603 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
/* vec_perm selector tables that split one 16-byte UYVY vector into 16-bit
   lanes.  Every lane is built from selector 0x10 (byte 0 of the second
   vec_perm operand, which the caller passes as AVV(0)) followed by one
   source byte:
     demux_u  bytes 0,4,8,12, each repeated in two adjacent lanes
     demux_v  bytes 2,6,10,14, each repeated in two adjacent lanes
     demux_y  bytes 1,3,5,...,15, one per lane */
606 // uyvy|uyvy|uyvy|uyvy
607 // 0123 4567 89ab cdef
608 static
609 const vector unsigned char
610 demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
611 0x10,0x04,0x10,0x04,
612 0x10,0x08,0x10,0x08,
613 0x10,0x0c,0x10,0x0c),
614 demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
615 0x10,0x06,0x10,0x06,
616 0x10,0x0A,0x10,0x0A,
617 0x10,0x0E,0x10,0x0E),
618 demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
619 0x10,0x05,0x10,0x07,
620 0x10,0x09,0x10,0x0B,
621 0x10,0x0D,0x10,0x0F);
624 this is so I can play live CCIR raw video
/*
 * Convert a slice of packed UYVY input (in[0]) to 32-bit packed pixels
 * written with out_rgba into oplanes[0], starting at row srcSliceY.
 *
 * Each inner iteration consumes 32 input bytes (two 16-byte vectors, i.e.
 * 16 pixels), demultiplexes them with demux_u/demux_v/demux_y, converts
 * both halves with cvtyuvtoRGB and stores 16 output pixels.
 *
 * Returns srcSliceH.
 *
 * NOTE(review): instrides[] is never read and outstrides[0] is only used
 * for the starting offset; img and out simply advance contiguously, so
 * rows are presumably assumed to be tightly packed -- confirm.
 * NOTE(review): the input is read with vec_ld, so in[0] is presumably
 * expected to be 16-byte aligned -- confirm.
 * NOTE(review): the name says rgb32 and the dispatcher selects this for
 * PIX_FMT_BGR32, while the store macro used is out_rgba -- confirm the
 * intended byte order.
 */
626 static int altivec_uyvy_rgb32 (SwsContext *c,
627 unsigned char **in, int *instrides,
628 int srcSliceY, int srcSliceH,
629 unsigned char **oplanes, int *outstrides)
631 int w = c->srcW;
632 int h = srcSliceH;
633 int i,j;
634 vector unsigned char uyvy;
635 vector signed short Y,U,V;
636 vector signed short R0,G0,B0,R1,G1,B1;
637 vector unsigned char R,G,B;
638 vector unsigned char *out;
639 ubyte *img;
641 img = in[0];
642 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
644 for (i=0;i<h;i++) {
645 for (j=0;j<w/16;j++) {
/* first 8 pixels of the block */
646 uyvy = vec_ld (0, img);
647 U = (vector signed short)
648 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
650 V = (vector signed short)
651 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
653 Y = (vector signed short)
654 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
656 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
/* second 8 pixels of the block */
658 uyvy = vec_ld (16, img);
659 U = (vector signed short)
660 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
662 V = (vector signed short)
663 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
665 Y = (vector signed short)
666 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
668 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
670 R = vec_packclp (R0,R1);
671 G = vec_packclp (G0,G1);
672 B = vec_packclp (B0,B1);
674 // vec_mstbgr24 (R,G,B, out);
/* out_rgba advances out by the four vectors it stores */
675 out_rgba (R,G,B,out);
677 img += 32;
680 return srcSliceH;
685 /* Ok currently the acceleration routine only supports
686 inputs of widths a multiple of 16
687 and heights a multiple of 2
689 So we just fall back to the C codes for this.
/*
 * Select an AltiVec conversion routine for the context's source and
 * destination formats.
 *
 * Returns NULL when c->flags lacks SWS_CPU_CAPS_ALTIVEC, when c->srcW is
 * not a multiple of 16, when a planar source has an odd c->srcH, or when
 * the format pair is not handled here; the comment preceding this function
 * says the C code is used in those cases.  Otherwise returns one of the
 * altivec_yuv2_* converters, or altivec_uyvy_rgb32 for UYVY422 -> BGR32.
 *
 * The selection is announced through av_log at AV_LOG_WARNING level.
 *
 * NOTE(review): YUV410P, GRAY8, NV12 and NV21 are routed to the same
 * converters as YUV420P, although those converters read separate U and V
 * planes with one chroma sample per 2x2 pixels -- confirm that callers
 * never reach this function with those formats.
 */
691 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
693 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
694 return NULL;
697 and this seems not to matter too much I tried a bunch of
698 videos with abnormal widths and mplayer crashes else where.
699 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
700 boom with X11 bad match.
703 if ((c->srcW & 0xf) != 0) return NULL;
705 switch (c->srcFormat) {
706 case PIX_FMT_YUV410P:
707 case PIX_FMT_YUV420P:
708 /*case IMGFMT_CLPL: ??? */
709 case PIX_FMT_GRAY8:
710 case PIX_FMT_NV12:
711 case PIX_FMT_NV21:
/* the planar converters work on pairs of rows */
712 if ((c->srcH & 0x1) != 0)
713 return NULL;
715 switch(c->dstFormat){
716 case PIX_FMT_RGB24:
717 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
718 return altivec_yuv2_rgb24;
719 case PIX_FMT_BGR24:
720 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
721 return altivec_yuv2_bgr24;
722 case PIX_FMT_ARGB:
723 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
724 return altivec_yuv2_argb;
725 case PIX_FMT_ABGR:
726 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
727 return altivec_yuv2_abgr;
728 case PIX_FMT_RGBA:
729 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
730 return altivec_yuv2_rgba;
731 case PIX_FMT_BGRA:
732 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
733 return altivec_yuv2_bgra;
734 default: return NULL;
736 break;
738 case PIX_FMT_UYVY422:
739 switch(c->dstFormat){
740 case PIX_FMT_BGR32:
741 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
742 return altivec_uyvy_rgb32;
743 default: return NULL;
745 break;
748 return NULL;
/*
 * Divide f by 2^16 with rounding to nearest and saturate the result to the
 * signed 16-bit range.
 *
 * The result is returned as the 16-bit two's-complement bit pattern in a
 * uint16_t: values below -0x7FFF yield 0x8000, values above 0x7FFF yield
 * 0x7FFF, everything else is returned unchanged (negative values wrap
 * modulo 2^16).
 *
 * The intermediate is kept in 64 bits so that large inputs saturate
 * instead of being truncated to int before the range check.  The right
 * shift of a negative value relies on the arithmetic shift every supported
 * compiler provides.
 */
static uint16_t roundToInt16(int64_t f){
    int64_t r = (f + (1<<15)) >> 16;

    if (r < -0x7FFF)
        return 0x8000;
    if (r > 0x7FFF)
        return 0x7FFF;
    return (uint16_t)r;
}
/*
 * Compute the conversion coefficients from the colour-space table and the
 * brightness/contrast/saturation settings, and store them in the context
 * as vectors with the same value in every lane: c->CY, c->OY, c->CRV,
 * c->CBU, c->CGU and c->CGV.  c->CSHIFT is set to a vector of 2s.
 *
 * c           context receiving the coefficient vectors
 * inv_table   four colour-space coefficients; entries 0 and 1 feed CRV and
 *             CBU (shifted right by 3), entries 2 and 3 feed CGU and CGV
 *             (shifted right by 1, negated)
 * brightness, contrast, saturation
 *             NOTE(review): contrast and saturation are shifted right by
 *             16 before use, so they are presumably 16.16 fixed-point
 *             values -- confirm against the caller.
 *
 * The six scalars are staged in a 16-byte-aligned union so they can be
 * read back as one vector and broadcast with vec_splat; lanes 6 and 7 of
 * the staging buffer are never written or read.
 */
758 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
760 union {
761 signed short tmp[8] __attribute__ ((aligned(16)));
762 vector signed short vec;
763 } buf;
765 buf.tmp[0] = ( (0xffffLL) * contrast>>8 )>>9; //cy
766 buf.tmp[1] = -256*brightness; //oy
767 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
768 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
769 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
770 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
773 c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
774 c->CY = vec_splat ((vector signed short)buf.vec, 0);
775 c->OY = vec_splat ((vector signed short)buf.vec, 1);
776 c->CRV = vec_splat ((vector signed short)buf.vec, 2);
777 c->CBU = vec_splat ((vector signed short)buf.vec, 3);
778 c->CGU = vec_splat ((vector signed short)buf.vec, 4);
779 c->CGV = vec_splat ((vector signed short)buf.vec, 5);
/* disabled debug dump of the six staged coefficients */
780 #if 0
782 int i;
783 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
784 for (i=0; i<6; i++)
785 printf("%s %d ", v[i],buf.tmp[i] );
786 printf("\n");
788 #endif
789 return;
793 void
794 altivec_yuv2packedX (SwsContext *c,
795 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
796 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
797 uint8_t *dest, int dstW, int dstY)
799 int i,j;
800 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
801 vector signed short R0,G0,B0,R1,G1,B1;
803 vector unsigned char R,G,B;
804 vector unsigned char *out,*nout;
806 vector signed short RND = vec_splat_s16(1<<3);
807 vector unsigned short SCL = vec_splat_u16(4);
808 unsigned long scratch[16] __attribute__ ((aligned (16)));
810 vector signed short *YCoeffs, *CCoeffs;
812 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
813 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
815 out = (vector unsigned char *)dest;
817 for (i=0; i<dstW; i+=16){
818 Y0 = RND;
819 Y1 = RND;
820 /* extract 16 coeffs from lumSrc */
821 for (j=0; j<lumFilterSize; j++) {
822 X0 = vec_ld (0, &lumSrc[j][i]);
823 X1 = vec_ld (16, &lumSrc[j][i]);
824 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
825 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
828 U = RND;
829 V = RND;
830 /* extract 8 coeffs from U,V */
831 for (j=0; j<chrFilterSize; j++) {
832 X = vec_ld (0, &chrSrc[j][i/2]);
833 U = vec_mradds (X, CCoeffs[j], U);
834 X = vec_ld (0, &chrSrc[j][i/2+2048]);
835 V = vec_mradds (X, CCoeffs[j], V);
838 /* scale and clip signals */
839 Y0 = vec_sra (Y0, SCL);
840 Y1 = vec_sra (Y1, SCL);
841 U = vec_sra (U, SCL);
842 V = vec_sra (V, SCL);
844 Y0 = vec_clip_s16 (Y0);
845 Y1 = vec_clip_s16 (Y1);
846 U = vec_clip_s16 (U);
847 V = vec_clip_s16 (V);
849 /* now we have
850 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
851 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
853 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
854 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
855 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
858 U0 = vec_mergeh (U,U);
859 V0 = vec_mergeh (V,V);
861 U1 = vec_mergel (U,U);
862 V1 = vec_mergel (V,V);
864 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
865 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
867 R = vec_packclp (R0,R1);
868 G = vec_packclp (G0,G1);
869 B = vec_packclp (B0,B1);
871 switch(c->dstFormat) {
872 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
873 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
874 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
875 case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
876 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
877 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
878 default:
880 /* If this is reached, the caller should have called yuv2packedXinC
881 instead. */
882 static int printed_error_message;
883 if (!printed_error_message) {
884 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
885 sws_format_name(c->dstFormat));
886 printed_error_message=1;
888 return;
893 if (i < dstW) {
894 i -= 16;
896 Y0 = RND;
897 Y1 = RND;
898 /* extract 16 coeffs from lumSrc */
899 for (j=0; j<lumFilterSize; j++) {
900 X0 = vec_ld (0, &lumSrc[j][i]);
901 X1 = vec_ld (16, &lumSrc[j][i]);
902 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
903 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
906 U = RND;
907 V = RND;
908 /* extract 8 coeffs from U,V */
909 for (j=0; j<chrFilterSize; j++) {
910 X = vec_ld (0, &chrSrc[j][i/2]);
911 U = vec_mradds (X, CCoeffs[j], U);
912 X = vec_ld (0, &chrSrc[j][i/2+2048]);
913 V = vec_mradds (X, CCoeffs[j], V);
916 /* scale and clip signals */
917 Y0 = vec_sra (Y0, SCL);
918 Y1 = vec_sra (Y1, SCL);
919 U = vec_sra (U, SCL);
920 V = vec_sra (V, SCL);
922 Y0 = vec_clip_s16 (Y0);
923 Y1 = vec_clip_s16 (Y1);
924 U = vec_clip_s16 (U);
925 V = vec_clip_s16 (V);
927 /* now we have
928 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
929 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
931 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
932 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
933 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
936 U0 = vec_mergeh (U,U);
937 V0 = vec_mergeh (V,V);
939 U1 = vec_mergel (U,U);
940 V1 = vec_mergel (V,V);
942 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
943 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
945 R = vec_packclp (R0,R1);
946 G = vec_packclp (G0,G1);
947 B = vec_packclp (B0,B1);
949 nout = (vector unsigned char *)scratch;
950 switch(c->dstFormat) {
951 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
952 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
953 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
954 case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
955 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
956 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
957 default:
958 /* Unreachable, I think. */
959 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
960 sws_format_name(c->dstFormat));
961 return;
964 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);