Use svn.ffmpeg.org for the externals which is both more correct and more reliable.
[mplayer/glamo.git] / libswscale / yuv2rgb_altivec.c
blob0223fdd54dee1a075f0edde9656bbbbf3ef90c46
1 /*
2 * AltiVec acceleration for colorspace conversion
4 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 Convert I420 YV12 to RGB in various formats,
25 it rejects images that are not in 420 formats,
26 it rejects images that don't have widths of multiples of 16,
27 it rejects images that don't have heights of multiples of 2.
28 Reject defers to C simulation code.
30 Lots of optimizations to be done here.
32 1. Need to fix saturation code. I just couldn't get it to fly with packs
33 and adds, so we currently use max/min to clip.
35 2. The inefficient use of chroma loading needs a bit of brushing up.
37 3. Analysis of pipeline stalls needs to be done. Use shark to identify
38 pipeline stalls.
41 MODIFIED to calculate coeffs from currently selected color space.
42 MODIFIED core to be a macro where you specify the output format.
43 ADDED UYVY conversion which is never called due to some thing in swscale.
44 CORRECTED algorithm selection to be strict on input formats.
45 ADDED runtime detection of AltiVec.
47 ADDED altivec_yuv2packedX vertical scl + RGB converter
49 March 27,2004
50 PERFORMANCE ANALYSIS
52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53 used as test.
54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55 same sequence.
57 720 * 480 * 30 ~10MPS
59 so we have roughly 10 clocks per pixel. This is too high, something has
60 to be wrong.
62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63 need for vec_min.
65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66 the input video frame, it was just decompressed so it probably resides in L1
67 caches. However, we are creating the output video stream. This needs to use the
68 DSTST instruction to optimize for the cache. We couple this with the fact that
69 we are not going to be visiting the input buffer again so we mark it Least
70 Recently Used. This shaves 25% of the processor cycles off.
72 Now memcpy is the largest mips consumer in the system, probably due
73 to the inefficient X11 stuff.
75 GL libraries seem to be very slow on this machine 1.33Ghz PB running
76 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
77 a versioning issue, however I have libGL.1.2.dylib for both
78 machines. (We need to figure this out now.)
80 GL2 libraries work now with patch for RGB32.
82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
84 Integrated luma prescaling adjustment for saturation/contrast/brightness
85 adjustment.
88 #include <stdio.h>
89 #include <stdlib.h>
90 #include <string.h>
91 #include <inttypes.h>
92 #include <assert.h>
93 #include "config.h"
94 #ifdef HAVE_MALLOC_H
95 #include <malloc.h>
96 #endif
97 #include "rgb2rgb.h"
98 #include "swscale.h"
99 #include "swscale_internal.h"
/* 8-bit sample aliases used by the converters in this file. */
typedef signed char sbyte;
typedef unsigned char ubyte;

/* Make sure neither of these switches is defined from here on. */
#undef INC_SCALING
#undef PROFILE_THE_BEAST
108 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
109 homogeneous vector registers x0,x1,x2 are interleaved with the
110 following technique:
112 o0 = vec_mergeh (x0,x1);
113 o1 = vec_perm (o0, x2, perm_rgb_0);
114 o2 = vec_perm (o0, x2, perm_rgb_1);
115 o3 = vec_mergel (x0,x1);
116 o4 = vec_perm (o3,o2,perm_rgb_2);
117 o5 = vec_perm (o3,o2,perm_rgb_3);
119 perm_rgb_0: o0(RG).h v1(B) --> o1*
120 0 1 2 3 4
121 rgbr|gbrg|brgb|rgbr
122 0010 0100 1001 0010
123 0102 3145 2673 894A
125 perm_rgb_1: o0(RG).h v1(B) --> o2
126 0 1 2 3 4
127 gbrg|brgb|bbbb|bbbb
128 0100 1001 1111 1111
129 B5CD 6EF7 89AB CDEF
131 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
132 0 1 2 3 4
133 gbrg|brgb|rgbr|gbrg
134 1111 1111 0010 0100
135 89AB CDEF 0182 3945
137 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
138 0 1 2 3 4
139 brgb|rgbr|gbrg|brgb
140 1001 0010 0100 1001
141 a67b 89cA BdCD eEFf
144 static
145 const vector unsigned char
146 perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
147 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
148 perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
149 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
150 perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
151 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
152 perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
153 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
/* Interleave three 16-byte channel vectors into three output vectors of
   packed 3-byte pixels. Note the argument order: the channel that ends up
   third in each pixel is passed first, the one that ends up first is passed
   third. dst0..dst2 must be lvalues of the same vector type. */
#define vec_merge3(third,second,first,dst0,dst1,dst2) \
do { \
    __typeof__(first) m3_pairs_hi, m3_carry, m3_pairs_lo; \
    /* first/second pairs for pixels 0..7 */ \
    m3_pairs_hi = vec_mergeh (first, second); \
    dst0        = vec_perm (m3_pairs_hi, third, perm_rgb_0); \
    /* what did not fit into dst0, plus the upper half of `third` */ \
    m3_carry    = vec_perm (m3_pairs_hi, third, perm_rgb_1); \
    /* first/second pairs for pixels 8..15 */ \
    m3_pairs_lo = vec_mergel (first, second); \
    dst1        = vec_perm (m3_pairs_lo, m3_carry, perm_rgb_2); \
    dst2        = vec_perm (m3_pairs_lo, m3_carry, perm_rgb_3); \
} while(0)
/* Interleave three channel vectors with vec_merge3 (arguments passed through
   in the given order) and store the resulting 48 bytes as three consecutive
   vectors at ptr. ptr must be a modifiable vector pointer; it is advanced by
   three elements.
   The expansion deliberately does NOT end in a semicolon so that the macro
   behaves like a single statement (safe inside if/else without braces). */
#define vec_mstbgr24(x0,x1,x2,ptr) \
do { \
    __typeof__(x0) _0,_1,_2; \
    vec_merge3 (x0,x1,x2,_0,_1,_2); \
    vec_st (_0, 0, ptr++); \
    vec_st (_1, 0, ptr++); \
    vec_st (_2, 0, ptr++); \
} while (0)
/* Same as vec_mstbgr24 but hands the channels to vec_merge3 in reverse
   order, so the byte order inside each stored 3-byte pixel is reversed.
   ptr must be a modifiable vector pointer; it is advanced by three elements.
   The expansion deliberately does NOT end in a semicolon so that the macro
   behaves like a single statement (safe inside if/else without braces). */
#define vec_mstrgb24(x0,x1,x2,ptr) \
do { \
    __typeof__(x0) _0,_1,_2; \
    vec_merge3 (x2,x1,x0,_0,_1,_2); \
    vec_st (_0, 0, ptr++); \
    vec_st (_1, 0, ptr++); \
    vec_st (_2, 0, ptr++); \
} while (0)
/* pack the pixels in rgb0 format
   msb R
   lsb 0

   Interleaves four byte vectors of type T so that every output pixel is the
   byte sequence x0,x1,x2,x3, and stores the 16 resulting 4-byte pixels as
   four vectors at byte offsets 0, 16, 32 and 48 from ptr. ptr must be a
   modifiable pointer; it is advanced by 4 elements afterwards.
   The expansion deliberately does NOT end in a semicolon so that the macro
   behaves like a single statement (safe inside if/else without braces). */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
do { \
    T _0,_1,_2,_3; \
    /* pixels 0..7: byte pairs first, then pairs of pairs */ \
    _0 = vec_mergeh (x0,x1); \
    _1 = vec_mergeh (x2,x3); \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr); \
    vec_st (_3, 1*16, (T *)ptr); \
    /* pixels 8..15 */ \
    _0 = vec_mergel (x0,x1); \
    _1 = vec_mergel (x2,x3); \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr); \
    vec_st (_3, 3*16, (T *)ptr); \
    ptr += 4; \
} while (0)
208 | 1 0 1.4021 | | Y |
209 | 1 -0.3441 -0.7142 |x| Cb|
210 | 1 1.7718 0 | | Cr|
213 Y: [-128 127]
214 Cb/Cr : [-128 127]
216 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
/* Widen bytes 0..7 of x to eight 16-bit lanes. Every second permute index
   is 0x10, i.e. byte 0 of the all-zero second operand, so each lane is a
   zero byte followed by one source byte. */
#define vec_unh(x) \
    (vector signed short) \
        vec_perm(x, (__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x00, 0x10,0x01, 0x10,0x02, 0x10,0x03, \
                                         0x10,0x04, 0x10,0x05, 0x10,0x06, 0x10,0x07}))
/* Widen bytes 8..15 of x to eight 16-bit lanes. Every second permute index
   is 0x10, i.e. byte 0 of the all-zero second operand, so each lane is a
   zero byte followed by one source byte. */
#define vec_unl(x) \
    (vector signed short) \
        vec_perm(x, (__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x08, 0x10,0x09, 0x10,0x0A, 0x10,0x0B, \
                                         0x10,0x0C, 0x10,0x0D, 0x10,0x0E, 0x10,0x0F}))
/* Clamp every 16-bit lane of x to the range [16, 235]:
   first cap at 235, then raise anything below 16. */
#define vec_clip_s16(x) \
    vec_max (vec_min (x, \
                      ((vector signed short){235,235,235,235,235,235,235,235})), \
             ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
/* Pack two vectors of eight signed 16-bit lanes into one vector of sixteen
   unsigned bytes. Negative lanes are raised to 0 with vec_max before the
   lanes are reinterpreted as unsigned and narrowed with vec_packs. */
#define vec_packclp(x,y) \
    (vector unsigned char) \
        vec_packs ((vector unsigned short)vec_max (x, ((vector signed short) {0})), \
                   (vector unsigned short)vec_max (y, ((vector signed short) {0})))
243 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){0}),a,a,a,ptr)
246 static inline void cvtyuvtoRGB (SwsContext *c,
247 vector signed short Y, vector signed short U, vector signed short V,
248 vector signed short *R, vector signed short *G, vector signed short *B)
250 vector signed short vx,ux,uvx;
252 Y = vec_mradds (Y, c->CY, c->OY);
253 U = vec_sub (U,(vector signed short)
254 vec_splat((vector signed short){128},0));
255 V = vec_sub (V,(vector signed short)
256 vec_splat((vector signed short){128},0));
258 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
259 ux = vec_sl (U, c->CSHIFT);
260 *B = vec_mradds (ux, c->CBU, Y);
262 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
263 vx = vec_sl (V, c->CSHIFT);
264 *R = vec_mradds (vx, c->CRV, Y);
266 // uvx = ((CGU*u) + (CGV*v))>>15;
267 uvx = vec_mradds (U, c->CGU, Y);
268 *G = vec_mradds (V, c->CGV, uvx);
273 ------------------------------------------------------------------------------
274 CS converters
275 ------------------------------------------------------------------------------
/* DEFCSP420_CVT(name, out_pixels) defines
       static int altivec_<name>(SwsContext *c, in, instrides,
                                 srcSliceY, srcSliceH, oplanes, outstrides)
   which converts a slice of planar YUV (in[0] = luma, in[1] / in[2] = chroma
   read at half the luma rate horizontally and once per two luma rows) into
   the packed layout written by out_pixels(R,G,B,ptr).

   Two luma rows are processed per outer iteration and 16 pixels per inner
   iteration; srcSliceH/2 row pairs and srcW/16 blocks are handled, any
   remainder is ignored. Returns srcSliceH. */
#define DEFCSP420_CVT(name,out_pixels) \
static int altivec_##name (SwsContext *c, \
                           unsigned char **in, int *instrides, \
                           int srcSliceY, int srcSliceH, \
                           unsigned char **oplanes, int *outstrides) \
{ \
    const int w         = c->srcW; \
    const int row_pairs = srcSliceH/2; \
    const int blocks    = w/16; \
    int i,j; \
 \
    /* pointer fix-ups applied after each row pair */ \
    const int y_skip = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
    const int u_skip = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
    const int v_skip = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
 \
    /* control word handed to vec_dstst for both output rows */ \
    const int dst_touch = (0x02000002|(((w*3+32)/32)<<16)); \
 \
    const vector signed short zero = (vector signed short){0}; \
    const vector signed char chroma_bias = \
        (vector signed char) vec_splat((vector signed char){128},0); \
 \
    /* local copies of the conversion coefficients */ \
    const vector signed short lCY  = c->CY; \
    const vector signed short lOY  = c->OY; \
    const vector signed short lCRV = c->CRV; \
    const vector signed short lCBU = c->CBU; \
    const vector signed short lCGU = c->CGU; \
    const vector signed short lCGV = c->CGV; \
    const vector unsigned short lCSHIFT = c->CSHIFT; \
 \
    vector unsigned char y0,y1; \
    vector signed char u,v; \
    vector signed short Y0,Y1,Y2,Y3; \
    vector signed short U,V; \
    vector signed short vx,ux,uvx; \
    vector signed short vx0,ux0,uvx0; \
    vector signed short vx1,ux1,uvx1; \
    vector signed short R0,G0,B0; \
    vector signed short R1,G1,B1; \
    vector unsigned char R,G,B; \
 \
    ubyte *y1i = in[0];               /* even luma row */ \
    ubyte *y2i = in[0]+instrides[0];  /* odd luma row  */ \
    ubyte *ui  = in[1]; \
    ubyte *vi  = in[2]; \
 \
    vector unsigned char *oute \
        = (vector unsigned char *) \
          (oplanes[0]+srcSliceY*outstrides[0]); \
    vector unsigned char *outo \
        = (vector unsigned char *) \
          (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
 \
    for (i=0;i<row_pairs;i++) { \
        vec_dstst (outo, dst_touch, 0); \
        vec_dstst (oute, dst_touch, 1); \
 \
        for (j=0;j<blocks;j++) { \
            /* unaligned loads: two aligned loads combined via lvsl/perm */ \
            y0 = (vector unsigned char) \
                 vec_perm (vec_ld (0, y1i), vec_ld (16, y1i), vec_lvsl (0, y1i)); \
            y1 = (vector unsigned char) \
                 vec_perm (vec_ld (0, y2i), vec_ld (16, y2i), vec_lvsl (0, y2i)); \
            u  = (vector signed char) \
                 vec_perm (vec_ld (0, ui), vec_ld (16, ui), vec_lvsl (0, ui)); \
            v  = (vector signed char) \
                 vec_perm (vec_ld (0, vi), vec_ld (16, vi), vec_lvsl (0, vi)); \
 \
            u = (vector signed char) vec_sub (u, chroma_bias); \
            v = (vector signed char) vec_sub (v, chroma_bias); \
 \
            /* only the first 8 chroma samples are used per block */ \
            U = vec_unpackh (u); \
            V = vec_unpackh (v); \
 \
            Y0 = vec_unh (y0); \
            Y1 = vec_unl (y0); \
            Y2 = vec_unh (y1); \
            Y3 = vec_unl (y1); \
 \
            Y0 = vec_mradds (Y0, lCY, lOY); \
            Y1 = vec_mradds (Y1, lCY, lOY); \
            Y2 = vec_mradds (Y2, lCY, lOY); \
            Y3 = vec_mradds (Y3, lCY, lOY); \
 \
            /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
            ux  = vec_sl (U, lCSHIFT); \
            ux  = vec_mradds (ux, lCBU, zero); \
            ux0 = vec_mergeh (ux,ux); \
            ux1 = vec_mergel (ux,ux); \
 \
            /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
            vx  = vec_sl (V, lCSHIFT); \
            vx  = vec_mradds (vx, lCRV, zero); \
            vx0 = vec_mergeh (vx,vx); \
            vx1 = vec_mergel (vx,vx); \
 \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
            uvx  = vec_mradds (U, lCGU, zero); \
            uvx  = vec_mradds (V, lCGV, uvx); \
            uvx0 = vec_mergeh (uvx,uvx); \
            uvx1 = vec_mergel (uvx,uvx); \
 \
            /* even row */ \
            R0 = vec_add (Y0,vx0); \
            G0 = vec_add (Y0,uvx0); \
            B0 = vec_add (Y0,ux0); \
            R1 = vec_add (Y1,vx1); \
            G1 = vec_add (Y1,uvx1); \
            B1 = vec_add (Y1,ux1); \
 \
            R = vec_packclp (R0,R1); \
            G = vec_packclp (G0,G1); \
            B = vec_packclp (B0,B1); \
 \
            out_pixels(R,G,B,oute); \
 \
            /* odd row reuses the same chroma terms */ \
            R0 = vec_add (Y2,vx0); \
            G0 = vec_add (Y2,uvx0); \
            B0 = vec_add (Y2,ux0); \
            R1 = vec_add (Y3,vx1); \
            G1 = vec_add (Y3,uvx1); \
            B1 = vec_add (Y3,ux1); \
 \
            R = vec_packclp (R0,R1); \
            G = vec_packclp (G0,G1); \
            B = vec_packclp (B0,B1); \
 \
            out_pixels(R,G,B,outo); \
 \
            y1i += 16; \
            y2i += 16; \
            ui  += 8; \
            vi  += 8; \
        } \
 \
        /* out_pixels already advanced both pointers across one row */ \
        outo += (outstrides[0])>>4; \
        oute += (outstrides[0])>>4; \
 \
        ui  += u_skip; \
        vi  += v_skip; \
        y1i += y_skip; \
        y2i += y_skip; \
    } \
    return srcSliceH; \
}
/* Store helpers: write 16 pixels from the byte vectors r, g, b at ptr and
   advance ptr. The 32-bit variants fill the fourth byte of every pixel with
   zero; the name gives the byte order in memory. */
#define out_argb(r,g,b,ptr)  vec_mstrgb32(__typeof__(r),((__typeof__ (r)){0}),r,g,b,ptr)
#define out_abgr(r,g,b,ptr)  vec_mstrgb32(__typeof__(r),((__typeof__ (r)){0}),b,g,r,ptr)
#define out_rgba(r,g,b,ptr)  vec_mstrgb32(__typeof__(r),r,g,b,((__typeof__ (r)){0}),ptr)
#define out_bgra(r,g,b,ptr)  vec_mstrgb32(__typeof__(r),b,g,r,((__typeof__ (r)){0}),ptr)
#define out_rgb24(r,g,b,ptr) vec_mstrgb24(r,g,b,ptr)
#define out_bgr24(r,g,b,ptr) vec_mstbgr24(r,g,b,ptr)
451 DEFCSP420_CVT (yuv2_abgr, out_abgr)
452 #if 1
453 DEFCSP420_CVT (yuv2_bgra, out_bgra)
454 #else
455 static int altivec_yuv2_bgra32 (SwsContext *c,
456 unsigned char **in, int *instrides,
457 int srcSliceY, int srcSliceH,
458 unsigned char **oplanes, int *outstrides)
460 int w = c->srcW;
461 int h = srcSliceH;
462 int i,j;
463 int instrides_scl[3];
464 vector unsigned char y0,y1;
466 vector signed char u,v;
468 vector signed short Y0,Y1,Y2,Y3;
469 vector signed short U,V;
470 vector signed short vx,ux,uvx;
471 vector signed short vx0,ux0,uvx0;
472 vector signed short vx1,ux1,uvx1;
473 vector signed short R0,G0,B0;
474 vector signed short R1,G1,B1;
475 vector unsigned char R,G,B;
477 vector unsigned char *uivP, *vivP;
478 vector unsigned char align_perm;
480 vector signed short
481 lCY = c->CY,
482 lOY = c->OY,
483 lCRV = c->CRV,
484 lCBU = c->CBU,
485 lCGU = c->CGU,
486 lCGV = c->CGV;
488 vector unsigned short lCSHIFT = c->CSHIFT;
490 ubyte *y1i = in[0];
491 ubyte *y2i = in[0]+w;
492 ubyte *ui = in[1];
493 ubyte *vi = in[2];
495 vector unsigned char *oute
496 = (vector unsigned char *)
497 (oplanes[0]+srcSliceY*outstrides[0]);
498 vector unsigned char *outo
499 = (vector unsigned char *)
500 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
503 instrides_scl[0] = instrides[0];
504 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */
505 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */
508 for (i=0;i<h/2;i++) {
509 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
510 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
512 for (j=0;j<w/16;j++) {
514 y0 = vec_ldl (0,y1i);
515 y1 = vec_ldl (0,y2i);
516 uivP = (vector unsigned char *)ui;
517 vivP = (vector unsigned char *)vi;
519 align_perm = vec_lvsl (0, ui);
520 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
522 align_perm = vec_lvsl (0, vi);
523 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
524 u = (vector signed char)
525 vec_sub (u,(vector signed char)
526 vec_splat((vector signed char){128},0));
528 v = (vector signed char)
529 vec_sub (v, (vector signed char)
530 vec_splat((vector signed char){128},0));
532 U = vec_unpackh (u);
533 V = vec_unpackh (v);
536 Y0 = vec_unh (y0);
537 Y1 = vec_unl (y0);
538 Y2 = vec_unh (y1);
539 Y3 = vec_unl (y1);
541 Y0 = vec_mradds (Y0, lCY, lOY);
542 Y1 = vec_mradds (Y1, lCY, lOY);
543 Y2 = vec_mradds (Y2, lCY, lOY);
544 Y3 = vec_mradds (Y3, lCY, lOY);
546 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
547 ux = vec_sl (U, lCSHIFT);
548 ux = vec_mradds (ux, lCBU, (vector signed short){0});
549 ux0 = vec_mergeh (ux,ux);
550 ux1 = vec_mergel (ux,ux);
552 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
553 vx = vec_sl (V, lCSHIFT);
554 vx = vec_mradds (vx, lCRV, (vector signed short){0});
555 vx0 = vec_mergeh (vx,vx);
556 vx1 = vec_mergel (vx,vx);
557 /* uvx = ((CGU*u) + (CGV*v))>>15 */
558 uvx = vec_mradds (U, lCGU, (vector signed short){0});
559 uvx = vec_mradds (V, lCGV, uvx);
560 uvx0 = vec_mergeh (uvx,uvx);
561 uvx1 = vec_mergel (uvx,uvx);
562 R0 = vec_add (Y0,vx0);
563 G0 = vec_add (Y0,uvx0);
564 B0 = vec_add (Y0,ux0);
565 R1 = vec_add (Y1,vx1);
566 G1 = vec_add (Y1,uvx1);
567 B1 = vec_add (Y1,ux1);
568 R = vec_packclp (R0,R1);
569 G = vec_packclp (G0,G1);
570 B = vec_packclp (B0,B1);
572 out_argb(R,G,B,oute);
573 R0 = vec_add (Y2,vx0);
574 G0 = vec_add (Y2,uvx0);
575 B0 = vec_add (Y2,ux0);
576 R1 = vec_add (Y3,vx1);
577 G1 = vec_add (Y3,uvx1);
578 B1 = vec_add (Y3,ux1);
579 R = vec_packclp (R0,R1);
580 G = vec_packclp (G0,G1);
581 B = vec_packclp (B0,B1);
583 out_argb(R,G,B,outo);
584 y1i += 16;
585 y2i += 16;
586 ui += 8;
587 vi += 8;
591 outo += (outstrides[0])>>4;
592 oute += (outstrides[0])>>4;
594 ui += instrides_scl[1];
595 vi += instrides_scl[2];
596 y1i += instrides_scl[0];
597 y2i += instrides_scl[0];
599 return srcSliceH;
602 #endif
605 DEFCSP420_CVT (yuv2_rgba, out_rgba)
606 DEFCSP420_CVT (yuv2_argb, out_argb)
607 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
608 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
611 // uyvy|uyvy|uyvy|uyvy
612 // 0123 4567 89ab cdef
613 static
614 const vector unsigned char
615 demux_u = {0x10,0x00,0x10,0x00,
616 0x10,0x04,0x10,0x04,
617 0x10,0x08,0x10,0x08,
618 0x10,0x0c,0x10,0x0c},
619 demux_v = {0x10,0x02,0x10,0x02,
620 0x10,0x06,0x10,0x06,
621 0x10,0x0A,0x10,0x0A,
622 0x10,0x0E,0x10,0x0E},
623 demux_y = {0x10,0x01,0x10,0x03,
624 0x10,0x05,0x10,0x07,
625 0x10,0x09,0x10,0x0B,
626 0x10,0x0D,0x10,0x0F};
629 this is so I can play live CCIR raw video
631 static int altivec_uyvy_rgb32 (SwsContext *c,
632 unsigned char **in, int *instrides,
633 int srcSliceY, int srcSliceH,
634 unsigned char **oplanes, int *outstrides)
636 int w = c->srcW;
637 int h = srcSliceH;
638 int i,j;
639 vector unsigned char uyvy;
640 vector signed short Y,U,V;
641 vector signed short R0,G0,B0,R1,G1,B1;
642 vector unsigned char R,G,B;
643 vector unsigned char *out;
644 ubyte *img;
646 img = in[0];
647 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
649 for (i=0;i<h;i++) {
650 for (j=0;j<w/16;j++) {
651 uyvy = vec_ld (0, img);
652 U = (vector signed short)
653 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
655 V = (vector signed short)
656 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
658 Y = (vector signed short)
659 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
661 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
663 uyvy = vec_ld (16, img);
664 U = (vector signed short)
665 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
667 V = (vector signed short)
668 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
670 Y = (vector signed short)
671 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
673 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
675 R = vec_packclp (R0,R1);
676 G = vec_packclp (G0,G1);
677 B = vec_packclp (B0,B1);
679 // vec_mstbgr24 (R,G,B, out);
680 out_rgba (R,G,B,out);
682 img += 32;
685 return srcSliceH;
690 /* Ok currently the acceleration routine only supports
691 inputs of widths a multiple of 16
692 and heights a multiple 2
694 So we just fall back to the C codes for this.
696 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
698 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
699 return NULL;
702 and this seems not to matter too much I tried a bunch of
703 videos with abnormal widths and MPlayer crashes elsewhere.
704 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
705 boom with X11 bad match.
708 if ((c->srcW & 0xf) != 0) return NULL;
710 switch (c->srcFormat) {
711 case PIX_FMT_YUV410P:
712 case PIX_FMT_YUV420P:
713 /*case IMGFMT_CLPL: ??? */
714 case PIX_FMT_GRAY8:
715 case PIX_FMT_NV12:
716 case PIX_FMT_NV21:
717 if ((c->srcH & 0x1) != 0)
718 return NULL;
720 switch(c->dstFormat){
721 case PIX_FMT_RGB24:
722 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
723 return altivec_yuv2_rgb24;
724 case PIX_FMT_BGR24:
725 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
726 return altivec_yuv2_bgr24;
727 case PIX_FMT_ARGB:
728 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
729 return altivec_yuv2_argb;
730 case PIX_FMT_ABGR:
731 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
732 return altivec_yuv2_abgr;
733 case PIX_FMT_RGBA:
734 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
735 return altivec_yuv2_rgba;
736 case PIX_FMT_BGRA:
737 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
738 return altivec_yuv2_bgra;
739 default: return NULL;
741 break;
743 case PIX_FMT_UYVY422:
744 switch(c->dstFormat){
745 case PIX_FMT_BGR32:
746 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
747 return altivec_uyvy_rgb32;
748 default: return NULL;
750 break;
753 return NULL;
756 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
758 union {
759 signed short tmp[8] __attribute__ ((aligned(16)));
760 vector signed short vec;
761 } buf;
763 buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy
764 buf.tmp[1] = -256*brightness; //oy
765 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
766 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
767 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
768 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
771 c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
772 c->CY = vec_splat ((vector signed short)buf.vec, 0);
773 c->OY = vec_splat ((vector signed short)buf.vec, 1);
774 c->CRV = vec_splat ((vector signed short)buf.vec, 2);
775 c->CBU = vec_splat ((vector signed short)buf.vec, 3);
776 c->CGU = vec_splat ((vector signed short)buf.vec, 4);
777 c->CGV = vec_splat ((vector signed short)buf.vec, 5);
778 #if 0
780 int i;
781 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
782 for (i=0; i<6; i++)
783 printf("%s %d ", v[i],buf.tmp[i] );
784 printf("\n");
786 #endif
787 return;
791 void
792 altivec_yuv2packedX (SwsContext *c,
793 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
794 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
795 uint8_t *dest, int dstW, int dstY)
797 int i,j;
798 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
799 vector signed short R0,G0,B0,R1,G1,B1;
801 vector unsigned char R,G,B;
802 vector unsigned char *out,*nout;
804 vector signed short RND = vec_splat_s16(1<<3);
805 vector unsigned short SCL = vec_splat_u16(4);
806 unsigned long scratch[16] __attribute__ ((aligned (16)));
808 vector signed short *YCoeffs, *CCoeffs;
810 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
811 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
813 out = (vector unsigned char *)dest;
815 for (i=0; i<dstW; i+=16){
816 Y0 = RND;
817 Y1 = RND;
818 /* extract 16 coeffs from lumSrc */
819 for (j=0; j<lumFilterSize; j++) {
820 X0 = vec_ld (0, &lumSrc[j][i]);
821 X1 = vec_ld (16, &lumSrc[j][i]);
822 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
823 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
826 U = RND;
827 V = RND;
828 /* extract 8 coeffs from U,V */
829 for (j=0; j<chrFilterSize; j++) {
830 X = vec_ld (0, &chrSrc[j][i/2]);
831 U = vec_mradds (X, CCoeffs[j], U);
832 X = vec_ld (0, &chrSrc[j][i/2+2048]);
833 V = vec_mradds (X, CCoeffs[j], V);
836 /* scale and clip signals */
837 Y0 = vec_sra (Y0, SCL);
838 Y1 = vec_sra (Y1, SCL);
839 U = vec_sra (U, SCL);
840 V = vec_sra (V, SCL);
842 Y0 = vec_clip_s16 (Y0);
843 Y1 = vec_clip_s16 (Y1);
844 U = vec_clip_s16 (U);
845 V = vec_clip_s16 (V);
847 /* now we have
848 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
849 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
851 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
852 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
853 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
856 U0 = vec_mergeh (U,U);
857 V0 = vec_mergeh (V,V);
859 U1 = vec_mergel (U,U);
860 V1 = vec_mergel (V,V);
862 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
863 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
865 R = vec_packclp (R0,R1);
866 G = vec_packclp (G0,G1);
867 B = vec_packclp (B0,B1);
869 switch(c->dstFormat) {
870 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
871 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
872 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
873 case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
874 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
875 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
876 default:
878 /* If this is reached, the caller should have called yuv2packedXinC
879 instead. */
880 static int printed_error_message;
881 if (!printed_error_message) {
882 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
883 sws_format_name(c->dstFormat));
884 printed_error_message=1;
886 return;
891 if (i < dstW) {
892 i -= 16;
894 Y0 = RND;
895 Y1 = RND;
896 /* extract 16 coeffs from lumSrc */
897 for (j=0; j<lumFilterSize; j++) {
898 X0 = vec_ld (0, &lumSrc[j][i]);
899 X1 = vec_ld (16, &lumSrc[j][i]);
900 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
901 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
904 U = RND;
905 V = RND;
906 /* extract 8 coeffs from U,V */
907 for (j=0; j<chrFilterSize; j++) {
908 X = vec_ld (0, &chrSrc[j][i/2]);
909 U = vec_mradds (X, CCoeffs[j], U);
910 X = vec_ld (0, &chrSrc[j][i/2+2048]);
911 V = vec_mradds (X, CCoeffs[j], V);
914 /* scale and clip signals */
915 Y0 = vec_sra (Y0, SCL);
916 Y1 = vec_sra (Y1, SCL);
917 U = vec_sra (U, SCL);
918 V = vec_sra (V, SCL);
920 Y0 = vec_clip_s16 (Y0);
921 Y1 = vec_clip_s16 (Y1);
922 U = vec_clip_s16 (U);
923 V = vec_clip_s16 (V);
925 /* now we have
926 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
927 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
929 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
930 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
931 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
934 U0 = vec_mergeh (U,U);
935 V0 = vec_mergeh (V,V);
937 U1 = vec_mergel (U,U);
938 V1 = vec_mergel (V,V);
940 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
941 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
943 R = vec_packclp (R0,R1);
944 G = vec_packclp (G0,G1);
945 B = vec_packclp (B0,B1);
947 nout = (vector unsigned char *)scratch;
948 switch(c->dstFormat) {
949 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
950 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
951 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
952 case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
953 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
954 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
955 default:
956 /* Unreachable, I think. */
957 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
958 sws_format_name(c->dstFormat));
959 return;
962 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);