2 * AltiVec acceleration for colorspace conversion
4 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 Convert I420 YV12 to RGB in various formats,
25 it rejects images that are not in 420 formats,
26 it rejects images that don't have widths of multiples of 16,
27 it rejects images that don't have heights of multiples of 2.
28 Reject defers to C simulation code.
30 Lots of optimizations to be done here.
32 1. Need to fix saturation code. I just couldn't get it to fly with packs
33 and adds, so we currently use max/min to clip.
35 2. The inefficient use of chroma loading needs a bit of brushing up.
37 3. Analysis of pipeline stalls needs to be done. Use shark to identify
41 MODIFIED to calculate coeffs from currently selected color space.
42 MODIFIED core to be a macro where you specify the output format.
43 ADDED UYVY conversion which is never called due to some thing in swscale.
44 CORRECTED algorithm selection to be strict on input formats.
45 ADDED runtime detection of AltiVec.
47 ADDED altivec_yuv2packedX vertical scl + RGB converter
52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
59 so we have roughly 10 clocks per pixel. This is too high, something has
62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66 the input video frame, it was just decompressed so it probably resides in L1
67 caches. However, we are creating the output video stream. This needs to use the
68 DSTST instruction to optimize for the cache. We couple this with the fact that
69 we are not going to be visiting the input buffer again so we mark it Least
70 Recently Used. This shaves 25% of the processor cycles off.
72 Now memcpy is the largest mips consumer in the system, probably due
73 to the inefficient X11 stuff.
75 GL libraries seem to be very slow on this machine 1.33Ghz PB running
76 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
77 a versioning issue, however I have libGL.1.2.dylib for both
78 machines. (We need to figure this out now.)
80 GL2 libraries work now with patch for RGB32.
82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
84 Integrated luma prescaling adjustment for saturation/contrast/brightness
94 #include "libswscale/rgb2rgb.h"
95 #include "libswscale/swscale.h"
96 #include "libswscale/swscale_internal.h"
98 #undef PROFILE_THE_BEAST
/* Short aliases for the 8-bit sample types used throughout this file. */
typedef unsigned char ubyte;
typedef signed char   sbyte;
105 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
106 homogeneous vector registers x0,x1,x2 are interleaved with the
109 o0 = vec_mergeh (x0,x1);
110 o1 = vec_perm (o0, x2, perm_rgb_0);
111 o2 = vec_perm (o0, x2, perm_rgb_1);
112 o3 = vec_mergel (x0,x1);
113 o4 = vec_perm (o3,o2,perm_rgb_2);
114 o5 = vec_perm (o3,o2,perm_rgb_3);
116 perm_rgb_0: o0(RG).h v1(B) --> o1*
122 perm_rgb_1: o0(RG).h v1(B) --> o2
128 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
134 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
142 const vector
unsigned char
143 perm_rgb_0
= {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
144 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
145 perm_rgb_1
= {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
146 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
147 perm_rgb_2
= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
148 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
149 perm_rgb_3
= {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
150 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
152 #define vec_merge3(x2,x1,x0,y0,y1,y2) \
154 __typeof__(x0) o0,o2,o3; \
155 o0 = vec_mergeh (x0,x1); \
156 y0 = vec_perm (o0, x2, perm_rgb_0); \
157 o2 = vec_perm (o0, x2, perm_rgb_1); \
158 o3 = vec_mergel (x0,x1); \
159 y1 = vec_perm (o3,o2,perm_rgb_2); \
160 y2 = vec_perm (o3,o2,perm_rgb_3); \
163 #define vec_mstbgr24(x0,x1,x2,ptr) \
165 __typeof__(x0) _0,_1,_2; \
166 vec_merge3 (x0,x1,x2,_0,_1,_2); \
167 vec_st (_0, 0, ptr++); \
168 vec_st (_1, 0, ptr++); \
169 vec_st (_2, 0, ptr++); \
172 #define vec_mstrgb24(x0,x1,x2,ptr) \
174 __typeof__(x0) _0,_1,_2; \
175 vec_merge3 (x2,x1,x0,_0,_1,_2); \
176 vec_st (_0, 0, ptr++); \
177 vec_st (_1, 0, ptr++); \
178 vec_st (_2, 0, ptr++); \
181 /* pack the pixels in rgb0 format
185 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
188 _0 = vec_mergeh (x0,x1); \
189 _1 = vec_mergeh (x2,x3); \
190 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
191 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
192 vec_st (_2, 0*16, (T *)ptr); \
193 vec_st (_3, 1*16, (T *)ptr); \
194 _0 = vec_mergel (x0,x1); \
195 _1 = vec_mergel (x2,x3); \
196 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
197 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
198 vec_st (_2, 2*16, (T *)ptr); \
199 vec_st (_3, 3*16, (T *)ptr); \
206 | 1 -0.3441 -0.7142 |x| Cb|
213 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
221 (vector signed short) \
222 vec_perm(x,(__typeof__(x)){0}, \
223 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
224 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
226 (vector signed short) \
227 vec_perm(x,(__typeof__(x)){0}, \
228 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
229 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
/* Clamp every signed 16-bit element of x to the range [16, 235]. */
#define vec_clip_s16(x)                                                        \
    vec_min (vec_max ((x),                                                     \
                      ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16})), \
             ((vector signed short){235,235,235,235,235,235,235,235}))
/*
 * Raise negative 16-bit elements of x and y to zero, then pack the two
 * vectors into a single vector of unsigned bytes with vec_packs.
 */
#define vec_packclp(x,y)                                                       \
    ((vector unsigned char)                                                    \
     vec_packs ((vector unsigned short)                                        \
                    vec_max ((x), ((vector signed short) {0})),                \
                (vector unsigned short)                                        \
                    vec_max ((y), ((vector signed short) {0}))))
240 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
243 static inline void cvtyuvtoRGB (SwsContext
*c
,
244 vector
signed short Y
, vector
signed short U
, vector
signed short V
,
245 vector
signed short *R
, vector
signed short *G
, vector
signed short *B
)
247 vector
signed short vx
,ux
,uvx
;
249 Y
= vec_mradds (Y
, c
->CY
, c
->OY
);
250 U
= vec_sub (U
,(vector
signed short)
251 vec_splat((vector
signed short){128},0));
252 V
= vec_sub (V
,(vector
signed short)
253 vec_splat((vector
signed short){128},0));
255 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
256 ux
= vec_sl (U
, c
->CSHIFT
);
257 *B
= vec_mradds (ux
, c
->CBU
, Y
);
259 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
260 vx
= vec_sl (V
, c
->CSHIFT
);
261 *R
= vec_mradds (vx
, c
->CRV
, Y
);
263 // uvx = ((CGU*u) + (CGV*v))>>15;
264 uvx
= vec_mradds (U
, c
->CGU
, Y
);
265 *G
= vec_mradds (V
, c
->CGV
, uvx
);
270 ------------------------------------------------------------------------------
272 ------------------------------------------------------------------------------
276 #define DEFCSP420_CVT(name,out_pixels) \
277 static int altivec_##name (SwsContext *c, \
278 unsigned char **in, int *instrides, \
279 int srcSliceY, int srcSliceH, \
280 unsigned char **oplanes, int *outstrides) \
285 int instrides_scl[3]; \
286 vector unsigned char y0,y1; \
288 vector signed char u,v; \
290 vector signed short Y0,Y1,Y2,Y3; \
291 vector signed short U,V; \
292 vector signed short vx,ux,uvx; \
293 vector signed short vx0,ux0,uvx0; \
294 vector signed short vx1,ux1,uvx1; \
295 vector signed short R0,G0,B0; \
296 vector signed short R1,G1,B1; \
297 vector unsigned char R,G,B; \
299 vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
300 vector unsigned char align_perm; \
302 vector signed short \
310 vector unsigned short lCSHIFT = c->CSHIFT; \
312 ubyte *y1i = in[0]; \
313 ubyte *y2i = in[0]+instrides[0]; \
317 vector unsigned char *oute \
318 = (vector unsigned char *) \
319 (oplanes[0]+srcSliceY*outstrides[0]); \
320 vector unsigned char *outo \
321 = (vector unsigned char *) \
322 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
325 instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
326 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
327 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
330 for (i=0;i<h/2;i++) { \
331 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
332 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
334 for (j=0;j<w/16;j++) { \
336 y1ivP = (vector unsigned char *)y1i; \
337 y2ivP = (vector unsigned char *)y2i; \
338 uivP = (vector unsigned char *)ui; \
339 vivP = (vector unsigned char *)vi; \
341 align_perm = vec_lvsl (0, y1i); \
342 y0 = (vector unsigned char) \
343 vec_perm (y1ivP[0], y1ivP[1], align_perm); \
345 align_perm = vec_lvsl (0, y2i); \
346 y1 = (vector unsigned char) \
347 vec_perm (y2ivP[0], y2ivP[1], align_perm); \
349 align_perm = vec_lvsl (0, ui); \
350 u = (vector signed char) \
351 vec_perm (uivP[0], uivP[1], align_perm); \
353 align_perm = vec_lvsl (0, vi); \
354 v = (vector signed char) \
355 vec_perm (vivP[0], vivP[1], align_perm); \
357 u = (vector signed char) \
358 vec_sub (u,(vector signed char) \
359 vec_splat((vector signed char){128},0)); \
360 v = (vector signed char) \
361 vec_sub (v,(vector signed char) \
362 vec_splat((vector signed char){128},0)); \
364 U = vec_unpackh (u); \
365 V = vec_unpackh (v); \
373 Y0 = vec_mradds (Y0, lCY, lOY); \
374 Y1 = vec_mradds (Y1, lCY, lOY); \
375 Y2 = vec_mradds (Y2, lCY, lOY); \
376 Y3 = vec_mradds (Y3, lCY, lOY); \
378 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
379 ux = vec_sl (U, lCSHIFT); \
380 ux = vec_mradds (ux, lCBU, (vector signed short){0}); \
381 ux0 = vec_mergeh (ux,ux); \
382 ux1 = vec_mergel (ux,ux); \
384 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
385 vx = vec_sl (V, lCSHIFT); \
386 vx = vec_mradds (vx, lCRV, (vector signed short){0}); \
387 vx0 = vec_mergeh (vx,vx); \
388 vx1 = vec_mergel (vx,vx); \
390 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
391 uvx = vec_mradds (U, lCGU, (vector signed short){0}); \
392 uvx = vec_mradds (V, lCGV, uvx); \
393 uvx0 = vec_mergeh (uvx,uvx); \
394 uvx1 = vec_mergel (uvx,uvx); \
396 R0 = vec_add (Y0,vx0); \
397 G0 = vec_add (Y0,uvx0); \
398 B0 = vec_add (Y0,ux0); \
399 R1 = vec_add (Y1,vx1); \
400 G1 = vec_add (Y1,uvx1); \
401 B1 = vec_add (Y1,ux1); \
403 R = vec_packclp (R0,R1); \
404 G = vec_packclp (G0,G1); \
405 B = vec_packclp (B0,B1); \
407 out_pixels(R,G,B,oute); \
409 R0 = vec_add (Y2,vx0); \
410 G0 = vec_add (Y2,uvx0); \
411 B0 = vec_add (Y2,ux0); \
412 R1 = vec_add (Y3,vx1); \
413 G1 = vec_add (Y3,uvx1); \
414 B1 = vec_add (Y3,ux1); \
415 R = vec_packclp (R0,R1); \
416 G = vec_packclp (G0,G1); \
417 B = vec_packclp (B0,B1); \
420 out_pixels(R,G,B,outo); \
429 outo += (outstrides[0])>>4; \
430 oute += (outstrides[0])>>4; \
432 ui += instrides_scl[1]; \
433 vi += instrides_scl[2]; \
434 y1i += instrides_scl[0]; \
435 y2i += instrides_scl[0]; \
/*
 * Pixel writers passed as the out_pixels argument of DEFCSP420_CVT and used
 * by the packed converters.  a, b and c are the R, G and B byte vectors (in
 * that order) and ptr is the output pointer.
 *
 * The 32-bit variants hand the channels to vec_mstrgb32 in the byte order
 * named by the macro, with a constant 255 vector in the alpha position.
 * A brace initializer such as (vector unsigned char){255} only sets
 * element 0 and zero-fills the remaining elements, so the constant must be
 * replicated with vec_splat; otherwise only the first pixel of each group
 * of 16 gets an opaque alpha value.
 */
#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),a,b,c,ptr)
/* The 24-bit variants have no alpha channel and forward unchanged. */
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
448 DEFCSP420_CVT (yuv2_abgr
, out_abgr
)
450 DEFCSP420_CVT (yuv2_bgra
, out_bgra
)
452 static int altivec_yuv2_bgra32 (SwsContext
*c
,
453 unsigned char **in
, int *instrides
,
454 int srcSliceY
, int srcSliceH
,
455 unsigned char **oplanes
, int *outstrides
)
460 int instrides_scl
[3];
461 vector
unsigned char y0
,y1
;
463 vector
signed char u
,v
;
465 vector
signed short Y0
,Y1
,Y2
,Y3
;
466 vector
signed short U
,V
;
467 vector
signed short vx
,ux
,uvx
;
468 vector
signed short vx0
,ux0
,uvx0
;
469 vector
signed short vx1
,ux1
,uvx1
;
470 vector
signed short R0
,G0
,B0
;
471 vector
signed short R1
,G1
,B1
;
472 vector
unsigned char R
,G
,B
;
474 vector
unsigned char *uivP
, *vivP
;
475 vector
unsigned char align_perm
;
485 vector
unsigned short lCSHIFT
= c
->CSHIFT
;
488 ubyte
*y2i
= in
[0]+w
;
492 vector
unsigned char *oute
493 = (vector
unsigned char *)
494 (oplanes
[0]+srcSliceY
*outstrides
[0]);
495 vector
unsigned char *outo
496 = (vector
unsigned char *)
497 (oplanes
[0]+srcSliceY
*outstrides
[0]+outstrides
[0]);
500 instrides_scl
[0] = instrides
[0];
501 instrides_scl
[1] = instrides
[1]-w
/2; /* the loop moves ui by w/2 */
502 instrides_scl
[2] = instrides
[2]-w
/2; /* the loop moves vi by w/2 */
505 for (i
=0;i
<h
/2;i
++) {
506 vec_dstst (outo
, (0x02000002|(((w
*3+32)/32)<<16)), 0);
507 vec_dstst (oute
, (0x02000002|(((w
*3+32)/32)<<16)), 1);
509 for (j
=0;j
<w
/16;j
++) {
511 y0
= vec_ldl (0,y1i
);
512 y1
= vec_ldl (0,y2i
);
513 uivP
= (vector
unsigned char *)ui
;
514 vivP
= (vector
unsigned char *)vi
;
516 align_perm
= vec_lvsl (0, ui
);
517 u
= (vector
signed char)vec_perm (uivP
[0], uivP
[1], align_perm
);
519 align_perm
= vec_lvsl (0, vi
);
520 v
= (vector
signed char)vec_perm (vivP
[0], vivP
[1], align_perm
);
521 u
= (vector
signed char)
522 vec_sub (u
,(vector
signed char)
523 vec_splat((vector
signed char){128},0));
525 v
= (vector
signed char)
526 vec_sub (v
, (vector
signed char)
527 vec_splat((vector
signed char){128},0));
538 Y0
= vec_mradds (Y0
, lCY
, lOY
);
539 Y1
= vec_mradds (Y1
, lCY
, lOY
);
540 Y2
= vec_mradds (Y2
, lCY
, lOY
);
541 Y3
= vec_mradds (Y3
, lCY
, lOY
);
543 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
544 ux
= vec_sl (U
, lCSHIFT
);
545 ux
= vec_mradds (ux
, lCBU
, (vector
signed short){0});
546 ux0
= vec_mergeh (ux
,ux
);
547 ux1
= vec_mergel (ux
,ux
);
549 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
550 vx
= vec_sl (V
, lCSHIFT
);
551 vx
= vec_mradds (vx
, lCRV
, (vector
signed short){0});
552 vx0
= vec_mergeh (vx
,vx
);
553 vx1
= vec_mergel (vx
,vx
);
554 /* uvx = ((CGU*u) + (CGV*v))>>15 */
555 uvx
= vec_mradds (U
, lCGU
, (vector
signed short){0});
556 uvx
= vec_mradds (V
, lCGV
, uvx
);
557 uvx0
= vec_mergeh (uvx
,uvx
);
558 uvx1
= vec_mergel (uvx
,uvx
);
559 R0
= vec_add (Y0
,vx0
);
560 G0
= vec_add (Y0
,uvx0
);
561 B0
= vec_add (Y0
,ux0
);
562 R1
= vec_add (Y1
,vx1
);
563 G1
= vec_add (Y1
,uvx1
);
564 B1
= vec_add (Y1
,ux1
);
565 R
= vec_packclp (R0
,R1
);
566 G
= vec_packclp (G0
,G1
);
567 B
= vec_packclp (B0
,B1
);
569 out_argb(R
,G
,B
,oute
);
570 R0
= vec_add (Y2
,vx0
);
571 G0
= vec_add (Y2
,uvx0
);
572 B0
= vec_add (Y2
,ux0
);
573 R1
= vec_add (Y3
,vx1
);
574 G1
= vec_add (Y3
,uvx1
);
575 B1
= vec_add (Y3
,ux1
);
576 R
= vec_packclp (R0
,R1
);
577 G
= vec_packclp (G0
,G1
);
578 B
= vec_packclp (B0
,B1
);
580 out_argb(R
,G
,B
,outo
);
588 outo
+= (outstrides
[0])>>4;
589 oute
+= (outstrides
[0])>>4;
591 ui
+= instrides_scl
[1];
592 vi
+= instrides_scl
[2];
593 y1i
+= instrides_scl
[0];
594 y2i
+= instrides_scl
[0];
602 DEFCSP420_CVT (yuv2_rgba
, out_rgba
)
603 DEFCSP420_CVT (yuv2_argb
, out_argb
)
604 DEFCSP420_CVT (yuv2_rgb24
, out_rgb24
)
605 DEFCSP420_CVT (yuv2_bgr24
, out_bgr24
)
608 // uyvy|uyvy|uyvy|uyvy
609 // 0123 4567 89ab cdef
611 const vector
unsigned char
612 demux_u
= {0x10,0x00,0x10,0x00,
615 0x10,0x0c,0x10,0x0c},
616 demux_v
= {0x10,0x02,0x10,0x02,
619 0x10,0x0E,0x10,0x0E},
620 demux_y
= {0x10,0x01,0x10,0x03,
623 0x10,0x0D,0x10,0x0F};
626 this is so I can play live CCIR raw video
628 static int altivec_uyvy_rgb32 (SwsContext
*c
,
629 unsigned char **in
, int *instrides
,
630 int srcSliceY
, int srcSliceH
,
631 unsigned char **oplanes
, int *outstrides
)
636 vector
unsigned char uyvy
;
637 vector
signed short Y
,U
,V
;
638 vector
signed short R0
,G0
,B0
,R1
,G1
,B1
;
639 vector
unsigned char R
,G
,B
;
640 vector
unsigned char *out
;
644 out
= (vector
unsigned char *)(oplanes
[0]+srcSliceY
*outstrides
[0]);
647 for (j
=0;j
<w
/16;j
++) {
648 uyvy
= vec_ld (0, img
);
649 U
= (vector
signed short)
650 vec_perm (uyvy
, (vector
unsigned char){0}, demux_u
);
652 V
= (vector
signed short)
653 vec_perm (uyvy
, (vector
unsigned char){0}, demux_v
);
655 Y
= (vector
signed short)
656 vec_perm (uyvy
, (vector
unsigned char){0}, demux_y
);
658 cvtyuvtoRGB (c
, Y
,U
,V
,&R0
,&G0
,&B0
);
660 uyvy
= vec_ld (16, img
);
661 U
= (vector
signed short)
662 vec_perm (uyvy
, (vector
unsigned char){0}, demux_u
);
664 V
= (vector
signed short)
665 vec_perm (uyvy
, (vector
unsigned char){0}, demux_v
);
667 Y
= (vector
signed short)
668 vec_perm (uyvy
, (vector
unsigned char){0}, demux_y
);
670 cvtyuvtoRGB (c
, Y
,U
,V
,&R1
,&G1
,&B1
);
672 R
= vec_packclp (R0
,R1
);
673 G
= vec_packclp (G0
,G1
);
674 B
= vec_packclp (B0
,B1
);
676 // vec_mstbgr24 (R,G,B, out);
677 out_rgba (R
,G
,B
,out
);
687 /* Ok currently the acceleration routine only supports
688 inputs of widths a multiple of 16
689 and heights a multiple of 2
691 So we just fall back to the C codes for this.
693 SwsFunc
ff_yuv2rgb_init_altivec(SwsContext
*c
)
695 if (!(c
->flags
& SWS_CPU_CAPS_ALTIVEC
))
699 and this seems not to matter too much I tried a bunch of
700 videos with abnormal widths and MPlayer crashes elsewhere.
701 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
702 boom with X11 bad match.
705 if ((c
->srcW
& 0xf) != 0) return NULL
;
707 switch (c
->srcFormat
) {
708 case PIX_FMT_YUV410P
:
709 case PIX_FMT_YUV420P
:
710 /*case IMGFMT_CLPL: ??? */
714 if ((c
->srcH
& 0x1) != 0)
717 switch(c
->dstFormat
) {
719 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space RGB24\n");
720 return altivec_yuv2_rgb24
;
722 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space BGR24\n");
723 return altivec_yuv2_bgr24
;
725 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space ARGB\n");
726 return altivec_yuv2_argb
;
728 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space ABGR\n");
729 return altivec_yuv2_abgr
;
731 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space RGBA\n");
732 return altivec_yuv2_rgba
;
734 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space BGRA\n");
735 return altivec_yuv2_bgra
;
736 default: return NULL
;
740 case PIX_FMT_UYVY422
:
741 switch(c
->dstFormat
) {
743 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space UYVY -> RGB32\n");
744 return altivec_uyvy_rgb32
;
745 default: return NULL
;
753 void ff_yuv2rgb_init_tables_altivec(SwsContext
*c
, const int inv_table
[4], int brightness
, int contrast
, int saturation
)
756 DECLARE_ALIGNED(16, signed short, tmp
)[8];
757 vector
signed short vec
;
760 buf
.tmp
[0] = ((0xffffLL
) * contrast
>>8)>>9; //cy
761 buf
.tmp
[1] = -256*brightness
; //oy
762 buf
.tmp
[2] = (inv_table
[0]>>3) *(contrast
>>16)*(saturation
>>16); //crv
763 buf
.tmp
[3] = (inv_table
[1]>>3) *(contrast
>>16)*(saturation
>>16); //cbu
764 buf
.tmp
[4] = -((inv_table
[2]>>1)*(contrast
>>16)*(saturation
>>16)); //cgu
765 buf
.tmp
[5] = -((inv_table
[3]>>1)*(contrast
>>16)*(saturation
>>16)); //cgv
768 c
->CSHIFT
= (vector
unsigned short)vec_splat_u16(2);
769 c
->CY
= vec_splat ((vector
signed short)buf
.vec
, 0);
770 c
->OY
= vec_splat ((vector
signed short)buf
.vec
, 1);
771 c
->CRV
= vec_splat ((vector
signed short)buf
.vec
, 2);
772 c
->CBU
= vec_splat ((vector
signed short)buf
.vec
, 3);
773 c
->CGU
= vec_splat ((vector
signed short)buf
.vec
, 4);
774 c
->CGV
= vec_splat ((vector
signed short)buf
.vec
, 5);
780 ff_yuv2packedX_altivec(SwsContext
*c
,
781 const int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
782 const int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
783 uint8_t *dest
, int dstW
, int dstY
)
786 vector
signed short X
,X0
,X1
,Y0
,U0
,V0
,Y1
,U1
,V1
,U
,V
;
787 vector
signed short R0
,G0
,B0
,R1
,G1
,B1
;
789 vector
unsigned char R
,G
,B
;
790 vector
unsigned char *out
,*nout
;
792 vector
signed short RND
= vec_splat_s16(1<<3);
793 vector
unsigned short SCL
= vec_splat_u16(4);
794 DECLARE_ALIGNED(16, unsigned long, scratch
)[16];
796 vector
signed short *YCoeffs
, *CCoeffs
;
798 YCoeffs
= c
->vYCoeffsBank
+dstY
*lumFilterSize
;
799 CCoeffs
= c
->vCCoeffsBank
+dstY
*chrFilterSize
;
801 out
= (vector
unsigned char *)dest
;
803 for (i
=0; i
<dstW
; i
+=16) {
806 /* extract 16 coeffs from lumSrc */
807 for (j
=0; j
<lumFilterSize
; j
++) {
808 X0
= vec_ld (0, &lumSrc
[j
][i
]);
809 X1
= vec_ld (16, &lumSrc
[j
][i
]);
810 Y0
= vec_mradds (X0
, YCoeffs
[j
], Y0
);
811 Y1
= vec_mradds (X1
, YCoeffs
[j
], Y1
);
816 /* extract 8 coeffs from U,V */
817 for (j
=0; j
<chrFilterSize
; j
++) {
818 X
= vec_ld (0, &chrSrc
[j
][i
/2]);
819 U
= vec_mradds (X
, CCoeffs
[j
], U
);
820 X
= vec_ld (0, &chrSrc
[j
][i
/2+2048]);
821 V
= vec_mradds (X
, CCoeffs
[j
], V
);
824 /* scale and clip signals */
825 Y0
= vec_sra (Y0
, SCL
);
826 Y1
= vec_sra (Y1
, SCL
);
827 U
= vec_sra (U
, SCL
);
828 V
= vec_sra (V
, SCL
);
830 Y0
= vec_clip_s16 (Y0
);
831 Y1
= vec_clip_s16 (Y1
);
832 U
= vec_clip_s16 (U
);
833 V
= vec_clip_s16 (V
);
836 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
837 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
839 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
840 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
841 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
844 U0
= vec_mergeh (U
,U
);
845 V0
= vec_mergeh (V
,V
);
847 U1
= vec_mergel (U
,U
);
848 V1
= vec_mergel (V
,V
);
850 cvtyuvtoRGB (c
, Y0
,U0
,V0
,&R0
,&G0
,&B0
);
851 cvtyuvtoRGB (c
, Y1
,U1
,V1
,&R1
,&G1
,&B1
);
853 R
= vec_packclp (R0
,R1
);
854 G
= vec_packclp (G0
,G1
);
855 B
= vec_packclp (B0
,B1
);
857 switch(c
->dstFormat
) {
858 case PIX_FMT_ABGR
: out_abgr (R
,G
,B
,out
); break;
859 case PIX_FMT_BGRA
: out_bgra (R
,G
,B
,out
); break;
860 case PIX_FMT_RGBA
: out_rgba (R
,G
,B
,out
); break;
861 case PIX_FMT_ARGB
: out_argb (R
,G
,B
,out
); break;
862 case PIX_FMT_RGB24
: out_rgb24 (R
,G
,B
,out
); break;
863 case PIX_FMT_BGR24
: out_bgr24 (R
,G
,B
,out
); break;
866 /* If this is reached, the caller should have called yuv2packedXinC
868 static int printed_error_message
;
869 if (!printed_error_message
) {
870 av_log(c
, AV_LOG_ERROR
, "altivec_yuv2packedX doesn't support %s output\n",
871 sws_format_name(c
->dstFormat
));
872 printed_error_message
=1;
884 /* extract 16 coeffs from lumSrc */
885 for (j
=0; j
<lumFilterSize
; j
++) {
886 X0
= vec_ld (0, &lumSrc
[j
][i
]);
887 X1
= vec_ld (16, &lumSrc
[j
][i
]);
888 Y0
= vec_mradds (X0
, YCoeffs
[j
], Y0
);
889 Y1
= vec_mradds (X1
, YCoeffs
[j
], Y1
);
894 /* extract 8 coeffs from U,V */
895 for (j
=0; j
<chrFilterSize
; j
++) {
896 X
= vec_ld (0, &chrSrc
[j
][i
/2]);
897 U
= vec_mradds (X
, CCoeffs
[j
], U
);
898 X
= vec_ld (0, &chrSrc
[j
][i
/2+2048]);
899 V
= vec_mradds (X
, CCoeffs
[j
], V
);
902 /* scale and clip signals */
903 Y0
= vec_sra (Y0
, SCL
);
904 Y1
= vec_sra (Y1
, SCL
);
905 U
= vec_sra (U
, SCL
);
906 V
= vec_sra (V
, SCL
);
908 Y0
= vec_clip_s16 (Y0
);
909 Y1
= vec_clip_s16 (Y1
);
910 U
= vec_clip_s16 (U
);
911 V
= vec_clip_s16 (V
);
914 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
915 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
917 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
918 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
919 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
922 U0
= vec_mergeh (U
,U
);
923 V0
= vec_mergeh (V
,V
);
925 U1
= vec_mergel (U
,U
);
926 V1
= vec_mergel (V
,V
);
928 cvtyuvtoRGB (c
, Y0
,U0
,V0
,&R0
,&G0
,&B0
);
929 cvtyuvtoRGB (c
, Y1
,U1
,V1
,&R1
,&G1
,&B1
);
931 R
= vec_packclp (R0
,R1
);
932 G
= vec_packclp (G0
,G1
);
933 B
= vec_packclp (B0
,B1
);
935 nout
= (vector
unsigned char *)scratch
;
936 switch(c
->dstFormat
) {
937 case PIX_FMT_ABGR
: out_abgr (R
,G
,B
,nout
); break;
938 case PIX_FMT_BGRA
: out_bgra (R
,G
,B
,nout
); break;
939 case PIX_FMT_RGBA
: out_rgba (R
,G
,B
,nout
); break;
940 case PIX_FMT_ARGB
: out_argb (R
,G
,B
,nout
); break;
941 case PIX_FMT_RGB24
: out_rgb24 (R
,G
,B
,nout
); break;
942 case PIX_FMT_BGR24
: out_bgr24 (R
,G
,B
,nout
); break;
944 /* Unreachable, I think. */
945 av_log(c
, AV_LOG_ERROR
, "altivec_yuv2packedX doesn't support %s output\n",
946 sws_format_name(c
->dstFormat
));
950 memcpy (&((uint32_t*)dest
)[i
], scratch
, (dstW
-i
)/4);