2 marc.hoffman@analog.com March 8, 2004
4 Altivec Acceleration for Color Space Conversion revision 0.2
6 convert I420 YV12 to RGB in various formats,
7 it rejects images that are not in a 4:2:0 format
8 it rejects images whose width is not a multiple of 16
9 it rejects images whose height is not a multiple of 2
10 rejected images fall back to the C reference code.
12 lots of optimizations to be done here
14 1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15 so we currently use max min to clip
17 2. the inefficient use of chroma loading needs a bit of brushing up
19 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
22 MODIFIED to calculate coeffs from currently selected color space.
23 MODIFIED core to be a macro which you spec the output format.
24 ADDED UYVY conversion which is never called due to something in SWSCALE.
25 CORRECTED algorithm selection to be strict on input formats.
26 ADDED runtime detection of altivec.
28 ADDED altivec_yuv2packedX vertical scl + RGB converter
33 The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
38 so we have roughly 10 clocks per pixel; this is too high, something has to be wrong.
40 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
42 OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43 guaranteed to have the input video frame it was just decompressed so
44 it probably resides in L1 caches. However we are creating the
45 output video stream this needs to use the DSTST instruction to
46 optimize for the cache. We couple this with the fact that we are
47 not going to be visiting the input buffer again so we mark it Least
48 Recently Used. This shaves 25% of the processor cycles off.
50 Now MEMCPY is the largest mips consumer in the system, probably due
51 to the inefficient X11 stuff.
53 GL libraries seem to be very slow on this machine (1.33GHz PB running
54 Jaguar); this is not the case for my 1GHz PB. I thought it might be
55 a versioning issue; however, I have libGL.1.2.dylib on both
56 machines. ((We need to figure this out now))
58 GL2 libraries work now with patch for RGB32
60 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
62 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
66 * This file is part of FFmpeg.
68 * FFmpeg is free software; you can redistribute it and/or modify
69 * it under the terms of the GNU General Public License as published by
70 * the Free Software Foundation; either version 2 of the License, or
71 * (at your option) any later version.
73 * FFmpeg is distributed in the hope that it will be useful,
74 * but WITHOUT ANY WARRANTY; without even the implied warranty of
75 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
76 * GNU General Public License for more details.
78 * You should have received a copy of the GNU General Public License
79 * along with FFmpeg; if not, write to the Free Software
80 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
94 #include "swscale_internal.h"
96 #undef PROFILE_THE_BEAST
// Short aliases for unsigned char and signed char, used for raw sample pointers below.
99 typedef unsigned char ubyte
;
100 typedef signed char sbyte
;
103 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
104 homogeneous vector registers x0,x1,x2 are interleaved with the
107 o0 = vec_mergeh (x0,x1);
108 o1 = vec_perm (o0, x2, perm_rgb_0);
109 o2 = vec_perm (o0, x2, perm_rgb_1);
110 o3 = vec_mergel (x0,x1);
111 o4 = vec_perm (o3,o2,perm_rgb_2);
112 o5 = vec_perm (o3,o2,perm_rgb_3);
114 perm_rgb_0: o0(RG).h v1(B) --> o1*
120 perm_rgb_1: o0(RG).h v1(B) --> o2
126 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
132 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
// Byte-permute control vectors consumed by vec_merge3.
// With vec_perm, indices 0x00-0x0f select bytes of the first operand and
// 0x10-0x1f select bytes of the second operand.
//   perm_rgb_0: takes pairs from operand one and inserts one byte of operand two after each pair.
//   perm_rgb_1: continues that pattern for the first 8 lanes, then passes
//               bytes 0x18-0x1f of operand two through unchanged.
//   perm_rgb_2: first 8 lanes are bytes 0x10-0x17 of operand two, the rest
//               resumes the pair-plus-one pattern.
//   perm_rgb_3: pair-plus-one pattern over the remaining bytes.
140 const vector
unsigned char
141 perm_rgb_0
= (const vector
unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
142 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
143 perm_rgb_1
= (const vector
unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
144 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
145 perm_rgb_2
= (const vector
unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
146 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
147 perm_rgb_3
= (const vector
unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
148 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
// vec_merge3: interleave three planar byte vectors into three packed vectors.
// Parameters are listed as (x2,x1,x0) followed by the outputs (y0,y1,y2).
// x0 and x1 are pair-merged (high half, then low half) and bytes of x2 are
// spliced in through the perm_rgb_0..3 tables. The outputs are assigned in place,
// so y0,y1,y2 must be lvalues. Temporaries take the type of x0 via typeof.
// NOTE(review): the statement wrapper around this body is not visible in this excerpt.
150 #define vec_merge3(x2,x1,x0,y0,y1,y2) \
152 typeof(x0) o0,o2,o3; \
153 o0 = vec_mergeh (x0,x1); \
154 y0 = vec_perm (o0, x2, perm_rgb_0);\
155 o2 = vec_perm (o0, x2, perm_rgb_1);\
156 o3 = vec_mergel (x0,x1); \
157 y1 = vec_perm (o3,o2,perm_rgb_2); \
158 y2 = vec_perm (o3,o2,perm_rgb_3); \
// vec_mstbgr24: interleave x0,x1,x2 with vec_merge3 (arguments forwarded in the
// order given) and store the three resulting vectors through ptr.
// ptr is post-incremented after every store, so the caller's pointer advances
// by three vectors; ptr must therefore be a modifiable lvalue.
161 #define vec_mstbgr24(x0,x1,x2,ptr) \
163 typeof(x0) _0,_1,_2; \
164 vec_merge3 (x0,x1,x2,_0,_1,_2); \
165 vec_st (_0, 0, ptr++); \
166 vec_st (_1, 0, ptr++); \
167 vec_st (_2, 0, ptr++); \
// vec_mstrgb24: same as vec_mstbgr24 except that the three inputs are handed to
// vec_merge3 in reversed order (x2,x1,x0), which swaps the first and third
// channel in the packed output.
// ptr is post-incremented after every store (three vectors in total).
170 #define vec_mstrgb24(x0,x1,x2,ptr) \
172 typeof(x0) _0,_1,_2; \
173 vec_merge3 (x2,x1,x0,_0,_1,_2); \
174 vec_st (_0, 0, ptr++); \
175 vec_st (_1, 0, ptr++); \
176 vec_st (_2, 0, ptr++); \
179 /* pack the pixels in rgb0 format
// vec_mstrgb32: interleave four byte vectors so that every output pixel is the
// byte sequence x0,x1,x2,x3, then store the 16 pixels as four vectors at byte
// offsets 0, 16, 32 and 48 from ptr.
//   T   - vector type used for the casts on the merged results and on ptr
//   ptr - destination; it is only cast and offset here, never advanced
// The high halves of the inputs produce the first two stores, the low halves the
// last two.
// NOTE(review): the declaration of the temporaries _0.._3 is not visible in this excerpt.
183 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
186 _0 = vec_mergeh (x0,x1); \
187 _1 = vec_mergeh (x2,x3); \
188 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
189 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
190 vec_st (_2, 0*16, (T *)ptr); \
191 vec_st (_3, 1*16, (T *)ptr); \
192 _0 = vec_mergel (x0,x1); \
193 _1 = vec_mergel (x2,x3); \
194 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
195 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
196 vec_st (_2, 2*16, (T *)ptr); \
197 vec_st (_3, 3*16, (T *)ptr); \
204 | 1 -0.3441 -0.7142 |x| Cb|
211 typical yuv conversion works on Y: 0-255; this version has been optimized for jpeg decode.
219 (vector signed short) \
220 vec_perm(x,(typeof(x))AVV(0),\
221 (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
222 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
224 (vector signed short) \
225 vec_perm(x,(typeof(x))AVV(0),\
226 (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
227 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
// vec_clip_s16: clamp every signed 16-bit lane of x to the range [16, 235]
// (vec_min against 235 first, then vec_max against 16).
229 #define vec_clip_s16(x) \
230 vec_max (vec_min (x, (vector signed short)AVV(235,235,235,235,235,235,235,235)),\
231 (vector signed short)AVV(16, 16, 16, 16, 16, 16, 16, 16 ))
// vec_packclp: pack two signed-short vectors into one unsigned-char vector.
// Negative lanes are raised to 0 with vec_max; the results are then viewed as
// unsigned shorts and narrowed with vec_packs, which saturates on the high side.
// x supplies the first 8 output bytes and y the last 8.
233 #define vec_packclp(x,y) \
234 (vector unsigned char)vec_packs \
235 ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \
236 (vector unsigned short)vec_max (y,(vector signed short) AVV(0)))
238 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
// cvtyuvtoRGB: convert one vector of Y, U and V samples (signed 16-bit lanes)
// to R, G and B using the coefficient vectors stored in the context.
//   c      - supplies CY, OY, CSHIFT, CBU, CRV, CGU and CGV
//   Y,U,V  - input lanes, passed by value and modified locally
//   R,G,B  - outputs, written through the pointers; values are not clipped here
// U and V are expected with a +128 bias, which is removed below.
241 static inline void cvtyuvtoRGB (SwsContext
*c
,
242 vector
signed short Y
, vector
signed short U
, vector
signed short V
,
243 vector
signed short *R
, vector
signed short *G
, vector
signed short *B
)
245 vector
signed short vx
,ux
,uvx
;
// Scale luma by CY and add the offset OY in one multiply-round-add.
247 Y
= vec_mradds (Y
, c
->CY
, c
->OY
);
// Remove the chroma bias of 128 from every lane.
248 U
= vec_sub (U
,(vector
signed short)
249 vec_splat((vector
signed short)AVV(128),0));
250 V
= vec_sub (V
,(vector
signed short)
251 vec_splat((vector
signed short)AVV(128),0));
253 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
254 ux
= vec_sl (U
, c
->CSHIFT
);
// Blue = scaled luma + CBU contribution of U.
255 *B
= vec_mradds (ux
, c
->CBU
, Y
);
257 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
258 vx
= vec_sl (V
, c
->CSHIFT
);
// Red = scaled luma + CRV contribution of V.
259 *R
= vec_mradds (vx
, c
->CRV
, Y
);
261 // uvx = ((CGU*u) + (CGV*v))>>15;
// Green accumulates both chroma terms on top of luma; U and V are used
// unshifted here, unlike the blue and red paths.
262 uvx
= vec_mradds (U
, c
->CGU
, Y
);
263 *G
= vec_mradds (V
, c
->CGV
, uvx
);
268 ------------------------------------------------------------------------------
270 ------------------------------------------------------------------------------
// DEFCSP420_CVT(name, out_pixels): generates
//   static int altivec_<name>(SwsContext *c, unsigned char **in, int *instrides,
//                             int srcSliceY, int srcSliceH,
//                             unsigned char **oplanes, int *outstrides)
// which converts planar input (luma in in[0], chroma pointers ui and vi) to a
// packed format emitted by the out_pixels macro.
// Structure of the generated function, as far as visible here:
//   - the outer loop runs h/2 times and handles two luma rows per pass
//     (y1i for the even row written to oute, y2i for the odd row written to outo);
//   - the inner loop runs w/16 times and handles 16 pixels per row per pass;
//   - every source vector is loaded unaligned with vec_lvsl + vec_perm over two
//     adjacent vectors;
//   - chroma has 128 subtracted, is widened with vec_unpackh, multiplied by the
//     coefficient vectors, then duplicated with vec_mergeh / vec_mergel so one
//     chroma lane covers two neighbouring pixels; the same chroma terms are
//     reused for both rows;
//   - results are packed with vec_packclp and stored through out_pixels;
//   - vec_dstst issues store-stream prefetch hints for both output rows.
// instrides_scl holds the per-row-pair pointer adjustments applied after the
// inner loop; outo and oute additionally advance by outstrides[0]>>4 vectors.
// NOTE(review): several lines of the macro are not visible in this excerpt
// (declarations of w, h, i, j, ui, vi, the lCY/lOY/lC* locals, the luma
// unpacking into Y0..Y3, pointer increments inside the inner loop, the
// closing braces and the return statement).
274 #define DEFCSP420_CVT(name,out_pixels) \
275 static int altivec_##name (SwsContext *c, \
276 unsigned char **in, int *instrides, \
277 int srcSliceY, int srcSliceH, \
278 unsigned char **oplanes, int *outstrides) \
283 int instrides_scl[3]; \
284 vector unsigned char y0,y1; \
286 vector signed char u,v; \
288 vector signed short Y0,Y1,Y2,Y3; \
289 vector signed short U,V; \
290 vector signed short vx,ux,uvx; \
291 vector signed short vx0,ux0,uvx0; \
292 vector signed short vx1,ux1,uvx1; \
293 vector signed short R0,G0,B0; \
294 vector signed short R1,G1,B1; \
295 vector unsigned char R,G,B; \
297 vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
298 vector unsigned char align_perm; \
300 vector signed short \
308 vector unsigned short lCSHIFT = c->CSHIFT; \
310 ubyte *y1i = in[0]; \
311 ubyte *y2i = in[0]+instrides[0]; \
315 vector unsigned char *oute \
316 = (vector unsigned char *) \
317 (oplanes[0]+srcSliceY*outstrides[0]); \
318 vector unsigned char *outo \
319 = (vector unsigned char *) \
320 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
323 instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
324 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
325 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
328 for (i=0;i<h/2;i++) { \
329 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
330 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
332 for (j=0;j<w/16;j++) { \
334 y1ivP = (vector unsigned char *)y1i; \
335 y2ivP = (vector unsigned char *)y2i; \
336 uivP = (vector unsigned char *)ui; \
337 vivP = (vector unsigned char *)vi; \
339 align_perm = vec_lvsl (0, y1i); \
340 y0 = (vector unsigned char)vec_perm (y1ivP[0], y1ivP[1], align_perm);\
342 align_perm = vec_lvsl (0, y2i); \
343 y1 = (vector unsigned char)vec_perm (y2ivP[0], y2ivP[1], align_perm);\
345 align_perm = vec_lvsl (0, ui); \
346 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); \
348 align_perm = vec_lvsl (0, vi); \
349 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); \
351 u = (vector signed char) \
352 vec_sub (u,(vector signed char) \
353 vec_splat((vector signed char)AVV(128),0));\
354 v = (vector signed char) \
355 vec_sub (v,(vector signed char) \
356 vec_splat((vector signed char)AVV(128),0));\
358 U = vec_unpackh (u); \
359 V = vec_unpackh (v); \
367 Y0 = vec_mradds (Y0, lCY, lOY); \
368 Y1 = vec_mradds (Y1, lCY, lOY); \
369 Y2 = vec_mradds (Y2, lCY, lOY); \
370 Y3 = vec_mradds (Y3, lCY, lOY); \
372 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
373 ux = vec_sl (U, lCSHIFT); \
374 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); \
375 ux0 = vec_mergeh (ux,ux); \
376 ux1 = vec_mergel (ux,ux); \
378 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
379 vx = vec_sl (V, lCSHIFT); \
380 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); \
381 vx0 = vec_mergeh (vx,vx); \
382 vx1 = vec_mergel (vx,vx); \
384 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
385 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); \
386 uvx = vec_mradds (V, lCGV, uvx); \
387 uvx0 = vec_mergeh (uvx,uvx); \
388 uvx1 = vec_mergel (uvx,uvx); \
390 R0 = vec_add (Y0,vx0); \
391 G0 = vec_add (Y0,uvx0); \
392 B0 = vec_add (Y0,ux0); \
393 R1 = vec_add (Y1,vx1); \
394 G1 = vec_add (Y1,uvx1); \
395 B1 = vec_add (Y1,ux1); \
397 R = vec_packclp (R0,R1); \
398 G = vec_packclp (G0,G1); \
399 B = vec_packclp (B0,B1); \
401 out_pixels(R,G,B,oute); \
403 R0 = vec_add (Y2,vx0); \
404 G0 = vec_add (Y2,uvx0); \
405 B0 = vec_add (Y2,ux0); \
406 R1 = vec_add (Y3,vx1); \
407 G1 = vec_add (Y3,uvx1); \
408 B1 = vec_add (Y3,ux1); \
409 R = vec_packclp (R0,R1); \
410 G = vec_packclp (G0,G1); \
411 B = vec_packclp (B0,B1); \
414 out_pixels(R,G,B,outo); \
423 outo += (outstrides[0])>>4; \
424 oute += (outstrides[0])>>4; \
426 ui += instrides_scl[1]; \
427 vi += instrides_scl[2]; \
428 y1i += instrides_scl[0]; \
429 y2i += instrides_scl[0]; \
// Output adapters passed to DEFCSP420_CVT and used by the packed converters.
// Call sites pass (R,G,B,ptr), which arrive here as (a,b,c,ptr).
// 32-bit variants forward to vec_mstrgb32 with an all-zero vector in the fourth
// channel slot; the macro name spells the resulting per-pixel byte order
// (for example out_abgr emits 0,c,b,a).
// 24-bit variants forward to vec_mstrgb24 / vec_mstbgr24, which advance ptr.
435 #define out_abgr(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr)
436 #define out_bgra(a,b,c,ptr) vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr)
437 #define out_rgba(a,b,c,ptr) vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr)
438 #define out_argb(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr)
439 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
440 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
// Instantiate the converters altivec_yuv2_abgr and altivec_yuv2_bgra from the
// DEFCSP420_CVT template with the matching output adapters.
442 DEFCSP420_CVT (yuv2_abgr
, out_abgr
)
444 DEFCSP420_CVT (yuv2_bgra
, out_bgra
)
// altivec_yuv2_bgra32: hand-expanded variant of the converter generated by
// DEFCSP420_CVT. It processes two luma rows per outer pass and 16 pixels per
// inner pass, using the same chroma math and vec_packclp packing.
// Visible differences from the macro version:
//   - luma is fetched with aligned vec_ldl loads instead of the
//     vec_lvsl + vec_perm unaligned sequence;
//   - the second row pointer starts at in[0]+w rather than in[0]+instrides[0];
//   - instrides_scl[0] is instrides[0] rather than instrides[0]*2-w;
//   - pixels are emitted with out_argb.
// NOTE(review): the name says bgra but the stores use out_argb - confirm which
// layout is intended, and whether this function is compiled at all.
// NOTE(review): several lines are not visible in this excerpt (declarations of
// w, h, i, j, y1i, ui, vi and the lC* locals, the luma unpacking into Y0..Y3,
// the widening of u/v into U/V, closing braces and the return statement).
446 static int altivec_yuv2_bgra32 (SwsContext
*c
,
447 unsigned char **in
, int *instrides
,
448 int srcSliceY
, int srcSliceH
,
449 unsigned char **oplanes
, int *outstrides
)
454 int instrides_scl
[3];
455 vector
unsigned char y0
,y1
;
457 vector
signed char u
,v
;
459 vector
signed short Y0
,Y1
,Y2
,Y3
;
460 vector
signed short U
,V
;
461 vector
signed short vx
,ux
,uvx
;
462 vector
signed short vx0
,ux0
,uvx0
;
463 vector
signed short vx1
,ux1
,uvx1
;
464 vector
signed short R0
,G0
,B0
;
465 vector
signed short R1
,G1
,B1
;
466 vector
unsigned char R
,G
,B
;
468 vector
unsigned char *uivP
, *vivP
;
469 vector
unsigned char align_perm
;
479 vector
unsigned short lCSHIFT
= c
->CSHIFT
;
// Second luma row: offset by the width w, not by the input stride.
482 ubyte
*y2i
= in
[0]+w
;
// oute addresses the even output row of the slice, outo the row below it.
486 vector
unsigned char *oute
487 = (vector
unsigned char *)
488 (oplanes
[0]+srcSliceY
*outstrides
[0]);
489 vector
unsigned char *outo
490 = (vector
unsigned char *)
491 (oplanes
[0]+srcSliceY
*outstrides
[0]+outstrides
[0]);
494 instrides_scl
[0] = instrides
[0];
495 instrides_scl
[1] = instrides
[1]-w
/2; /* the loop moves ui by w/2 */
496 instrides_scl
[2] = instrides
[2]-w
/2; /* the loop moves vi by w/2 */
499 for (i
=0;i
<h
/2;i
++) {
500 vec_dstst (outo
, (0x02000002|(((w
*3+32)/32)<<16)), 0);
501 vec_dstst (oute
, (0x02000002|(((w
*3+32)/32)<<16)), 1);
503 for (j
=0;j
<w
/16;j
++) {
// Aligned loads of 16 luma bytes from each of the two rows.
505 y0
= vec_ldl (0,y1i
);
506 y1
= vec_ldl (0,y2i
);
507 uivP
= (vector
unsigned char *)ui
;
508 vivP
= (vector
unsigned char *)vi
;
// Chroma is loaded unaligned: permute across two adjacent vectors.
510 align_perm
= vec_lvsl (0, ui
);
511 u
= (vector
signed char)vec_perm (uivP
[0], uivP
[1], align_perm
);
513 align_perm
= vec_lvsl (0, vi
);
514 v
= (vector
signed char)vec_perm (vivP
[0], vivP
[1], align_perm
);
// Remove the 128 bias from both chroma vectors.
515 u
= (vector
signed char)
516 vec_sub (u
,(vector
signed char)
517 vec_splat((vector
signed char)AVV(128),0));
519 v
= (vector
signed char)
520 vec_sub (v
, (vector
signed char)
521 vec_splat((vector
signed char)AVV(128),0));
// Scale and offset the four luma vectors (two per row).
532 Y0
= vec_mradds (Y0
, lCY
, lOY
);
533 Y1
= vec_mradds (Y1
, lCY
, lOY
);
534 Y2
= vec_mradds (Y2
, lCY
, lOY
);
535 Y3
= vec_mradds (Y3
, lCY
, lOY
);
537 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
538 ux
= vec_sl (U
, lCSHIFT
);
539 ux
= vec_mradds (ux
, lCBU
, (vector
signed short)AVV(0));
540 ux0
= vec_mergeh (ux
,ux
);
541 ux1
= vec_mergel (ux
,ux
);
543 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
544 vx
= vec_sl (V
, lCSHIFT
);
545 vx
= vec_mradds (vx
, lCRV
, (vector
signed short)AVV(0));
546 vx0
= vec_mergeh (vx
,vx
);
547 vx1
= vec_mergel (vx
,vx
);
548 /* uvx = ((CGU*u) + (CGV*v))>>15 */
549 uvx
= vec_mradds (U
, lCGU
, (vector
signed short)AVV(0));
550 uvx
= vec_mradds (V
, lCGV
, uvx
);
551 uvx0
= vec_mergeh (uvx
,uvx
);
552 uvx1
= vec_mergel (uvx
,uvx
);
// Even row: add the chroma terms to Y0/Y1, pack and store.
553 R0
= vec_add (Y0
,vx0
);
554 G0
= vec_add (Y0
,uvx0
);
555 B0
= vec_add (Y0
,ux0
);
556 R1
= vec_add (Y1
,vx1
);
557 G1
= vec_add (Y1
,uvx1
);
558 B1
= vec_add (Y1
,ux1
);
559 R
= vec_packclp (R0
,R1
);
560 G
= vec_packclp (G0
,G1
);
561 B
= vec_packclp (B0
,B1
);
563 out_argb(R
,G
,B
,oute
);
// Odd row: same chroma terms combined with Y2/Y3.
564 R0
= vec_add (Y2
,vx0
);
565 G0
= vec_add (Y2
,uvx0
);
566 B0
= vec_add (Y2
,ux0
);
567 R1
= vec_add (Y3
,vx1
);
568 G1
= vec_add (Y3
,uvx1
);
569 B1
= vec_add (Y3
,ux1
);
570 R
= vec_packclp (R0
,R1
);
571 G
= vec_packclp (G0
,G1
);
572 B
= vec_packclp (B0
,B1
);
574 out_argb(R
,G
,B
,outo
);
// Advance the output pointers and the source pointers for the next row pair.
582 outo
+= (outstrides
[0])>>4;
583 oute
+= (outstrides
[0])>>4;
585 ui
+= instrides_scl
[1];
586 vi
+= instrides_scl
[2];
587 y1i
+= instrides_scl
[0];
588 y2i
+= instrides_scl
[0];
// Instantiate the remaining converters from the DEFCSP420_CVT template:
// altivec_yuv2_rgba, altivec_yuv2_argb, altivec_yuv2_rgb24 and altivec_yuv2_bgr24.
596 DEFCSP420_CVT (yuv2_rgba
, out_rgba
)
597 DEFCSP420_CVT (yuv2_argb
, out_argb
)
598 DEFCSP420_CVT (yuv2_rgb24
, out_rgb24
)
599 DEFCSP420_CVT (yuv2_bgr24
, out_bgr24
)
602 // uyvy|uyvy|uyvy|uyvy
603 // 0123 4567 89ab cdef
// Permute tables that split a packed UYVY vector into 16-bit lanes.
// Index 0x10 selects the first byte of the second vec_perm operand; the caller
// passes an all-zero vector there, so every sample byte is paired with a zero
// byte. demux_u and demux_v repeat each chroma index twice, giving one chroma
// copy per luma sample; demux_y picks the odd byte positions.
// NOTE(review): only the first and last row of each initializer are visible in
// this excerpt.
605 const vector
unsigned char
606 demux_u
= (const vector
unsigned char)AVV(0x10,0x00,0x10,0x00,
609 0x10,0x0c,0x10,0x0c),
610 demux_v
= (const vector
unsigned char)AVV(0x10,0x02,0x10,0x02,
613 0x10,0x0E,0x10,0x0E),
614 demux_y
= (const vector
unsigned char)AVV(0x10,0x01,0x10,0x03,
617 0x10,0x0D,0x10,0x0F);
620 this is so I can play live CCIR raw video
// altivec_uyvy_rgb32: convert packed UYVY input to 32-bit pixels stored with
// out_rgba. The output pointer starts at row srcSliceY of oplanes[0].
// Each inner pass (w/16 passes per row) loads two 16-byte vectors from img,
// splits each into U, V and Y lanes with the demux tables, converts both
// halves with cvtyuvtoRGB, packs them with vec_packclp and stores 16 pixels.
// NOTE(review): several lines are not visible in this excerpt (declarations of
// w, h, i, j and img, the outer row loop, the advancing of img and out, the
// closing braces and the return statement).
622 static int altivec_uyvy_rgb32 (SwsContext
*c
,
623 unsigned char **in
, int *instrides
,
624 int srcSliceY
, int srcSliceH
,
625 unsigned char **oplanes
, int *outstrides
)
630 vector
unsigned char uyvy
;
631 vector
signed short Y
,U
,V
;
632 vector
signed short R0
,G0
,B0
,R1
,G1
,B1
;
633 vector
unsigned char R
,G
,B
;
634 vector
unsigned char *out
;
638 out
= (vector
unsigned char *)(oplanes
[0]+srcSliceY
*outstrides
[0]);
641 for (j
=0;j
<w
/16;j
++) {
// First 16 input bytes: demux and convert into R0/G0/B0.
642 uyvy
= vec_ld (0, img
);
643 U
= (vector
signed short)
644 vec_perm (uyvy
, (vector
unsigned char)AVV(0), demux_u
);
646 V
= (vector
signed short)
647 vec_perm (uyvy
, (vector
unsigned char)AVV(0), demux_v
);
649 Y
= (vector
signed short)
650 vec_perm (uyvy
, (vector
unsigned char)AVV(0), demux_y
);
652 cvtyuvtoRGB (c
, Y
,U
,V
,&R0
,&G0
,&B0
);
// Next 16 input bytes: demux and convert into R1/G1/B1.
654 uyvy
= vec_ld (16, img
);
655 U
= (vector
signed short)
656 vec_perm (uyvy
, (vector
unsigned char)AVV(0), demux_u
);
658 V
= (vector
signed short)
659 vec_perm (uyvy
, (vector
unsigned char)AVV(0), demux_v
);
661 Y
= (vector
signed short)
662 vec_perm (uyvy
, (vector
unsigned char)AVV(0), demux_y
);
664 cvtyuvtoRGB (c
, Y
,U
,V
,&R1
,&G1
,&B1
);
// Clamp and narrow both halves into one byte vector per channel.
666 R
= vec_packclp (R0
,R1
);
667 G
= vec_packclp (G0
,G1
);
668 B
= vec_packclp (B0
,B1
);
670 // vec_mstbgr24 (R,G,B, out);
671 out_rgba (R
,G
,B
,out
);
681 /* Ok currently the acceleration routine only supports
682 inputs whose width is a multiple of 16
683 and whose height is a multiple of 2
685 So we just fall back to the C codes for this.
// yuv2rgb_init_altivec: choose an AltiVec conversion routine for the context.
// Returns one of the altivec_* converters, or NULL when no accelerated routine
// applies.
// Checks performed, in order:
//   - c->flags is tested for SWS_CPU_CAPS_ALTIVEC;
//   - c->srcW must be a multiple of 16, otherwise NULL is returned;
//   - for PIX_FMT_YUV410P and PIX_FMT_YUV420P sources, c->srcH is tested for
//     oddness and the converter is picked from c->dstFormat;
//   - for PIX_FMT_UYVY422 sources, altivec_uyvy_rgb32 is the only candidate.
// Each successful selection is announced through av_log at AV_LOG_WARNING level.
// NOTE(review): PIX_FMT_YUV410P shares the 4:2:0 converters here - confirm that
// is intended.
// NOTE(review): the case labels of the inner switches, the statements guarded
// by the first and third if, and the closing braces are not visible in this
// excerpt; the free-text lines below belong to a comment whose delimiters are
// also missing.
687 SwsFunc
yuv2rgb_init_altivec (SwsContext
*c
)
689 if (!(c
->flags
& SWS_CPU_CAPS_ALTIVEC
))
693 and this seems not to matter too much I tried a bunch of
694 videos with abnormal widths and mplayer crashes else where.
695 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
696 boom with X11 bad match.
// Width must be a multiple of 16 for the vector loops.
699 if ((c
->srcW
& 0xf) != 0) return NULL
;
701 switch (c
->srcFormat
) {
702 case PIX_FMT_YUV410P
:
703 case PIX_FMT_YUV420P
:
704 /*case IMGFMT_CLPL: ??? */
// The planar converters consume two rows per pass, hence the height test.
708 if ((c
->srcH
& 0x1) != 0)
711 switch(c
->dstFormat
){
713 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space RGB24\n");
714 return altivec_yuv2_rgb24
;
716 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space BGR24\n");
717 return altivec_yuv2_bgr24
;
719 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space ARGB\n");
720 return altivec_yuv2_argb
;
722 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space ABGR\n");
723 return altivec_yuv2_abgr
;
725 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space RGBA\n");
726 return altivec_yuv2_rgba
;
728 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space BGRA\n");
729 return altivec_yuv2_bgra
;
730 default: return NULL
;
734 case PIX_FMT_UYVY422
:
735 switch(c
->dstFormat
){
737 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space UYVY -> RGB32\n");
738 return altivec_uyvy_rgb32
;
739 default: return NULL
;
// roundToInt16: round a value with 16 fractional bits to the nearest integer
// (adds 1<<15, then shifts right by 16) and saturate the result.
// Values below -0x7FFF yield 0x8000, values above 0x7FFF yield 0x7FFF.
// The result is returned as uint16_t, so negative results are carried as their
// 16-bit bit pattern.
// NOTE(review): the final in-range return and the closing brace are not visible
// in this excerpt.
747 static uint16_t roundToInt16(int64_t f
){
748 int r
= (f
+ (1<<15))>>16;
749 if(r
<-0x7FFF) return 0x8000;
750 else if(r
> 0x7FFF) return 0x7FFF;
// yuv2rgb_altivec_init_tables: compute the conversion coefficients and store
// them as splatted vectors in the context.
//   c          - receives CSHIFT, CY, OY, CRV, CBU, CGU and CGV
//   inv_table  - four colour-space coefficients (indices 0..3 feed crv, cbu,
//                cgu and cgv respectively)
//   brightness, contrast, saturation - adjustment inputs; contrast and
//                saturation are shifted right by 16 before being multiplied in
// Six signed 16-bit values are written to buf.tmp, then each one is broadcast
// to all lanes with vec_splat on buf.vec.
// NOTE(review): the declaration that groups tmp and vec into buf, the loop
// around the diagnostic printf and the closing braces are not visible in this
// excerpt.
754 void yuv2rgb_altivec_init_tables (SwsContext
*c
, const int inv_table
[4],int brightness
,int contrast
, int saturation
)
// 16-byte aligned scalar view of the coefficient vector.
757 signed short tmp
[8] __attribute__ ((aligned(16)));
758 vector
signed short vec
;
761 buf
.tmp
[0] = ( (0xffffLL
) * contrast
>>8 )>>9; //cy
762 buf
.tmp
[1] = -256*brightness
; //oy
// crv and cbu are pre-shifted right by 3; cvtyuvtoRGB shifts the chroma left by
// CSHIFT (2) before multiplying, which appears to bring them to the same
// effective scale as cgu and cgv (pre-shifted by 1, no chroma shift).
763 buf
.tmp
[2] = (inv_table
[0]>>3) *(contrast
>>16)*(saturation
>>16); //crv
764 buf
.tmp
[3] = (inv_table
[1]>>3) *(contrast
>>16)*(saturation
>>16); //cbu
// The two green coefficients are negated.
765 buf
.tmp
[4] = -((inv_table
[2]>>1)*(contrast
>>16)*(saturation
>>16)); //cgu
766 buf
.tmp
[5] = -((inv_table
[3]>>1)*(contrast
>>16)*(saturation
>>16)); //cgv
// Publish the coefficients: one broadcast vector per value.
769 c
->CSHIFT
= (vector
unsigned short)vec_splat_u16(2);
770 c
->CY
= vec_splat ((vector
signed short)buf
.vec
, 0);
771 c
->OY
= vec_splat ((vector
signed short)buf
.vec
, 1);
772 c
->CRV
= vec_splat ((vector
signed short)buf
.vec
, 2);
773 c
->CBU
= vec_splat ((vector
signed short)buf
.vec
, 3);
774 c
->CGU
= vec_splat ((vector
signed short)buf
.vec
, 4);
775 c
->CGV
= vec_splat ((vector
signed short)buf
.vec
, 5);
// Diagnostic dump of the six coefficients by name.
779 char *v
[6]={"cy","oy","crv","cbu","cgu","cgv"};
781 printf("%s %d ", v
[i
],buf
.tmp
[i
] );
// altivec_yuv2packedX: vertical filtering combined with YUV to RGB conversion
// into one packed destination row.
//   c                         - context; supplies vYCoeffsBank, vCCoeffsBank,
//                               dstFormat and the conversion coefficients
//   lumFilter, chrFilter      - not referenced in the visible code
//   lumSrc, lumFilterSize     - luma source rows and tap count
//   chrSrc, chrFilterSize     - chroma source rows and tap count
//   dest, dstW, dstY          - destination row, its width in pixels, row index
// For every group of 16 output pixels (i advances by 16 while i < dstW):
//   - luma taps are accumulated into Y0/Y1 and chroma taps into U/V with
//     vec_mradds, using the coefficient vectors at offset dstY*filterSize in
//     the two coefficient banks;
//   - V samples are read 2048 elements after the U samples of the same
//     chrSrc[j] row;
//   - the sums are shifted right by SCL (4) and clamped to [16,235];
//   - chroma lanes are duplicated so each covers two pixels;
//   - cvtyuvtoRGB and vec_packclp produce byte vectors, which are stored in the
//     layout selected by c->dstFormat.
// A second copy of the same pipeline writes into the aligned scratch buffer
// and copies from there into dest; presumably this handles a final group that
// cannot be stored directly - the guarding condition is not visible here.
// NOTE(review): several lines are not visible in this excerpt (the return type,
// declarations of i and j, initialisation of the Y0/Y1/U/V accumulators, the
// default labels of both switches and the closing braces).
790 altivec_yuv2packedX (SwsContext
*c
,
791 int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
792 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
793 uint8_t *dest
, int dstW
, int dstY
)
796 vector
signed short X
,X0
,X1
,Y0
,U0
,V0
,Y1
,U1
,V1
,U
,V
;
797 vector
signed short R0
,G0
,B0
,R1
,G1
,B1
;
799 vector
unsigned char R
,G
,B
;
800 vector
unsigned char *out
,*nout
;
802 vector
signed short RND
= vec_splat_s16(1<<3);
803 vector
unsigned short SCL
= vec_splat_u16(4);
// Aligned staging buffer used by the second copy of the pipeline.
804 unsigned long scratch
[16] __attribute__ ((aligned (16)));
806 vector
signed short *YCoeffs
, *CCoeffs
;
// Select the coefficient vectors that belong to output row dstY.
808 YCoeffs
= c
->vYCoeffsBank
+dstY
*lumFilterSize
;
809 CCoeffs
= c
->vCCoeffsBank
+dstY
*chrFilterSize
;
811 out
= (vector
unsigned char *)dest
;
813 for(i
=0; i
<dstW
; i
+=16){
816 /* extract 16 coeffs from lumSrc */
817 for(j
=0; j
<lumFilterSize
; j
++) {
818 X0
= vec_ld (0, &lumSrc
[j
][i
]);
819 X1
= vec_ld (16, &lumSrc
[j
][i
]);
820 Y0
= vec_mradds (X0
, YCoeffs
[j
], Y0
);
821 Y1
= vec_mradds (X1
, YCoeffs
[j
], Y1
);
826 /* extract 8 coeffs from U,V */
827 for(j
=0; j
<chrFilterSize
; j
++) {
828 X
= vec_ld (0, &chrSrc
[j
][i
/2]);
829 U
= vec_mradds (X
, CCoeffs
[j
], U
);
830 X
= vec_ld (0, &chrSrc
[j
][i
/2+2048]);
831 V
= vec_mradds (X
, CCoeffs
[j
], V
);
834 /* scale and clip signals */
835 Y0
= vec_sra (Y0
, SCL
);
836 Y1
= vec_sra (Y1
, SCL
);
837 U
= vec_sra (U
, SCL
);
838 V
= vec_sra (V
, SCL
);
840 Y0
= vec_clip_s16 (Y0
);
841 Y1
= vec_clip_s16 (Y1
);
842 U
= vec_clip_s16 (U
);
843 V
= vec_clip_s16 (V
);
846 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
847 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
849 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
850 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
851 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
854 U0
= vec_mergeh (U
,U
);
855 V0
= vec_mergeh (V
,V
);
857 U1
= vec_mergel (U
,U
);
858 V1
= vec_mergel (V
,V
);
860 cvtyuvtoRGB (c
, Y0
,U0
,V0
,&R0
,&G0
,&B0
);
861 cvtyuvtoRGB (c
, Y1
,U1
,V1
,&R1
,&G1
,&B1
);
863 R
= vec_packclp (R0
,R1
);
864 G
= vec_packclp (G0
,G1
);
865 B
= vec_packclp (B0
,B1
);
// Store straight into the destination row in the requested layout.
867 switch(c
->dstFormat
) {
868 case PIX_FMT_ABGR
: out_abgr (R
,G
,B
,out
); break;
869 case PIX_FMT_BGRA
: out_bgra (R
,G
,B
,out
); break;
870 case PIX_FMT_RGBA
: out_rgba (R
,G
,B
,out
); break;
871 case PIX_FMT_ARGB
: out_argb (R
,G
,B
,out
); break;
872 case PIX_FMT_RGB24
: out_rgb24 (R
,G
,B
,out
); break;
873 case PIX_FMT_BGR24
: out_bgr24 (R
,G
,B
,out
); break;
876 /* If this is reached, the caller should have called yuv2packedXinC
878 static int printed_error_message
;
879 if(!printed_error_message
) {
880 av_log(c
, AV_LOG_ERROR
, "altivec_yuv2packedX doesn't support %s output\n",
881 sws_format_name(c
->dstFormat
));
882 printed_error_message
=1;
894 /* extract 16 coeffs from lumSrc */
895 for(j
=0; j
<lumFilterSize
; j
++) {
896 X0
= vec_ld (0, &lumSrc
[j
][i
]);
897 X1
= vec_ld (16, &lumSrc
[j
][i
]);
898 Y0
= vec_mradds (X0
, YCoeffs
[j
], Y0
);
899 Y1
= vec_mradds (X1
, YCoeffs
[j
], Y1
);
904 /* extract 8 coeffs from U,V */
905 for(j
=0; j
<chrFilterSize
; j
++) {
906 X
= vec_ld (0, &chrSrc
[j
][i
/2]);
907 U
= vec_mradds (X
, CCoeffs
[j
], U
);
908 X
= vec_ld (0, &chrSrc
[j
][i
/2+2048]);
909 V
= vec_mradds (X
, CCoeffs
[j
], V
);
912 /* scale and clip signals */
913 Y0
= vec_sra (Y0
, SCL
);
914 Y1
= vec_sra (Y1
, SCL
);
915 U
= vec_sra (U
, SCL
);
916 V
= vec_sra (V
, SCL
);
918 Y0
= vec_clip_s16 (Y0
);
919 Y1
= vec_clip_s16 (Y1
);
920 U
= vec_clip_s16 (U
);
921 V
= vec_clip_s16 (V
);
924 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
925 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
927 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
928 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
929 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
932 U0
= vec_mergeh (U
,U
);
933 V0
= vec_mergeh (V
,V
);
935 U1
= vec_mergel (U
,U
);
936 V1
= vec_mergel (V
,V
);
938 cvtyuvtoRGB (c
, Y0
,U0
,V0
,&R0
,&G0
,&B0
);
939 cvtyuvtoRGB (c
, Y1
,U1
,V1
,&R1
,&G1
,&B1
);
941 R
= vec_packclp (R0
,R1
);
942 G
= vec_packclp (G0
,G1
);
943 B
= vec_packclp (B0
,B1
);
// This copy of the pipeline stores into the aligned scratch buffer, not dest.
945 nout
= (vector
unsigned char *)scratch
;
946 switch(c
->dstFormat
) {
947 case PIX_FMT_ABGR
: out_abgr (R
,G
,B
,nout
); break;
948 case PIX_FMT_BGRA
: out_bgra (R
,G
,B
,nout
); break;
949 case PIX_FMT_RGBA
: out_rgba (R
,G
,B
,nout
); break;
950 case PIX_FMT_ARGB
: out_argb (R
,G
,B
,nout
); break;
951 case PIX_FMT_RGB24
: out_rgb24 (R
,G
,B
,nout
); break;
952 case PIX_FMT_BGR24
: out_bgr24 (R
,G
,B
,nout
); break;
954 /* Unreachable, I think. */
955 av_log(c
, AV_LOG_ERROR
, "altivec_yuv2packedX doesn't support %s output\n",
956 sws_format_name(c
->dstFormat
));
// Copy the staged pixels from scratch into dest at pixel index i.
// NOTE(review): the byte count is (dstW-i)/4. The destination is indexed in
// uint32_t pixels, so the remaining pixels would occupy (dstW-i)*4 bytes; the
// division looks like it copies too little. The uint32_t indexing also does
// not match the 24-bit layouts handled above. Confirm the intended size.
960 memcpy (&((uint32_t*)dest
)[i
], scratch
, (dstW
-i
)/4);