2 marc.hoffman@analog.com March 8, 2004
4 AltiVec acceleration for colorspace conversion revision 0.2
6 convert I420 YV12 to RGB in various formats,
7 it rejects images that are not in 420 formats
8 it rejects images that don't have widths of multiples of 16
9 it rejects images that don't have heights of multiples of 2
10 reject defers to C simulation codes.
12 lots of optimizations to be done here
14 1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15 so we currently use max min to clip
17 2. the inefficient use of chroma loading needs a bit of brushing up
19 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
22 MODIFIED to calculate coeffs from currently selected color space.
23 MODIFIED core to be a macro which you spec the output format.
24 ADDED UYVY conversion which is never called due to something in SWSCALE.
25 CORRECTED algorithm selection to be strict on input formats.
26 ADDED runtime detection of altivec.
28 ADDED altivec_yuv2packedX vertical scl + RGB converter
33 The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
38 so we have roughly 10 clocks per pixel; this is too high, something has to be wrong.
40 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
42 OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43 guaranteed to have the input video frame it was just decompressed so
44 it probably resides in L1 caches. However we are creating the
45 output video stream this needs to use the DSTST instruction to
46 optimize for the cache. We couple this with the fact that we are
47 not going to be visiting the input buffer again so we mark it Least
48 Recently Used. This shaves 25% of the processor cycles off.
50 Now MEMCPY is the largest mips consumer in the system, probably due
51 to the inefficient X11 stuff.
53 GL libraries seem to be very slow on this machine 1.33Ghz PB running
54 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
55 a versioning issue; however, I have libGL.1.2.dylib for both
56 machines. ((We need to figure this out now))
58 GL2 libraries work now with patch for RGB32
60 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
62 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
66 * This file is part of FFmpeg.
68 * FFmpeg is free software; you can redistribute it and/or modify
69 * it under the terms of the GNU General Public License as published by
70 * the Free Software Foundation; either version 2 of the License, or
71 * (at your option) any later version.
73 * FFmpeg is distributed in the hope that it will be useful,
74 * but WITHOUT ANY WARRANTY; without even the implied warranty of
75 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
76 * GNU General Public License for more details.
78 * You should have received a copy of the GNU General Public License
79 * along with FFmpeg; if not, write to the Free Software
80 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
94 #include "swscale_internal.h"
96 #undef PROFILE_THE_BEAST
/* Short aliases for 8-bit samples: sbyte is signed, ubyte is unsigned. */
typedef signed char sbyte;
typedef unsigned char ubyte;
103 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
104 homogeneous vector registers x0,x1,x2 are interleaved with the
107 o0 = vec_mergeh (x0,x1);
108 o1 = vec_perm (o0, x2, perm_rgb_0);
109 o2 = vec_perm (o0, x2, perm_rgb_1);
110 o3 = vec_mergel (x0,x1);
111 o4 = vec_perm (o3,o2,perm_rgb_2);
112 o5 = vec_perm (o3,o2,perm_rgb_3);
114 perm_rgb_0: o0(RG).h v1(B) --> o1*
120 perm_rgb_1: o0(RG).h v1(B) --> o2
126 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
132 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
140 const vector
unsigned char
141 perm_rgb_0
= (const vector
unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
142 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
143 perm_rgb_1
= (const vector
unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
144 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
145 perm_rgb_2
= (const vector
unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
146 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
147 perm_rgb_3
= (const vector
unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
148 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
150 #define vec_merge3(x2,x1,x0,y0,y1,y2) \
152 typeof(x0) o0,o2,o3; \
153 o0 = vec_mergeh (x0,x1); \
154 y0 = vec_perm (o0, x2, perm_rgb_0); \
155 o2 = vec_perm (o0, x2, perm_rgb_1); \
156 o3 = vec_mergel (x0,x1); \
157 y1 = vec_perm (o3,o2,perm_rgb_2); \
158 y2 = vec_perm (o3,o2,perm_rgb_3); \
161 #define vec_mstbgr24(x0,x1,x2,ptr) \
163 typeof(x0) _0,_1,_2; \
164 vec_merge3 (x0,x1,x2,_0,_1,_2); \
165 vec_st (_0, 0, ptr++); \
166 vec_st (_1, 0, ptr++); \
167 vec_st (_2, 0, ptr++); \
170 #define vec_mstrgb24(x0,x1,x2,ptr) \
172 typeof(x0) _0,_1,_2; \
173 vec_merge3 (x2,x1,x0,_0,_1,_2); \
174 vec_st (_0, 0, ptr++); \
175 vec_st (_1, 0, ptr++); \
176 vec_st (_2, 0, ptr++); \
179 /* pack the pixels in rgb0 format
183 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
186 _0 = vec_mergeh (x0,x1); \
187 _1 = vec_mergeh (x2,x3); \
188 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
189 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
190 vec_st (_2, 0*16, (T *)ptr); \
191 vec_st (_3, 1*16, (T *)ptr); \
192 _0 = vec_mergel (x0,x1); \
193 _1 = vec_mergel (x2,x3); \
194 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
195 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
196 vec_st (_2, 2*16, (T *)ptr); \
197 vec_st (_3, 3*16, (T *)ptr); \
204 | 1 -0.3441 -0.7142 |x| Cb|
211 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
219 (vector signed short) \
220 vec_perm(x,(typeof(x))AVV(0),\
221 (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
222 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
224 (vector signed short) \
225 vec_perm(x,(typeof(x))AVV(0),\
226 (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
227 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
/*
 * Clamp every signed 16-bit lane of x to the range [16, 235]: the upper
 * bound is applied first (vec_min), then the lower bound (vec_max).
 */
#define vec_clip_s16(x)                                                        \
    vec_max (                                                                  \
        vec_min (x,                                                            \
                 (vector signed short)AVV(235,235,235,235,235,235,235,235)),   \
        (vector signed short)AVV( 16, 16, 16, 16, 16, 16, 16, 16))
/*
 * Pack two vectors of signed 16-bit lanes (x, then y) into one vector of
 * unsigned bytes.  Negative lanes are first raised to zero with vec_max; the
 * results are then reinterpreted as unsigned shorts and narrowed by vec_packs.
 */
#define vec_packclp(x,y)                                                       \
    (vector unsigned char)                                                     \
        vec_packs (                                                            \
            (vector unsigned short)vec_max (x, (vector signed short) AVV(0)),  \
            (vector unsigned short)vec_max (y, (vector signed short) AVV(0)))
238 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
241 static inline void cvtyuvtoRGB (SwsContext
*c
,
242 vector
signed short Y
, vector
signed short U
, vector
signed short V
,
243 vector
signed short *R
, vector
signed short *G
, vector
signed short *B
)
245 vector
signed short vx
,ux
,uvx
;
247 Y
= vec_mradds (Y
, c
->CY
, c
->OY
);
248 U
= vec_sub (U
,(vector
signed short)
249 vec_splat((vector
signed short)AVV(128),0));
250 V
= vec_sub (V
,(vector
signed short)
251 vec_splat((vector
signed short)AVV(128),0));
253 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
254 ux
= vec_sl (U
, c
->CSHIFT
);
255 *B
= vec_mradds (ux
, c
->CBU
, Y
);
257 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
258 vx
= vec_sl (V
, c
->CSHIFT
);
259 *R
= vec_mradds (vx
, c
->CRV
, Y
);
261 // uvx = ((CGU*u) + (CGV*v))>>15;
262 uvx
= vec_mradds (U
, c
->CGU
, Y
);
263 *G
= vec_mradds (V
, c
->CGV
, uvx
);
268 ------------------------------------------------------------------------------
270 ------------------------------------------------------------------------------
274 #define DEFCSP420_CVT(name,out_pixels) \
275 static int altivec_##name (SwsContext *c, \
276 unsigned char **in, int *instrides, \
277 int srcSliceY, int srcSliceH, \
278 unsigned char **oplanes, int *outstrides) \
283 int instrides_scl[3]; \
284 vector unsigned char y0,y1; \
286 vector signed char u,v; \
288 vector signed short Y0,Y1,Y2,Y3; \
289 vector signed short U,V; \
290 vector signed short vx,ux,uvx; \
291 vector signed short vx0,ux0,uvx0; \
292 vector signed short vx1,ux1,uvx1; \
293 vector signed short R0,G0,B0; \
294 vector signed short R1,G1,B1; \
295 vector unsigned char R,G,B; \
297 vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
298 vector unsigned char align_perm; \
300 vector signed short \
308 vector unsigned short lCSHIFT = c->CSHIFT; \
310 ubyte *y1i = in[0]; \
311 ubyte *y2i = in[0]+instrides[0]; \
315 vector unsigned char *oute \
316 = (vector unsigned char *) \
317 (oplanes[0]+srcSliceY*outstrides[0]); \
318 vector unsigned char *outo \
319 = (vector unsigned char *) \
320 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
323 instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
324 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
325 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
328 for (i=0;i<h/2;i++) { \
329 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
330 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
332 for (j=0;j<w/16;j++) { \
334 y1ivP = (vector unsigned char *)y1i; \
335 y2ivP = (vector unsigned char *)y2i; \
336 uivP = (vector unsigned char *)ui; \
337 vivP = (vector unsigned char *)vi; \
339 align_perm = vec_lvsl (0, y1i); \
340 y0 = (vector unsigned char) \
341 vec_perm (y1ivP[0], y1ivP[1], align_perm); \
343 align_perm = vec_lvsl (0, y2i); \
344 y1 = (vector unsigned char) \
345 vec_perm (y2ivP[0], y2ivP[1], align_perm); \
347 align_perm = vec_lvsl (0, ui); \
348 u = (vector signed char) \
349 vec_perm (uivP[0], uivP[1], align_perm); \
351 align_perm = vec_lvsl (0, vi); \
352 v = (vector signed char) \
353 vec_perm (vivP[0], vivP[1], align_perm); \
355 u = (vector signed char) \
356 vec_sub (u,(vector signed char) \
357 vec_splat((vector signed char)AVV(128),0)); \
358 v = (vector signed char) \
359 vec_sub (v,(vector signed char) \
360 vec_splat((vector signed char)AVV(128),0)); \
362 U = vec_unpackh (u); \
363 V = vec_unpackh (v); \
371 Y0 = vec_mradds (Y0, lCY, lOY); \
372 Y1 = vec_mradds (Y1, lCY, lOY); \
373 Y2 = vec_mradds (Y2, lCY, lOY); \
374 Y3 = vec_mradds (Y3, lCY, lOY); \
376 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
377 ux = vec_sl (U, lCSHIFT); \
378 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); \
379 ux0 = vec_mergeh (ux,ux); \
380 ux1 = vec_mergel (ux,ux); \
382 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
383 vx = vec_sl (V, lCSHIFT); \
384 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); \
385 vx0 = vec_mergeh (vx,vx); \
386 vx1 = vec_mergel (vx,vx); \
388 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
389 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); \
390 uvx = vec_mradds (V, lCGV, uvx); \
391 uvx0 = vec_mergeh (uvx,uvx); \
392 uvx1 = vec_mergel (uvx,uvx); \
394 R0 = vec_add (Y0,vx0); \
395 G0 = vec_add (Y0,uvx0); \
396 B0 = vec_add (Y0,ux0); \
397 R1 = vec_add (Y1,vx1); \
398 G1 = vec_add (Y1,uvx1); \
399 B1 = vec_add (Y1,ux1); \
401 R = vec_packclp (R0,R1); \
402 G = vec_packclp (G0,G1); \
403 B = vec_packclp (B0,B1); \
405 out_pixels(R,G,B,oute); \
407 R0 = vec_add (Y2,vx0); \
408 G0 = vec_add (Y2,uvx0); \
409 B0 = vec_add (Y2,ux0); \
410 R1 = vec_add (Y3,vx1); \
411 G1 = vec_add (Y3,uvx1); \
412 B1 = vec_add (Y3,ux1); \
413 R = vec_packclp (R0,R1); \
414 G = vec_packclp (G0,G1); \
415 B = vec_packclp (B0,B1); \
418 out_pixels(R,G,B,outo); \
427 outo += (outstrides[0])>>4; \
428 oute += (outstrides[0])>>4; \
430 ui += instrides_scl[1]; \
431 vi += instrides_scl[2]; \
432 y1i += instrides_scl[0]; \
433 y2i += instrides_scl[0]; \
/*
 * Output adapters: each takes the packed R, G and B byte vectors plus the
 * destination pointer and forwards them to the matching store macro in the
 * channel order its name spells out.  For the 32-bit layouts an all-zero
 * vector fills the fourth channel.
 */
#define out_abgr(r,g,b,dst)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),b,g,r,dst)
#define out_bgra(r,g,b,dst)  vec_mstrgb32(typeof(r),b,g,r,((typeof (r))AVV(0)),dst)
#define out_rgba(r,g,b,dst)  vec_mstrgb32(typeof(r),r,g,b,((typeof (r))AVV(0)),dst)
#define out_argb(r,g,b,dst)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),r,g,b,dst)
#define out_rgb24(r,g,b,dst) vec_mstrgb24(r,g,b,dst)
#define out_bgr24(r,g,b,dst) vec_mstbgr24(r,g,b,dst)
446 DEFCSP420_CVT (yuv2_abgr
, out_abgr
)
448 DEFCSP420_CVT (yuv2_bgra
, out_bgra
)
450 static int altivec_yuv2_bgra32 (SwsContext
*c
,
451 unsigned char **in
, int *instrides
,
452 int srcSliceY
, int srcSliceH
,
453 unsigned char **oplanes
, int *outstrides
)
458 int instrides_scl
[3];
459 vector
unsigned char y0
,y1
;
461 vector
signed char u
,v
;
463 vector
signed short Y0
,Y1
,Y2
,Y3
;
464 vector
signed short U
,V
;
465 vector
signed short vx
,ux
,uvx
;
466 vector
signed short vx0
,ux0
,uvx0
;
467 vector
signed short vx1
,ux1
,uvx1
;
468 vector
signed short R0
,G0
,B0
;
469 vector
signed short R1
,G1
,B1
;
470 vector
unsigned char R
,G
,B
;
472 vector
unsigned char *uivP
, *vivP
;
473 vector
unsigned char align_perm
;
483 vector
unsigned short lCSHIFT
= c
->CSHIFT
;
486 ubyte
*y2i
= in
[0]+w
;
490 vector
unsigned char *oute
491 = (vector
unsigned char *)
492 (oplanes
[0]+srcSliceY
*outstrides
[0]);
493 vector
unsigned char *outo
494 = (vector
unsigned char *)
495 (oplanes
[0]+srcSliceY
*outstrides
[0]+outstrides
[0]);
498 instrides_scl
[0] = instrides
[0];
499 instrides_scl
[1] = instrides
[1]-w
/2; /* the loop moves ui by w/2 */
500 instrides_scl
[2] = instrides
[2]-w
/2; /* the loop moves vi by w/2 */
503 for (i
=0;i
<h
/2;i
++) {
504 vec_dstst (outo
, (0x02000002|(((w
*3+32)/32)<<16)), 0);
505 vec_dstst (oute
, (0x02000002|(((w
*3+32)/32)<<16)), 1);
507 for (j
=0;j
<w
/16;j
++) {
509 y0
= vec_ldl (0,y1i
);
510 y1
= vec_ldl (0,y2i
);
511 uivP
= (vector
unsigned char *)ui
;
512 vivP
= (vector
unsigned char *)vi
;
514 align_perm
= vec_lvsl (0, ui
);
515 u
= (vector
signed char)vec_perm (uivP
[0], uivP
[1], align_perm
);
517 align_perm
= vec_lvsl (0, vi
);
518 v
= (vector
signed char)vec_perm (vivP
[0], vivP
[1], align_perm
);
519 u
= (vector
signed char)
520 vec_sub (u
,(vector
signed char)
521 vec_splat((vector
signed char)AVV(128),0));
523 v
= (vector
signed char)
524 vec_sub (v
, (vector
signed char)
525 vec_splat((vector
signed char)AVV(128),0));
536 Y0
= vec_mradds (Y0
, lCY
, lOY
);
537 Y1
= vec_mradds (Y1
, lCY
, lOY
);
538 Y2
= vec_mradds (Y2
, lCY
, lOY
);
539 Y3
= vec_mradds (Y3
, lCY
, lOY
);
541 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
542 ux
= vec_sl (U
, lCSHIFT
);
543 ux
= vec_mradds (ux
, lCBU
, (vector
signed short)AVV(0));
544 ux0
= vec_mergeh (ux
,ux
);
545 ux1
= vec_mergel (ux
,ux
);
547 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
548 vx
= vec_sl (V
, lCSHIFT
);
549 vx
= vec_mradds (vx
, lCRV
, (vector
signed short)AVV(0));
550 vx0
= vec_mergeh (vx
,vx
);
551 vx1
= vec_mergel (vx
,vx
);
552 /* uvx = ((CGU*u) + (CGV*v))>>15 */
553 uvx
= vec_mradds (U
, lCGU
, (vector
signed short)AVV(0));
554 uvx
= vec_mradds (V
, lCGV
, uvx
);
555 uvx0
= vec_mergeh (uvx
,uvx
);
556 uvx1
= vec_mergel (uvx
,uvx
);
557 R0
= vec_add (Y0
,vx0
);
558 G0
= vec_add (Y0
,uvx0
);
559 B0
= vec_add (Y0
,ux0
);
560 R1
= vec_add (Y1
,vx1
);
561 G1
= vec_add (Y1
,uvx1
);
562 B1
= vec_add (Y1
,ux1
);
563 R
= vec_packclp (R0
,R1
);
564 G
= vec_packclp (G0
,G1
);
565 B
= vec_packclp (B0
,B1
);
567 out_argb(R
,G
,B
,oute
);
568 R0
= vec_add (Y2
,vx0
);
569 G0
= vec_add (Y2
,uvx0
);
570 B0
= vec_add (Y2
,ux0
);
571 R1
= vec_add (Y3
,vx1
);
572 G1
= vec_add (Y3
,uvx1
);
573 B1
= vec_add (Y3
,ux1
);
574 R
= vec_packclp (R0
,R1
);
575 G
= vec_packclp (G0
,G1
);
576 B
= vec_packclp (B0
,B1
);
578 out_argb(R
,G
,B
,outo
);
586 outo
+= (outstrides
[0])>>4;
587 oute
+= (outstrides
[0])>>4;
589 ui
+= instrides_scl
[1];
590 vi
+= instrides_scl
[2];
591 y1i
+= instrides_scl
[0];
592 y2i
+= instrides_scl
[0];
600 DEFCSP420_CVT (yuv2_rgba
, out_rgba
)
601 DEFCSP420_CVT (yuv2_argb
, out_argb
)
602 DEFCSP420_CVT (yuv2_rgb24
, out_rgb24
)
603 DEFCSP420_CVT (yuv2_bgr24
, out_bgr24
)
606 // uyvy|uyvy|uyvy|uyvy
607 // 0123 4567 89ab cdef
609 const vector
unsigned char
610 demux_u
= (const vector
unsigned char)AVV(0x10,0x00,0x10,0x00,
613 0x10,0x0c,0x10,0x0c),
614 demux_v
= (const vector
unsigned char)AVV(0x10,0x02,0x10,0x02,
617 0x10,0x0E,0x10,0x0E),
618 demux_y
= (const vector
unsigned char)AVV(0x10,0x01,0x10,0x03,
621 0x10,0x0D,0x10,0x0F);
624 this is so I can play live CCIR raw video
626 static int altivec_uyvy_rgb32 (SwsContext
*c
,
627 unsigned char **in
, int *instrides
,
628 int srcSliceY
, int srcSliceH
,
629 unsigned char **oplanes
, int *outstrides
)
634 vector
unsigned char uyvy
;
635 vector
signed short Y
,U
,V
;
636 vector
signed short R0
,G0
,B0
,R1
,G1
,B1
;
637 vector
unsigned char R
,G
,B
;
638 vector
unsigned char *out
;
642 out
= (vector
unsigned char *)(oplanes
[0]+srcSliceY
*outstrides
[0]);
645 for (j
=0;j
<w
/16;j
++) {
646 uyvy
= vec_ld (0, img
);
647 U
= (vector
signed short)
648 vec_perm (uyvy
, (vector
unsigned char)AVV(0), demux_u
);
650 V
= (vector
signed short)
651 vec_perm (uyvy
, (vector
unsigned char)AVV(0), demux_v
);
653 Y
= (vector
signed short)
654 vec_perm (uyvy
, (vector
unsigned char)AVV(0), demux_y
);
656 cvtyuvtoRGB (c
, Y
,U
,V
,&R0
,&G0
,&B0
);
658 uyvy
= vec_ld (16, img
);
659 U
= (vector
signed short)
660 vec_perm (uyvy
, (vector
unsigned char)AVV(0), demux_u
);
662 V
= (vector
signed short)
663 vec_perm (uyvy
, (vector
unsigned char)AVV(0), demux_v
);
665 Y
= (vector
signed short)
666 vec_perm (uyvy
, (vector
unsigned char)AVV(0), demux_y
);
668 cvtyuvtoRGB (c
, Y
,U
,V
,&R1
,&G1
,&B1
);
670 R
= vec_packclp (R0
,R1
);
671 G
= vec_packclp (G0
,G1
);
672 B
= vec_packclp (B0
,B1
);
674 // vec_mstbgr24 (R,G,B, out);
675 out_rgba (R
,G
,B
,out
);
685 /* Ok currently the acceleration routine only supports
686 inputs of widths a multiple of 16
687 and heights a multiple of 2
689 So we just fall back to the C codes for this.
691 SwsFunc
yuv2rgb_init_altivec (SwsContext
*c
)
693 if (!(c
->flags
& SWS_CPU_CAPS_ALTIVEC
))
697 and this seems not to matter too much I tried a bunch of
698 videos with abnormal widths and mplayer crashes else where.
699 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
700 boom with X11 bad match.
703 if ((c
->srcW
& 0xf) != 0) return NULL
;
705 switch (c
->srcFormat
) {
706 case PIX_FMT_YUV410P
:
707 case PIX_FMT_YUV420P
:
708 /*case IMGFMT_CLPL: ??? */
712 if ((c
->srcH
& 0x1) != 0)
715 switch(c
->dstFormat
){
717 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space RGB24\n");
718 return altivec_yuv2_rgb24
;
720 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space BGR24\n");
721 return altivec_yuv2_bgr24
;
723 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space ARGB\n");
724 return altivec_yuv2_argb
;
726 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space ABGR\n");
727 return altivec_yuv2_abgr
;
729 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space RGBA\n");
730 return altivec_yuv2_rgba
;
732 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space BGRA\n");
733 return altivec_yuv2_bgra
;
734 default: return NULL
;
738 case PIX_FMT_UYVY422
:
739 switch(c
->dstFormat
){
741 av_log(c
, AV_LOG_WARNING
, "ALTIVEC: Color Space UYVY -> RGB32\n");
742 return altivec_uyvy_rgb32
;
743 default: return NULL
;
/**
 * Round a 16.16 fixed-point value to the nearest integer and saturate the
 * result to the signed 16-bit range, returned as its uint16_t bit pattern.
 *
 * @param f value scaled by 1<<16
 * @return  0x8000 when the rounded value is below -0x7FFF, 0x7FFF when it is
 *          above 0x7FFF, otherwise the rounded value itself
 */
static uint16_t roundToInt16(int64_t f)
{
    /*
     * Equivalent to (f + (1<<15)) >> 16, but written as "floor quotient plus
     * the rounding bit" so the addition cannot overflow for f near INT64_MAX.
     * The result stays 64 bits wide until after the range check: narrowing
     * to int first would let large magnitudes wrap back into range and
     * escape saturation.
     */
    const int64_t r = (f >> 16) + ((f >> 15) & 1);

    if (r < -0x7FFF)
        return 0x8000;
    if (r > 0x7FFF)
        return 0x7FFF;
    return (uint16_t)r;
}
758 void yuv2rgb_altivec_init_tables (SwsContext
*c
, const int inv_table
[4],int brightness
,int contrast
, int saturation
)
761 signed short tmp
[8] __attribute__ ((aligned(16)));
762 vector
signed short vec
;
765 buf
.tmp
[0] = ( (0xffffLL
) * contrast
>>8 )>>9; //cy
766 buf
.tmp
[1] = -256*brightness
; //oy
767 buf
.tmp
[2] = (inv_table
[0]>>3) *(contrast
>>16)*(saturation
>>16); //crv
768 buf
.tmp
[3] = (inv_table
[1]>>3) *(contrast
>>16)*(saturation
>>16); //cbu
769 buf
.tmp
[4] = -((inv_table
[2]>>1)*(contrast
>>16)*(saturation
>>16)); //cgu
770 buf
.tmp
[5] = -((inv_table
[3]>>1)*(contrast
>>16)*(saturation
>>16)); //cgv
773 c
->CSHIFT
= (vector
unsigned short)vec_splat_u16(2);
774 c
->CY
= vec_splat ((vector
signed short)buf
.vec
, 0);
775 c
->OY
= vec_splat ((vector
signed short)buf
.vec
, 1);
776 c
->CRV
= vec_splat ((vector
signed short)buf
.vec
, 2);
777 c
->CBU
= vec_splat ((vector
signed short)buf
.vec
, 3);
778 c
->CGU
= vec_splat ((vector
signed short)buf
.vec
, 4);
779 c
->CGV
= vec_splat ((vector
signed short)buf
.vec
, 5);
783 char *v
[6]={"cy","oy","crv","cbu","cgu","cgv"};
785 printf("%s %d ", v
[i
],buf
.tmp
[i
] );
794 altivec_yuv2packedX (SwsContext
*c
,
795 int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
796 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
797 uint8_t *dest
, int dstW
, int dstY
)
800 vector
signed short X
,X0
,X1
,Y0
,U0
,V0
,Y1
,U1
,V1
,U
,V
;
801 vector
signed short R0
,G0
,B0
,R1
,G1
,B1
;
803 vector
unsigned char R
,G
,B
;
804 vector
unsigned char *out
,*nout
;
806 vector
signed short RND
= vec_splat_s16(1<<3);
807 vector
unsigned short SCL
= vec_splat_u16(4);
808 unsigned long scratch
[16] __attribute__ ((aligned (16)));
810 vector
signed short *YCoeffs
, *CCoeffs
;
812 YCoeffs
= c
->vYCoeffsBank
+dstY
*lumFilterSize
;
813 CCoeffs
= c
->vCCoeffsBank
+dstY
*chrFilterSize
;
815 out
= (vector
unsigned char *)dest
;
817 for (i
=0; i
<dstW
; i
+=16){
820 /* extract 16 coeffs from lumSrc */
821 for (j
=0; j
<lumFilterSize
; j
++) {
822 X0
= vec_ld (0, &lumSrc
[j
][i
]);
823 X1
= vec_ld (16, &lumSrc
[j
][i
]);
824 Y0
= vec_mradds (X0
, YCoeffs
[j
], Y0
);
825 Y1
= vec_mradds (X1
, YCoeffs
[j
], Y1
);
830 /* extract 8 coeffs from U,V */
831 for (j
=0; j
<chrFilterSize
; j
++) {
832 X
= vec_ld (0, &chrSrc
[j
][i
/2]);
833 U
= vec_mradds (X
, CCoeffs
[j
], U
);
834 X
= vec_ld (0, &chrSrc
[j
][i
/2+2048]);
835 V
= vec_mradds (X
, CCoeffs
[j
], V
);
838 /* scale and clip signals */
839 Y0
= vec_sra (Y0
, SCL
);
840 Y1
= vec_sra (Y1
, SCL
);
841 U
= vec_sra (U
, SCL
);
842 V
= vec_sra (V
, SCL
);
844 Y0
= vec_clip_s16 (Y0
);
845 Y1
= vec_clip_s16 (Y1
);
846 U
= vec_clip_s16 (U
);
847 V
= vec_clip_s16 (V
);
850 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
851 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
853 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
854 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
855 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
858 U0
= vec_mergeh (U
,U
);
859 V0
= vec_mergeh (V
,V
);
861 U1
= vec_mergel (U
,U
);
862 V1
= vec_mergel (V
,V
);
864 cvtyuvtoRGB (c
, Y0
,U0
,V0
,&R0
,&G0
,&B0
);
865 cvtyuvtoRGB (c
, Y1
,U1
,V1
,&R1
,&G1
,&B1
);
867 R
= vec_packclp (R0
,R1
);
868 G
= vec_packclp (G0
,G1
);
869 B
= vec_packclp (B0
,B1
);
871 switch(c
->dstFormat
) {
872 case PIX_FMT_ABGR
: out_abgr (R
,G
,B
,out
); break;
873 case PIX_FMT_BGRA
: out_bgra (R
,G
,B
,out
); break;
874 case PIX_FMT_RGBA
: out_rgba (R
,G
,B
,out
); break;
875 case PIX_FMT_ARGB
: out_argb (R
,G
,B
,out
); break;
876 case PIX_FMT_RGB24
: out_rgb24 (R
,G
,B
,out
); break;
877 case PIX_FMT_BGR24
: out_bgr24 (R
,G
,B
,out
); break;
880 /* If this is reached, the caller should have called yuv2packedXinC
882 static int printed_error_message
;
883 if (!printed_error_message
) {
884 av_log(c
, AV_LOG_ERROR
, "altivec_yuv2packedX doesn't support %s output\n",
885 sws_format_name(c
->dstFormat
));
886 printed_error_message
=1;
898 /* extract 16 coeffs from lumSrc */
899 for (j
=0; j
<lumFilterSize
; j
++) {
900 X0
= vec_ld (0, &lumSrc
[j
][i
]);
901 X1
= vec_ld (16, &lumSrc
[j
][i
]);
902 Y0
= vec_mradds (X0
, YCoeffs
[j
], Y0
);
903 Y1
= vec_mradds (X1
, YCoeffs
[j
], Y1
);
908 /* extract 8 coeffs from U,V */
909 for (j
=0; j
<chrFilterSize
; j
++) {
910 X
= vec_ld (0, &chrSrc
[j
][i
/2]);
911 U
= vec_mradds (X
, CCoeffs
[j
], U
);
912 X
= vec_ld (0, &chrSrc
[j
][i
/2+2048]);
913 V
= vec_mradds (X
, CCoeffs
[j
], V
);
916 /* scale and clip signals */
917 Y0
= vec_sra (Y0
, SCL
);
918 Y1
= vec_sra (Y1
, SCL
);
919 U
= vec_sra (U
, SCL
);
920 V
= vec_sra (V
, SCL
);
922 Y0
= vec_clip_s16 (Y0
);
923 Y1
= vec_clip_s16 (Y1
);
924 U
= vec_clip_s16 (U
);
925 V
= vec_clip_s16 (V
);
928 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
929 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
931 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
932 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
933 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
936 U0
= vec_mergeh (U
,U
);
937 V0
= vec_mergeh (V
,V
);
939 U1
= vec_mergel (U
,U
);
940 V1
= vec_mergel (V
,V
);
942 cvtyuvtoRGB (c
, Y0
,U0
,V0
,&R0
,&G0
,&B0
);
943 cvtyuvtoRGB (c
, Y1
,U1
,V1
,&R1
,&G1
,&B1
);
945 R
= vec_packclp (R0
,R1
);
946 G
= vec_packclp (G0
,G1
);
947 B
= vec_packclp (B0
,B1
);
949 nout
= (vector
unsigned char *)scratch
;
950 switch(c
->dstFormat
) {
951 case PIX_FMT_ABGR
: out_abgr (R
,G
,B
,nout
); break;
952 case PIX_FMT_BGRA
: out_bgra (R
,G
,B
,nout
); break;
953 case PIX_FMT_RGBA
: out_rgba (R
,G
,B
,nout
); break;
954 case PIX_FMT_ARGB
: out_argb (R
,G
,B
,nout
); break;
955 case PIX_FMT_RGB24
: out_rgb24 (R
,G
,B
,nout
); break;
956 case PIX_FMT_BGR24
: out_bgr24 (R
,G
,B
,nout
); break;
958 /* Unreachable, I think. */
959 av_log(c
, AV_LOG_ERROR
, "altivec_yuv2packedX doesn't support %s output\n",
960 sws_format_name(c
->dstFormat
));
964 memcpy (&((uint32_t*)dest
)[i
], scratch
, (dstW
-i
)/4);