2 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * Blackfin Video Color Space Converters Operations
6 * convert I420 YV12 to RGB in various formats,
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28 and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts
31 The following calculation is used for the conversion:
33 r = clipz((y-oy)*cy + crv*(v-128))
34 g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
35 b = clipz((y-oy)*cy + cbu*(u-128))
37 y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
40 New factorization to eliminate the truncation error which was
41 occurring due to the byteop3p.
44 1) use the byteop16m to subtract quad bytes; we use this in U8 mode,
45 so the offsets need to be renormalized to 8 bits.
47 2) scale operands up by a factor of 4 not 8 because Blackfin
48 multiplies include a shift.
50 3) compute into the accumulators cy*yx0, cy*yx1
52 4) compute each of the linear equations
53 r = clipz((y-oy)*cy + crv*(v-128))
55 g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
57 b = clipz((y-oy)*cy + cbu*(u-128))
59 reuse of the accumulators requires that we actually multiply
60 twice, once with addition and the second time with a subtraction.
62 because of this we need to compute the equations in the order R B
63 then G saving the writes for B in the case of 24/32 bit color
66 api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67 int dW, uint32_t *coeffs);
74 Where coeffs have the following layout in memory.
76 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
78 coeffs is a pointer to oy.
80 the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
81 replication is used to simplify the internal algorithms for the dual mac architecture
84 All routines are exported with _ff_bfin_ as a symbol prefix
86 rough performance gain compared against -O3:
88 2779809/1484290 187.28%
90 which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91 c/pel for the optimized implementations. Not sure why there is such a
92 huge variation in the reference code on Blackfin; I guess it must have
93 to do with the memory system.
// Helper macros for declaring the assembly routines in this file.
// DEFUN(fname, where, interface): the lines visible here export the symbol
// _ff_bfin_<fname> and give it the ELF symbol type STT_FUNC.
// NOTE(review): how `where` and `interface` are consumed is not visible in
// this excerpt -- `interface` looks like a documentation-only C prototype;
// confirm against the full macro body.
// DEFUN_END(fname): sets the symbol size to the distance between the
// routine's start symbol and the current location.
// COEFF_LEN / COEFF_REL_CY_OFF: byte sizes of 11 and 4 four-byte words.
// The yuv2rgb routines load COEFF_REL_CY_OFF into m0 and use it as a
// post-modify step on the coefficient pointer i1.
101 #define DEFUN(fname,where,interface) \
103 .global _ff_bfin_ ## fname; \
104 .type _ff_bfin_ ## fname, STT_FUNC; \
108 #define DEFUN_END(fname) \
109 .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
114 #define COEFF_LEN 11*4
115 #define COEFF_REL_CY_OFF 4*4
// yuv2rgb565_line: converts planar YUV 4:2:0 samples to packed RGB565 using
// the equations given at the top of the file. Each pass of the hardware loop
// (.L0565 .. .L1565) consumes 4 luma bytes (r0) plus 2 U and 2 V bytes (r1)
// and reads its constants through the coefficient pointer i1.
// NOTE(review): the prologue/epilogue, the pointer and loop-count setup, and
// the mask/shift/pack steps between several of the instructions below are
// not visible in this excerpt.
121 DEFUN(yuv2rgb565_line,MEM,
122 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
// m0 is the extra post-modify step applied to i1 after the first gmask load.
136 m0 = COEFF_REL_CY_OFF;
// Preload the first chroma samples; later ones are fetched at the loop tail.
140 r1.l = w[i2++]; // 2u
141 r1.h = w[i3++]; // 2v
// Zero-overhead hardware loop; p0 holds the iteration count.
144 lsetup (.L0565, .L1565) lc0 = p0;
147 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
148 r0 -- used to load 4ys
149 r1 -- used to load 2us,2vs
158 rrrrrrrr gggggggg bbbbbbbb
// Subtract the offsets from the packed samples with byteop16m, then scale
// the 16-bit results by 4 (<< 2) to gain precision. Coefficient loads are
// issued in parallel (||) with the arithmetic.
165 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
166 (r7,r6) = byteop16m (r1:0, r3:2) (r);
167 r5 = r5 << 2 (v); // y1,y0
168 r4 = r4 << 2 (v); // y3,y2
169 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
170 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
// First pixel pair (y1,y0 with u0/v0): seed both accumulators with cy*y.
172 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
174 /* R = Y+ crv*(Cr-128) */
// Add the chroma term and extract the result, then subtract the same term
// again so a1/a0 hold cy*y once more for the next channel.
175 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
176 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
177 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
181 /* B = Y+ cbu*(Cb-128) */
182 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
183 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
184 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
189 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
// G is computed last, so the accumulators need not be restored afterwards.
190 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
191 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
// After loading gmask, i1 is advanced by m0 so that the next coefficient
// load reads cy again (presumably i1 wraps circularly over the table; the
// circular-buffer setup is not visible here).
192 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
// Store 32 bits of packed output through p1 and reload cy.
196 [p1++]=r3 || r1=[i1++]; // cy
// Second pixel pair (y3,y2 with u1/v1): same sequence using r4 and the
// high halves of r6/r7.
200 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
202 /* R = Y+ crv*(Cr-128) */
203 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
204 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
205 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
209 /* B = Y+ cbu*(Cb-128) */
210 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
211 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
212 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
217 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
218 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
219 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
// Loop tail: the inputs of the next iteration (4 luma bytes, 2 u, 2 v and
// oy) are loaded in parallel with the remaining packing and the store.
220 r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // 4Y
221 r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u
224 [p1++]=r3 || r1.h = w[i3++]; // 2v
225 .L1565: r2=[i1++]; // oy
232 DEFUN_END(yuv2rgb565_line)
// yuv2rgb555_line: RGB555 variant of the YUV 4:2:0 line converter. Each
// pass of the hardware loop (.L0555 .. .L1555) consumes 4 luma bytes (r0)
// plus 2 U and 2 V bytes (r1) and reads its constants through the
// coefficient pointer i1.
// NOTE(review): the prologue/epilogue, the pointer and loop-count setup, and
// the mask/shift/pack steps between several of the instructions below are
// not visible in this excerpt.
234 DEFUN(yuv2rgb555_line,MEM,
235 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
// m0 is the extra post-modify step applied to i1 after the first gmask load.
249 m0 = COEFF_REL_CY_OFF;
// Preload the first chroma samples; later ones are fetched at the loop tail.
253 r1.l = w[i2++]; // 2u
254 r1.h = w[i3++]; // 2v
// Zero-overhead hardware loop; p0 holds the iteration count.
257 lsetup (.L0555, .L1555) lc0 = p0;
260 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
261 r0 -- used to load 4ys
262 r1 -- used to load 2us,2vs
271 rrrrrrrr gggggggg bbbbbbbb
// Subtract the offsets from the packed samples with byteop16m, then scale
// the 16-bit results by 4 (<< 2) to gain precision.
279 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
280 (r7,r6) = byteop16m (r1:0, r3:2) (r);
281 r5 = r5 << 2 (v); // y1,y0
282 r4 = r4 << 2 (v); // y3,y2
283 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
284 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
// First pixel pair (y1,y0 with u0/v0): seed both accumulators with cy*y.
286 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
288 /* R = Y+ crv*(Cr-128) */
// Add the chroma term and extract the result, then subtract the same term
// again so a1/a0 hold cy*y once more for the next channel.
289 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
290 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
291 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
295 /* B = Y+ cbu*(Cb-128) */
296 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
297 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
298 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
303 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
// G is computed last, so the accumulators need not be restored afterwards.
304 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
305 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
// After loading gmask, i1 is advanced by m0 so that the next coefficient
// load reads cy again (presumably i1 wraps circularly over the table; the
// circular-buffer setup is not visible here).
306 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
// Store 32 bits of packed output through p1 and reload cy.
310 [p1++]=r3 || r1=[i1++]; // cy
// Second pixel pair (y3,y2 with u1/v1): same sequence using r4 and the
// high halves of r6/r7.
314 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
316 /* R = Y+ crv*(Cr-128) */
317 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
318 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
319 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
323 /* B = Y+ cbu*(Cb-128) */
324 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
325 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
326 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
331 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
332 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
333 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
// Loop tail: the inputs of the next iteration (4 luma bytes, 2 u, 2 v and
// oy) are loaded in parallel with the remaining packing and the store.
334 r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y
335 r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u
338 [p1++]=r3 || r1.h=w[i3++]; // 2v
340 .L1555: r2=[i1++]; // oy
347 DEFUN_END(yuv2rgb555_line)
// yuv2rgb24_line: byte-per-channel variant of the YUV 4:2:0 line converter.
// Results are written one byte at a time through p1 and p2 instead of being
// packed into 16-bit pixels. Each pass of the hardware loop
// (.L0888 .. .L1888) consumes 4 luma bytes (r0) plus 2 U and 2 V bytes (r1).
// NOTE(review): the prologue/epilogue, most pointer setup (including how p1
// and p2 relate to each other), some byte stores and the loop-end label are
// not visible in this excerpt.
349 DEFUN(yuv2rgb24_line,MEM,
350 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
// The coefficient pointer is a stack argument, fetched relative to fp.
362 r0 = [fp+ARG_COEFF]; // coeff buffer
// m0 is the extra post-modify step applied to i1 after the first gmask load.
366 m0 = COEFF_REL_CY_OFF;
// Preload the first chroma samples; later ones are fetched at the loop tail.
370 r1.l = w[i2++]; // 2u
371 r1.h = w[i3++]; // 2v
// Zero-overhead hardware loop; p0 holds the iteration count.
374 lsetup (.L0888, .L1888) lc0 = p0;
377 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
378 r0 -- used to load 4ys
379 r1 -- used to load 2us,2vs
// Subtract the offsets from the packed samples with byteop16m, then scale
// the 16-bit results by 4 (<< 2) to gain precision.
387 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
388 (r7,r6) = byteop16m (r1:0, r3:2) (r);
389 r5 = r5 << 2 (v); // y1,y0
390 r4 = r4 << 2 (v); // y3,y2
391 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
392 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
// First pixel pair (y1,y0 with u0/v0): seed both accumulators with cy*y.
395 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
397 /* R = Y+ crv*(Cr-128) */
// Add the chroma term and extract the result, then subtract the same term
// again so a1/a0 hold cy*y once more for the next channel.
398 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
399 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
400 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
// Write the low byte of r2 through p1 while shifting the upper halfword
// (the other pixel of the pair) down in the same issue slot.
401 r2=r2>>16 || B[p1++]=r2;
404 /* B = Y+ cbu*(Cb-128) */
405 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
406 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
// B is kept in r3 and written only after G has been stored.
407 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
409 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
410 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
411 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
// After loading gmask, i1 is advanced by m0 so that the next coefficient
// load reads cy again (presumably i1 wraps circularly over the table; the
// circular-buffer setup is not visible here).
412 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero
414 r2=r2>>16 || B[p1++]=r2;
417 r3=r3>>16 || B[p1++]=r3;
418 B[p2++]=r3 || r1=[i1++]; // cy
// Second pixel pair (y3,y2 with u1/v1): same sequence using r4 and the
// high halves of r6/r7.
423 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
425 /* R = Y+ crv*(Cr-128) */
426 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
427 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
428 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
429 r2=r2>>16 || B[p1++]=r2;
432 /* B = Y+ cbu*(Cb-128) */
433 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
434 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
435 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
437 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
438 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
439 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
440 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask
// Loop tail: the last G and B bytes are stored while the inputs of the next
// iteration (4 luma bytes, 2 u, 2 v and oy) are loaded in parallel.
441 r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y
442 B[p2++]=r2 || r1.l = w[i2++]; // 2u
443 r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
444 B[p2++]=r3 || r2=[i1++]; // oy
454 DEFUN_END(yuv2rgb24_line)
// Byte offsets of stack-passed arguments; uyvytoyv12 and yuyvtoyv12 read
// them as [fp + ARG_xxx]. The offsets step by 4 bytes per argument.
460 #define ARG_height 28
461 #define ARG_lumStride 32
462 #define ARG_chromStride 36
463 #define ARG_srcStride 40
// uyvytoyv12: converts packed UYVY input to planar YV12 output, handling
// two source lines per pass of the outer loop. i0/i1 read the top/bottom
// source lines, p0/p1 write the top/bottom luma lines, and i2/i3 receive
// the u and v halfwords.
// NOTE(review): several instructions are not visible in this excerpt (the
// setup of i1/i2/i3 and of the loop counts, the luma extraction at the head
// of the inner loop, the end-of-line pointer adjustments, loop-end label 1
// and the return), so the comments below cover only what is shown.
465 DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
466 long width, long height,
467 long lumStride, long chromStride, long srcStride)):
// Save the registers this routine uses beyond the scratch set.
469 [--sp] = (r7:4,p5:4);
471 p0 = r1; // Y top even
// Remaining arguments are fetched from the stack frame via fp.
474 r2 = [fp + ARG_vdst];
477 r1 = [fp + ARG_srcStride];
// The inner loop has already read 8 bytes of the following group when it
// exits, so the stride-based step is reduced by 8.
479 r1 += -8; // i0,i1 is pre read need to correct
482 i0 = r0; // uyvy_T even
483 i1 = r2; // uyvy_B odd
485 p2 = [fp + ARG_lumStride];
486 p1 = p0 + p2; // Y bot odd
488 p5 = [fp + ARG_width];
489 p4 = [fp + ARG_height];
494 r2 = [fp + ARG_chromStride];
499 /* I0,I1 - src input line pointers
500 * p0,p1 - luma output line pointers
// Outer hardware loop: one pass per pair of source lines.
505 lsetup (0f, 1f) lc1 = p4; // H/2
// Pre-read 8 bytes from each of the two source lines.
506 0: r0 = [i0++] || r2 = [i1++];
507 r1 = [i0++] || r3 = [i1++];
// byteop1p combines the top-line and bottom-line words byte-wise (a quad
// 8-bit average per the Blackfin ISA); the result feeds the chroma output
// shared by both lines.
508 r4 = byteop1p(r1:0, r3:2);
509 r5 = byteop1p(r1:0, r3:2) (r);
// Inner hardware loop: one pass per 8 source bytes of each line.
510 lsetup (2f, 3f) lc0 = p5; // W/4
// Pack four luma bytes per line and store them, gather the combined chroma
// into r6, and pre-read the next 8 bytes of both source lines.
515 r0 = bytepack(r0, r1);
516 r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy
517 r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy
518 r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
519 r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
520 r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu
521 3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv
// Restore the registers saved on entry.
530 (r7:4,p5:4) = [sp++];
533 DEFUN_END(uyvytoyv12)
// yuyvtoyv12: converts packed YUYV input to planar YV12 output, handling
// two source lines per pass of the outer loop. i0/i1 read the top/bottom
// source lines, p0/p1 write the top/bottom luma lines, and i2/i3 receive
// the u and v halfwords. Unlike the UYVY routine, luma is packed straight
// from the loaded words and the words are then shifted right by 8 before
// the chroma is combined.
// NOTE(review): several instructions are not visible in this excerpt (the
// setup of i1/i2/i3 and of the loop counts, part of the inner loop, the
// end-of-line pointer adjustments, loop-end label 1 and the return), so the
// comments below cover only what is shown.
535 DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
536 long width, long height,
537 long lumStride, long chromStride, long srcStride)):
// Save the registers this routine uses beyond the scratch set.
539 [--sp] = (r7:4,p5:4);
541 p0 = r1; // Y top even
// Remaining arguments are fetched from the stack frame via fp.
544 r2 = [fp + ARG_vdst];
547 r1 = [fp + ARG_srcStride];
// The inner loop has already read 8 bytes of the following group when it
// exits, so the stride-based step is reduced by 8.
549 r1 += -8; // i0,i1 is pre read need to correct
552 i0 = r0; // yuyv_T even
553 i1 = r2; // yuyv_B odd
555 p2 = [fp + ARG_lumStride];
556 p1 = p0 + p2; // Y bot odd
558 p5 = [fp + ARG_width];
559 p4 = [fp + ARG_height];
564 r2 = [fp + ARG_chromStride];
569 /* I0,I1 - src input line pointers
570 * p0,p1 - luma output line pointers
// Outer hardware loop: one pass per pair of source lines.
575 lsetup (0f, 1f) lc1 = p4; // H/2
// Pre-read 8 bytes from each of the two source lines.
576 0: r0 = [i0++] || r2 = [i1++];
577 r1 = [i0++] || r3 = [i1++];
// Pack the luma bytes of the top (r4) and bottom (r5) lines.
578 r4 = bytepack(r0, r1);
579 r5 = bytepack(r2, r3);
// Inner hardware loop: one pass per 8 source bytes of each line.
580 lsetup (2f, 3f) lc0 = p5; // W/4
// Store the packed luma while shifting each halfword right by 8, which
// moves the chroma bytes into the low byte positions.
581 2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even
582 r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd
// byteop1p combines the top-line and bottom-line chroma byte-wise (a quad
// 8-bit average per the Blackfin ISA). The result is gathered into r6 while
// the next 8 bytes of both source lines are pre-read.
585 r4 = byteop1p(r1:0, r3:2);
586 r5 = byteop1p(r1:0, r3:2) (r);
587 r6 = pack(r5.l, r4.l);
588 r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
589 r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
590 r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu
591 3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv
// Restore the registers saved on entry.
600 (r7:4,p5:4) = [sp++];
603 DEFUN_END(yuyvtoyv12)