2 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * Blackfin video color space converter operations
6 * convert I420 YV12 to RGB in various formats
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28 and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits, packed into shorts.
31 The following calculation is used for the conversion:
33 r = clipz((y-oy)*cy + crv*(v-128))
34 g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
35 b = clipz((y-oy)*cy + cbu*(u-128))
37 y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
40 New factorization to eliminate the truncation error which was
41 occurring due to the byteop3p.
44 1) Use byteop16m to subtract quad bytes. We use it on U8 (unsigned 8-bit)
45 data, so the offsets need to be renormalized to 8 bits.
47 2) Scale operands up by a factor of 4 not 8 because Blackfin
48 multiplies include a shift.
50 3) Compute into the accumulators cy*yx0, cy*yx1.
52 4) Compute each of the linear equations:
53 r = clipz((y - oy) * cy + crv * (v - 128))
55 g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
57 b = clipz((y - oy) * cy + cbu * (u - 128))
59 Reuse of the accumulators requires that we actually multiply
60 twice: once with addition and a second time with subtraction.
62 Because of this we need to compute the equations in the order R, B,
63 then G, saving the writes for B in the case of 24/32-bit color
66 API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67 int dW, uint32_t *coeffs);
74 Where coeffs have the following layout in memory.
76 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
78 coeffs is a pointer to oy.
80 The {rgb} masks are only utilized by the 565 packing algorithm. Note the data
81 replication is used to simplify the internal algorithms for the dual-MAC
82 architecture of Blackfin.
84 All routines are exported with _ff_bfin_ as a symbol prefix.
86 Rough performance gain compared against -O3:
88 2779809/1484290 187.28%
90 which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91 c/pel for the optimized implementations. Not sure why there is such a
92 huge variation in the reference code on Blackfin; I guess it must have
93 to do with the memory system.
/*
 * DEFUN(fname, where, interface): opens an exported routine. The lines
 * shown declare _ff_bfin_<fname> as a global symbol of type STT_FUNC.
 * NOTE(review): the macro has further continuation lines that are not
 * visible here; `where` and `interface` are not referenced in the lines
 * shown.
 *
 * DEFUN_END(fname): closes the routine by setting the size of
 * _ff_bfin_<fname> to the distance from the symbol to the current location.
 */
104 #define DEFUN(fname,where,interface) \
106 .global _ff_bfin_ ## fname; \
107 .type _ff_bfin_ ## fname, STT_FUNC; \
111 #define DEFUN_END(fname) \
112 .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
/*
 * COEFF_LEN: 11 entries of 4 bytes; presumably the byte length of the
 * coefficient table (its use is not visible in this view).
 * COEFF_REL_CY_OFF: 4 entries of 4 bytes. The yuv2rgb routines load it into
 * m0 and use it as the post-modify step on i1 after the gmask load, so that
 * the following i1 load fetches cy.
 * NOTE(review): this presumably relies on i1 wrapping around the table
 * (circular addressing); that setup is not visible here - confirm.
 */
117 #define COEFF_LEN 11*4
118 #define COEFF_REL_CY_OFF 4*4
/*
 * yuv2rgb565_line: convert one line of YUV 4:2:0 input to packed RGB565,
 * following the r/g/b formulas given in the file header.
 *
 * Interface (from the DEFUN text below): Y, U, V source pointers, out
 * destination pointer, dW width-derived count, coeffs coefficient table.
 * The table is walked through i1 in this order: oy, oc, zero, cy, crv,
 * rmask, cbu, bmask, cgu, cgv, gmask.
 *
 * Each pass of the hardware loop reads 4 Y bytes (i0), 2 U bytes (i2) and
 * 2 V bytes (i3) and performs two 32-bit stores through p1.
 *
 * NOTE(review): the prologue, the .L0565 label, the mask/pack steps that
 * build r3, and the epilogue are not part of this view; the comments below
 * describe only the lines shown.
 */
124 DEFUN(yuv2rgb565_line,MEM,
125 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
// m0: post-modify step applied to i1 together with the first gmask load.
139 m0 = COEFF_REL_CY_OFF;
// Prime r1 with the first chroma samples before the loop starts.
143 r1.l = w[i2++]; // 2u
144 r1.h = w[i3++]; // 2v
// Hardware loop over .L0565 .. .L1565, run p0 times.
147 lsetup (.L0565, .L1565) lc0 = p0;
150 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
151 r0 -- used to load 4ys
152 r1 -- used to load 2us,2vs
161 rrrrrrrr gggggggg bbbbbbbb
// Offset removal: the first byteop16m produces the luma pairs (r5, r4);
// oc is loaded into r3 in parallel and the reversed byteop16m then
// produces the chroma pairs (r6, r7).
168 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
169 (r7,r6) = byteop16m (r1:0, r3:2) (r);
// Prescale everything by 4 (vector shift) while fetching zero and cy.
170 r5 = r5 << 2 (v); // y1,y0
171 r4 = r4 << 2 (v); // y3,y2
172 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
173 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
// Pixels 0/1: seed both accumulators with cy*y, then fetch crv.
175 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
177 /* R = Y+ crv*(Cr-128) */
178 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
// Subtract the crv term again so a1/a0 hold cy*y for the next channel.
179 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
// byteop3p with the zero word in r0; presumably the clipz() step.
180 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
184 /* B = Y+ cbu*(Cb-128) */
185 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
// Subtract the cbu term again; a1/a0 are back to cy*y for G.
186 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
187 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
192 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
// G is computed last, so both chroma terms are simply accumulated.
193 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
194 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
// The gmask load steps i1 by m0 so that the next i1 load is cy.
195 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
// Store r3 (presumably the two packed pixels 0/1) and fetch cy again.
199 [p1++]=r3 || r1=[i1++]; // cy
// Pixels 2/3: same sequence using r4 (y3,y2) and the high chroma halves
// r7.h / r6.h.
203 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
205 /* R = Y+ crv*(Cr-128) */
206 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
207 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
208 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
212 /* B = Y+ cbu*(Cb-128) */
213 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
214 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
215 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
220 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
221 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
222 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
// While finishing G, start fetching the inputs of the next iteration.
223 r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // 4Y
// Vector shift of the G halves by 3 (the 555 variant shifts by 2).
224 r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u
// Store r3 (presumably the two packed pixels 2/3).
227 [p1++]=r3 || r1.h = w[i3++]; // 2v
// Loop end: reload oy into r2 for the next byteop16m.
228 .L1565: r2=[i1++]; // oy
235 DEFUN_END(yuv2rgb565_line)
/*
 * yuv2rgb555_line: convert one line of YUV 4:2:0 input to packed RGB555.
 * The visible instruction sequence matches yuv2rgb565_line except for the
 * loop labels and the G shift (2 instead of 3).
 *
 * Interface (from the DEFUN text below): Y, U, V source pointers, out
 * destination pointer, dW width-derived count, coeffs coefficient table.
 * The table is walked through i1 in this order: oy, oc, zero, cy, crv,
 * rmask, cbu, bmask, cgu, cgv, gmask.
 *
 * Each pass of the hardware loop reads 4 Y bytes (i0), 2 U bytes (i2) and
 * 2 V bytes (i3) and performs two 32-bit stores through p1.
 *
 * NOTE(review): the prologue, the .L0555 label, the mask/pack steps that
 * build r3, and the epilogue are not part of this view; the comments below
 * describe only the lines shown.
 */
237 DEFUN(yuv2rgb555_line,MEM,
238 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
// m0: post-modify step applied to i1 together with the first gmask load.
252 m0 = COEFF_REL_CY_OFF;
// Prime r1 with the first chroma samples before the loop starts.
256 r1.l = w[i2++]; // 2u
257 r1.h = w[i3++]; // 2v
// Hardware loop over .L0555 .. .L1555, run p0 times.
260 lsetup (.L0555, .L1555) lc0 = p0;
263 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
264 r0 -- used to load 4ys
265 r1 -- used to load 2us,2vs
274 rrrrrrrr gggggggg bbbbbbbb
// Offset removal: the first byteop16m produces the luma pairs (r5, r4);
// oc is loaded into r3 in parallel and the reversed byteop16m then
// produces the chroma pairs (r6, r7).
282 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
283 (r7,r6) = byteop16m (r1:0, r3:2) (r);
// Prescale everything by 4 (vector shift) while fetching zero and cy.
284 r5 = r5 << 2 (v); // y1,y0
285 r4 = r4 << 2 (v); // y3,y2
286 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
287 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
// Pixels 0/1: seed both accumulators with cy*y, then fetch crv.
289 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
291 /* R = Y+ crv*(Cr-128) */
292 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
// Subtract the crv term again so a1/a0 hold cy*y for the next channel.
293 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
// byteop3p with the zero word in r0; presumably the clipz() step.
294 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
298 /* B = Y+ cbu*(Cb-128) */
299 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
// Subtract the cbu term again; a1/a0 are back to cy*y for G.
300 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
301 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
306 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
// G is computed last, so both chroma terms are simply accumulated.
307 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
308 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
// The gmask load steps i1 by m0 so that the next i1 load is cy.
309 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
// Store r3 (presumably the two packed pixels 0/1) and fetch cy again.
313 [p1++]=r3 || r1=[i1++]; // cy
// Pixels 2/3: same sequence using r4 (y3,y2) and the high chroma halves
// r7.h / r6.h.
317 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
319 /* R = Y+ crv*(Cr-128) */
320 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
321 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
322 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
326 /* B = Y+ cbu*(Cb-128) */
327 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
328 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
329 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
334 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
335 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
336 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
// While finishing G, start fetching the inputs of the next iteration.
337 r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y
// Vector shift of the G halves by 2 (the 565 variant shifts by 3).
338 r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u
// Store r3 (presumably the two packed pixels 2/3).
341 [p1++]=r3 || r1.h=w[i3++]; // 2v
// Loop end: reload oy into r2 for the next byteop16m.
343 .L1555: r2=[i1++]; // oy
350 DEFUN_END(yuv2rgb555_line)
/*
 * yuv2rgb24_line: convert one line of YUV 4:2:0 input to byte-per-channel
 * RGB output, following the r/g/b formulas given in the file header.
 *
 * Interface (from the DEFUN text below): Y, U, V source pointers, out
 * destination pointer, dW width-derived count, coeffs coefficient table
 * (fetched from the frame at fp+ARG_COEFF). The table is walked through i1
 * in this order: oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask.
 * The mask entries are still loaded into r5, which keeps i1 in step; r5 is
 * not read in the lines shown.
 *
 * Each pass of the hardware loop reads 4 Y bytes (i0), 2 U bytes (i2) and
 * 2 V bytes (i3); results are written one byte at a time through p1 and p2.
 * NOTE(review): how p1 and p2 relate to each other is set up in lines not
 * visible here - presumably they address the two pixels of a pair.
 *
 * NOTE(review): the prologue, the .L0888 / .L1888 labels, some stores and
 * the epilogue are not part of this view; the comments below describe only
 * the lines shown.
 */
352 DEFUN(yuv2rgb24_line,MEM,
353 (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
365 r0 = [fp+ARG_COEFF]; // coeff buffer
// m0: post-modify step applied to i1 together with the first gmask load.
369 m0 = COEFF_REL_CY_OFF;
// Prime r1 with the first chroma samples before the loop starts.
373 r1.l = w[i2++]; // 2u
374 r1.h = w[i3++]; // 2v
// Hardware loop over .L0888 .. .L1888, run p0 times.
377 lsetup (.L0888, .L1888) lc0 = p0;
380 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
381 r0 -- used to load 4ys
382 r1 -- used to load 2us,2vs
// Offset removal: the first byteop16m produces the luma pairs (r5, r4);
// oc is loaded into r3 in parallel and the reversed byteop16m then
// produces the chroma pairs (r6, r7).
390 (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
391 (r7,r6) = byteop16m (r1:0, r3:2) (r);
// Prescale everything by 4 (vector shift) while fetching zero and cy.
392 r5 = r5 << 2 (v); // y1,y0
393 r4 = r4 << 2 (v); // y3,y2
394 r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
395 r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
// Pixels 0/1: seed both accumulators with cy*y, then fetch crv.
398 a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
400 /* R = Y+ crv*(Cr-128) */
401 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
// Subtract the crv term again so a1/a0 hold cy*y for the next channel.
402 a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
// byteop3p with the zero word in r0; presumably the clipz() step.
403 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
// Write the low R byte through p1 and bring the other half of r2 down.
404 r2=r2>>16 || B[p1++]=r2;
407 /* B = Y+ cbu*(Cb-128) */
408 r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
409 a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
// B is kept in r3 so that its bytes can be written after G.
410 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
412 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
// G is computed last, so both chroma terms are simply accumulated.
413 a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
414 r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
// The gmask load steps i1 by m0 so that the next i1 load is cy.
415 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero
// Write the G byte, then the B bytes held in r3, and fetch cy again.
417 r2=r2>>16 || B[p1++]=r2;
420 r3=r3>>16 || B[p1++]=r3;
421 B[p2++]=r3 || r1=[i1++]; // cy
// Pixels 2/3: same sequence using r4 (y3,y2) and the high chroma halves
// r7.h / r6.h.
426 a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
428 /* R = Y+ crv*(Cr-128) */
429 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
430 a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
431 r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
432 r2=r2>>16 || B[p1++]=r2;
435 /* B = Y+ cbu*(Cb-128) */
436 r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
437 a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
438 r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
440 /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
441 a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
442 r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
443 r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask
// Write the remaining G and B bytes while fetching the inputs of the
// next iteration (4 Y, 2 U, 2 V) and reloading oy into r2.
444 r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y
445 B[p2++]=r2 || r1.l = w[i2++]; // 2u
446 r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
447 B[p2++]=r3 || r2=[i1++]; // oy
457 DEFUN_END(yuv2rgb24_line)
/*
 * Frame-pointer-relative byte offsets used by the *toyv12 routines below to
 * fetch their height, lumStride, chromStride and srcStride arguments
 * (read as [fp + ARG_xxx]). The offsets are 4 bytes apart, in parameter
 * order.
 */
463 #define ARG_height 28
464 #define ARG_lumStride 32
465 #define ARG_chromStride 36
466 #define ARG_srcStride 40
/*
 * uyvytoyv12: convert packed UYVY input into separate Y, U and V planes.
 *
 * Interface (from the DEFUN text below): src, ydst, udst, vdst, width,
 * height, lumStride, chromStride, srcStride. Judging by the register
 * comments, src arrives in r0 and ydst in r1; vdst and the later arguments
 * are read from the frame at fp + ARG_xxx.
 *
 * The outer loop (lc1, count p4) handles two source rows per pass, read
 * through i0 (top row) and i1 (bottom row). The inner loop (lc0, count p5)
 * reads 8 bytes from each row per pass, writes 4 luma bytes to each of the
 * two luma rows (p0, p1) and writes 2 U bytes (i2) and 2 V bytes (i3). The
 * chroma bytes come from byteop1p (quad byte average) over the top and
 * bottom words, so two source rows yield one chroma row.
 *
 * NOTE(review): several instructions (pointer and stride setup, scaling of
 * the loop counts, the luma byte shifts, the 1: loop tail and the return)
 * are not part of this view; the comments describe only the lines shown.
 */
468 DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
469 long width, long height,
470 long lumStride, long chromStride, long srcStride)):
// Save r4-r7 and p4-p5; they are restored before the end of the routine.
472 [--sp] = (r7:4,p5:4);
474 p0 = r1; // Y top even
477 r2 = [fp + ARG_vdst];
480 r1 = [fp + ARG_srcStride];
482 r1 += -8; // i0,i1 are pre-read by 8 bytes; correct for that
485 i0 = r0; // uyvy_T even
486 i1 = r2; // uyvy_B odd
// Second luma row pointer: one luma stride below the first.
488 p2 = [fp + ARG_lumStride];
489 p1 = p0 + p2; // Y bot odd
// Loop counts; the H/2 and W/4 notes on the lsetup lines suggest they are
// scaled in lines not visible here.
491 p5 = [fp + ARG_width];
492 p4 = [fp + ARG_height];
497 r2 = [fp + ARG_chromStride];
502 /* I0,I1 - src input line pointers
503 * p0,p1 - luma output line pointers
// Outer loop: one pass per pair of source rows.
508 lsetup (0f, 1f) lc1 = p4; // H/2
// Pre-read the first 8 bytes of the top row (r0, r1) and bottom row (r2, r3).
509 0: r0 = [i0++] || r2 = [i1++];
510 r1 = [i0++] || r3 = [i1++];
// Byte-wise average of top and bottom words, normal and reversed variant.
511 r4 = byteop1p(r1:0, r3:2);
512 r5 = byteop1p(r1:0, r3:2) (r);
// Inner loop: one pass per 8 source bytes of each row.
513 lsetup (2f, 3f) lc0 = p5; // W/4
// Pack and store 4 luma bytes for the top row, then for the bottom row.
518 r0 = bytepack(r0, r1);
519 r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy
520 r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy
// Gather the averaged chroma into r6 while reading the next 8 bytes per row.
521 r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
522 r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
// Store 2 U and 2 V bytes and compute the averages for the next pass.
523 r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu
524 3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv
// Restore the registers saved on entry.
533 (r7:4,p5:4) = [sp++];
536 DEFUN_END(uyvytoyv12)
/*
 * yuyvtoyv12: convert packed YUYV input into separate Y, U and V planes.
 * Same structure as uyvytoyv12, but the luma bytes are packed first and the
 * source words are then shifted right by 8 before the chroma is averaged.
 *
 * Interface (from the DEFUN text below): src, ydst, udst, vdst, width,
 * height, lumStride, chromStride, srcStride. Judging by the register
 * comments, src arrives in r0 and ydst in r1; vdst and the later arguments
 * are read from the frame at fp + ARG_xxx.
 *
 * The outer loop (lc1, count p4) handles two source rows per pass, read
 * through i0 (top row) and i1 (bottom row). The inner loop (lc0, count p5)
 * reads 8 bytes from each row per pass, writes 4 luma bytes to each of the
 * two luma rows (p0, p1) and writes 2 U bytes (i2) and 2 V bytes (i3). The
 * chroma bytes come from byteop1p (quad byte average) over the top and
 * bottom words, so two source rows yield one chroma row.
 *
 * NOTE(review): several instructions (pointer and stride setup, scaling of
 * the loop counts, part of the inner loop, the 1: loop tail and the return)
 * are not part of this view; the comments describe only the lines shown.
 */
538 DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
539 long width, long height,
540 long lumStride, long chromStride, long srcStride)):
// Save r4-r7 and p4-p5; they are restored before the end of the routine.
542 [--sp] = (r7:4,p5:4);
544 p0 = r1; // Y top even
547 r2 = [fp + ARG_vdst];
550 r1 = [fp + ARG_srcStride];
552 r1 += -8; // i0,i1 are pre-read by 8 bytes; correct for that
555 i0 = r0; // yuyv_T even
556 i1 = r2; // yuyv_B odd
// Second luma row pointer: one luma stride below the first.
558 p2 = [fp + ARG_lumStride];
559 p1 = p0 + p2; // Y bot odd
// Loop counts; the H/2 and W/4 notes on the lsetup lines suggest they are
// scaled in lines not visible here.
561 p5 = [fp + ARG_width];
562 p4 = [fp + ARG_height];
567 r2 = [fp + ARG_chromStride];
572 /* I0,I1 - src input line pointers
573 * p0,p1 - luma output line pointers
// Outer loop: one pass per pair of source rows.
578 lsetup (0f, 1f) lc1 = p4; // H/2
// Pre-read the first 8 bytes of the top row (r0, r1) and bottom row (r2, r3).
579 0: r0 = [i0++] || r2 = [i1++];
580 r1 = [i0++] || r3 = [i1++];
// Pack the luma bytes of the top row (r4) and the bottom row (r5).
581 r4 = bytepack(r0, r1);
582 r5 = bytepack(r2, r3);
// Inner loop: one pass per 8 source bytes of each row.
583 lsetup (2f, 3f) lc0 = p5; // W/4
// Store the packed luma and shift the top-row words right by 8.
584 2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even
585 r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd
// Byte-wise average of top and bottom words, normal and reversed variant.
588 r4 = byteop1p(r1:0, r3:2);
589 r5 = byteop1p(r1:0, r3:2) (r);
// Gather the averaged chroma into r6 while reading the next 8 bytes per row.
590 r6 = pack(r5.l, r4.l);
591 r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
592 r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
// Store 2 U and 2 V bytes and pack the luma for the next pass.
593 r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu
594 3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv
// Restore the registers saved on entry.
603 (r7:4,p5:4) = [sp++];
606 DEFUN_END(yuyvtoyv12)