typo fix
[mplayer/greg.git] / libswscale / internal_bfin.S
blob91325af23dbf37d114e8ca9135b94afb9a782422
1 /*
2  * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3  *                    April 20, 2007
4  *
5  * Blackfin Video Color Space Converters Operations
6  *  convert I420 YV12 to RGB in various formats,
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
27     YUV420 to RGB565 conversion.  This routine takes a YUV 420 planar macroblock
28     and converts it to RGB565.  R:5 bits, G:6 bits, B:5 bits.. packed into shorts
31     The following calculation is used for the conversion:
33       r = clipz((y-oy)*cy  + crv*(v-128))
34       g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
35       b = clipz((y-oy)*cy  + cbu*(u-128))
37     y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
40     New factorization to eliminate the truncation error which was
41     occurring due to the byteop3p.
44   1) use byteop16m to subtract quad bytes; it operates on U8 values,
45    so the offsets need to be renormalized to 8 bits.
47   2) scale operands up by a factor of 4 not 8 because Blackfin
48      multiplies include a shift.
50   3) compute into the accumulators cy*yx0, cy*yx1
52   4) compute each of the linear equations
53       r = clipz((y-oy)*cy  + crv*(v-128))
55       g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
57       b = clipz((y-oy)*cy  + cbu*(u-128))
59      reuse of the accumulators requires that we actually multiply
60      twice: once with addition and the second time with a subtraction.
62      because of this we need to compute the equations in the order R B
63      then G saving the writes for B in the case of 24/32 bit color
64      formats.
66     api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67                        int dW, uint32_t *coeffs);
69         A          B
70         ---        ---
71         i2 = cb    i3 = cr
72         i1 = coeff i0 = y
74   Where coeffs have the following layout in memory.
76   uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
78   coeffs is a pointer to oy.
80   the {rgb} masks are only utilized by the 565 and 555 packing algorithms. Note that data
81   replication is used to simplify the internal algorithms for the dual MAC architecture
82   of Blackfin.
84   All routines are exported with _ff_bfin_ as a symbol prefix
86   rough performance gain compared against -O3:
88   2779809/1484290 187.28%
90   which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91   c/pel for the optimized implementations. Not sure why there is such a
92   huge variation in the reference code on Blackfin; I guess it must have
93   to do with the memory system.
97 #define mL1 .l1.text
98 #define mL3 .text
99 #define MEM mL1
101 #define DEFUN(fname,where,interface) \
102         .section where;              \
103         .global _ff_bfin_ ## fname;  \
104         .type _ff_bfin_ ## fname, STT_FUNC; \
105         .align 8;                    \
106         _ff_bfin_ ## fname
108 #define DEFUN_END(fname) \
109         .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
112 .text
114 #define COEFF_LEN        11*4
115 #define COEFF_REL_CY_OFF 4*4
117 #define ARG_OUT   20
118 #define ARG_W     24
119 #define ARG_COEFF 28
121 DEFUN(yuv2rgb565_line,MEM,
122    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
123         link 0;
124         [--sp] = (r7:4);
125         p1 = [fp+ARG_OUT];
126         r3 = [fp+ARG_W];
128         i0 = r0;
129         i2 = r1;
130         i3 = r2;
132         r0 = [fp+ARG_COEFF];
133         i1 = r0;
134         b1 = i1;
135         l1 = COEFF_LEN;
136         m0 = COEFF_REL_CY_OFF;
137         p0 = r3;
139         r0   = [i0++];         // 2Y
140         r1.l = w[i2++];        // 2u
141         r1.h = w[i3++];        // 2v
142         p0 = p0>>2;
144         lsetup (.L0565, .L1565) lc0 = p0;
146         /*
147            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
148            r0 -- used to load 4ys
149            r1 -- used to load 2us,2vs
150            r4 -- y3,y2
151            r5 -- y1,y0
152            r6 -- u1,u0
153            r7 -- v1,v0
154         */
155                                                               r2=[i1++]; // oy
156 .L0565:
157         /*
158         rrrrrrrr gggggggg bbbbbbbb
159          5432109876543210
160                     bbbbb >>3
161               gggggggg    <<3
162          rrrrrrrr         <<8
163          rrrrrggggggbbbbb
164         */
165         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
166         (r7,r6) = byteop16m (r1:0, r3:2) (r);
167         r5 = r5 << 2 (v);                                                // y1,y0
168         r4 = r4 << 2 (v);                                                // y3,y2
169         r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
170         r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
171         /* Y' = y*cy */
172         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
174         /* R = Y+ crv*(Cr-128) */
175         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
176                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
177         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
178         r2 = r2 >> 3 (v);
179         r3 = r2 & r5;
181         /* B = Y+ cbu*(Cb-128) */
182         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
183                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
184         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
185         r2 = r2 << 8 (v);
186         r2 = r2 & r5;
187         r3 = r3 | r2;
189         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
190                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
191         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
192         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
193         r2 = r2 << 3 (v);
194         r2 = r2 & r5;
195         r3 = r3 | r2;
196         [p1++]=r3                                          || r1=[i1++]; // cy
198         /* Y' = y*cy */
200         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
202         /* R = Y+ crv*(Cr-128) */
203         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
204                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
205         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
206         r2 = r2 >> 3 (v);
207         r3 = r2 & r5;
209         /* B = Y+ cbu*(Cb-128) */
210         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
211                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
212         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
213         r2 = r2 << 8 (v);
214         r2 = r2 & r5;
215         r3 = r3 | r2;
217         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
218                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
219         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
220         r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 2Y
221         r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
222         r2 = r2 & r5;
223         r3 = r3 | r2;
224         [p1++]=r3                                          || r1.h = w[i3++];        // 2v
225 .L1565:                                                       r2=[i1++]; // oy
227         l1 = 0;
229         (r7:4) = [sp++];
230         unlink;
231         rts;
232 DEFUN_END(yuv2rgb565_line)
234 DEFUN(yuv2rgb555_line,MEM,
235    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
236         link 0;
237         [--sp] = (r7:4);
238         p1 = [fp+ARG_OUT];
239         r3 = [fp+ARG_W];
241         i0 = r0;
242         i2 = r1;
243         i3 = r2;
245         r0 = [fp+ARG_COEFF];
246         i1 = r0;
247         b1 = i1;
248         l1 = COEFF_LEN;
249         m0 = COEFF_REL_CY_OFF;
250         p0 = r3;
252         r0   = [i0++];         // 2Y
253         r1.l = w[i2++];        // 2u
254         r1.h = w[i3++];        // 2v
255         p0 = p0>>2;
257         lsetup (.L0555, .L1555) lc0 = p0;
259         /*
260            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
261            r0 -- used to load 4ys
262            r1 -- used to load 2us,2vs
263            r4 -- y3,y2
264            r5 -- y1,y0
265            r6 -- u1,u0
266            r7 -- v1,v0
267         */
268                                                               r2=[i1++]; // oy
269 .L0555:
270         /*
271         rrrrrrrr gggggggg bbbbbbbb
272          5432109876543210
273                     bbbbb >>3
274                gggggggg   <<2
275           rrrrrrrr        <<7
276          xrrrrrgggggbbbbb
277         */
279         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
280         (r7,r6) = byteop16m (r1:0, r3:2) (r);
281         r5 = r5 << 2 (v);                                                // y1,y0
282         r4 = r4 << 2 (v);                                                // y3,y2
283         r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
284         r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
285         /* Y' = y*cy */
286         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
288         /* R = Y+ crv*(Cr-128) */
289         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
290                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
291         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
292         r2 = r2 >> 3 (v);
293         r3 = r2 & r5;
295         /* B = Y+ cbu*(Cb-128) */
296         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
297                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
298         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
299         r2 = r2 << 7 (v);
300         r2 = r2 & r5;
301         r3 = r3 | r2;
303         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
304                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
305         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
306         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
307         r2 = r2 << 2 (v);
308         r2 = r2 & r5;
309         r3 = r3 | r2;
310         [p1++]=r3                                          || r1=[i1++]; // cy
312         /* Y' = y*cy */
314         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
316         /* R = Y+ crv*(Cr-128) */
317         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
318                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
319         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
320         r2 = r2 >> 3 (v);
321         r3 = r2 & r5;
323         /* B = Y+ cbu*(Cb-128) */
324         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
325                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
326         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
327         r2 = r2 << 7 (v);
328         r2 = r2 & r5;
329         r3 = r3 | r2;
331         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
332                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
333         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
334         r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y
335         r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
336         r2 = r2 & r5;
337         r3 = r3 | r2;
338         [p1++]=r3                                          || r1.h=w[i3++]; // 2v
340 .L1555:                                                       r2=[i1++]; // oy
342         l1 = 0;
344         (r7:4) = [sp++];
345         unlink;
346         rts;
347 DEFUN_END(yuv2rgb555_line)
349 DEFUN(yuv2rgb24_line,MEM,
350    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
351         link 0;
352         [--sp] = (r7:4);
353         p1 = [fp+ARG_OUT];
354         r3 = [fp+ARG_W];
355         p2 = p1;
356         p2 += 3;
358         i0 = r0;
359         i2 = r1;
360         i3 = r2;
362         r0 = [fp+ARG_COEFF]; // coeff buffer
363         i1 = r0;
364         b1 = i1;
365         l1 = COEFF_LEN;
366         m0 = COEFF_REL_CY_OFF;
367         p0 = r3;
369         r0   = [i0++];         // 2Y
370         r1.l = w[i2++];        // 2u
371         r1.h = w[i3++];        // 2v
372         p0 = p0>>2;
374         lsetup (.L0888, .L1888) lc0 = p0;
376         /*
377            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
378            r0 -- used to load 4ys
379            r1 -- used to load 2us,2vs
380            r4 -- y3,y2
381            r5 -- y1,y0
382            r6 -- u1,u0
383            r7 -- v1,v0
384         */
385                                                               r2=[i1++]; // oy
386 .L0888:
387         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
388         (r7,r6) = byteop16m (r1:0, r3:2) (r);
389         r5 = r5 << 2 (v);               // y1,y0
390         r4 = r4 << 2 (v);               // y3,y2
391         r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
392         r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy
394         /* Y' = y*cy */
395         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
397         /* R = Y+ crv*(Cr-128) */
398         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
399                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
400         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
401         r2=r2>>16 || B[p1++]=r2;
402                      B[p2++]=r2;
404         /* B = Y+ cbu*(Cb-128) */
405         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
406                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
407         r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
409         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
410                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
411         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
412         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, oy,cy,zero
414         r2=r2>>16 || B[p1++]=r2;
415                      B[p2++]=r2;
417         r3=r3>>16 || B[p1++]=r3;
418                      B[p2++]=r3                            || r1=[i1++]; // cy
420         p1+=3;
421         p2+=3;
422         /* Y' = y*cy */
423         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
425         /* R = Y+ crv*(Cr-128) */
426         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
427                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
428         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
429         r2=r2>>16 || B[p1++]=r2;
430         B[p2++]=r2;
432         /* B = Y+ cbu*(Cb-128) */
433         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
434                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
435         r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
437         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
438                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
439         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
440         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask
441         r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y
442                      B[p2++]=r2 || r1.l = w[i2++]; // 2u
443         r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
444                      B[p2++]=r3 || r2=[i1++];      // oy
446         p1+=3;
447 .L1888: p2+=3;
449         l1 = 0;
451         (r7:4) = [sp++];
452         unlink;
453         rts;
454 DEFUN_END(yuv2rgb24_line)
458 #define ARG_vdst        20
459 #define ARG_width       24
460 #define ARG_height      28
461 #define ARG_lumStride   32
462 #define ARG_chromStride 36
463 #define ARG_srcStride   40
465 DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
466                          long width, long height,
467                          long lumStride, long chromStride, long srcStride)):
468         link 0;
469         [--sp] = (r7:4,p5:4);
471         p0 = r1;       // Y top even
473         i2 = r2; // *u
474         r2 = [fp + ARG_vdst];
475         i3 = r2; // *v
477         r1 = [fp + ARG_srcStride];
478         r2 = r0 + r1;
479         r1 += -8;  // i0,i1 is pre read need to correct
480         m0 = r1;
482         i0 = r0;  // uyvy_T even
483         i1 = r2;  // uyvy_B odd
485         p2 = [fp + ARG_lumStride];
486         p1 = p0 + p2;  // Y bot odd
488         p5 = [fp + ARG_width];
489         p4 = [fp + ARG_height];
490         r0 = p5;
491         p4 = p4 >> 1;
492         p5 = p5 >> 2;
494         r2 = [fp + ARG_chromStride];
495         r0 = r0 >> 1;
496         r2 = r2 - r0;
497         m1 = r2;
499         /*   I0,I1 - src input line pointers
500          *   p0,p1 - luma output line pointers
501          *   I2    - dstU
502          *   I3    - dstV
503          */
505         lsetup (0f, 1f) lc1 = p4;   // H/2
506 0:        r0 = [i0++] || r2 = [i1++];
507           r1 = [i0++] || r3 = [i1++];
508           r4 = byteop1p(r1:0, r3:2);
509           r5 = byteop1p(r1:0, r3:2) (r);
510           lsetup (2f, 3f) lc0 = p5; // W/4
511 2:          r0 = r0 >> 8(v);
512             r1 = r1 >> 8(v);
513             r2 = r2 >> 8(v);
514             r3 = r3 >> 8(v);
515             r0 = bytepack(r0, r1);
516             r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
517             r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
518             r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
519             r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
520             r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
521 3:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv
523           i0 += m0;
524           i1 += m0;
525           i2 += m1;
526           i3 += m1;
527           p0 = p0 + p2;
528 1:        p1 = p1 + p2;
530         (r7:4,p5:4) = [sp++];
531         unlink;
532         rts;
533 DEFUN_END(uyvytoyv12)
535 DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
536                          long width, long height,
537                          long lumStride, long chromStride, long srcStride)):
538         link 0;
539         [--sp] = (r7:4,p5:4);
541         p0 = r1;       // Y top even
543         i2 = r2; // *u
544         r2 = [fp + ARG_vdst];
545         i3 = r2; // *v
547         r1 = [fp + ARG_srcStride];
548         r2 = r0 + r1;
549         r1 += -8;  // i0,i1 is pre read need to correct
550         m0 = r1;
552         i0 = r0;  // uyvy_T even
553         i1 = r2;  // uyvy_B odd
555         p2 = [fp + ARG_lumStride];
556         p1 = p0 + p2;  // Y bot odd
558         p5 = [fp + ARG_width];
559         p4 = [fp + ARG_height];
560         r0 = p5;
561         p4 = p4 >> 1;
562         p5 = p5 >> 2;
564         r2 = [fp + ARG_chromStride];
565         r0 = r0 >> 1;
566         r2 = r2 - r0;
567         m1 = r2;
569         /*   I0,I1 - src input line pointers
570          *   p0,p1 - luma output line pointers
571          *   I2    - dstU
572          *   I3    - dstV
573          */
575         lsetup (0f, 1f) lc1 = p4;   // H/2
576 0:        r0 = [i0++] || r2 = [i1++];
577           r1 = [i0++] || r3 = [i1++];
578           r4 = bytepack(r0, r1);
579           r5 = bytepack(r2, r3);
580           lsetup (2f, 3f) lc0 = p5; // W/4
581 2:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
582             r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
583             r2 = r2 >> 8(v);
584             r3 = r3 >> 8(v);
585             r4 = byteop1p(r1:0, r3:2);
586             r5 = byteop1p(r1:0, r3:2) (r);
587             r6 = pack(r5.l, r4.l);
588             r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
589             r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
590             r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
591 3:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv
593           i0 += m0;
594           i1 += m0;
595           i2 += m1;
596           i3 += m1;
597           p0 = p0 + p2;
598 1:        p1 = p1 + p2;
600         (r7:4,p5:4) = [sp++];
601         unlink;
602         rts;
603 DEFUN_END(yuyvtoyv12)