synced with r26967
[mplayer/glamo.git] / libswscale / internal_bfin.S
blob2d5c61a14acd7fc07f3fbf6a6cb75cae8c57a8e0
1 /*
2  * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3  *                    April 20, 2007
4  *
5  * Blackfin Video Color Space Converters Operations
6  *  convert I420 YV12 to RGB in various formats,
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
27     YUV420 to RGB565 conversion.  This routine takes a YUV 420 planar macroblock
28     and converts it to RGB565.  R:5 bits, G:6 bits, B:5 bits.. packed into shorts
31     The following calculation is used for the conversion:
33       r = clipz((y-oy)*cy  + crv*(v-128))
34       g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
35       b = clipz((y-oy)*cy  + cbu*(u-128))
37     y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
40     New factorization to eliminate the truncation error which was
41     occurring due to the byteop3p.
44   1) use byteop16m to subtract quad bytes; it operates on U8 data,
45    so the offsets need to be renormalized to 8 bits.
47   2) scale operands up by a factor of 4 not 8 because Blackfin
48      multiplies include a shift.
50   3) compute into the accumulators cy*yx0, cy*yx1
52   4) compute each of the linear equations
53       r = clipz((y-oy)*cy  + crv*(v-128))
55       g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
57       b = clipz((y-oy)*cy  + cbu*(u-128))
59      reuse of the accumulators requires that we actually multiply
60      twice: once with addition and the second time with a subtraction.
62      because of this we need to compute the equations in the order R B
63      then G saving the writes for B in the case of 24/32 bit color
64      formats.
66     api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67                        int dW, uint32_t *coeffs);
69         A          B
70         ---        ---
71         i2 = cb    i3 = cr
72         i1 = coeff i0 = y
74   Where coeffs have the following layout in memory.
76   uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
78   coeffs is a pointer to oy.
80   the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
81   replication is used to simplify the internal algorithms for the dual mac architecture
82   of BlackFin.
84   All routines are exported with _ff_bfin_ as a symbol prefix
86   rough performance gain compared against -O3:
88   2779809/1484290 187.28%
90   which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91   c/pel for the optimized implementations. Not sure why there is such a
92   huge variation on the reference codes on Blackfin I guess it must have
93   to do with the memory system.
/* Section selection for the routines in this file.
 *   mL3 - the ordinary .text section.
 *   mL1 - .l1.text when __FDPIC__ is defined, otherwise the same as mL3
 *         (presumably on-chip L1 instruction memory -- confirm against the
 *         linker script).
 *   MEM - the section handed to DEFUN by the yuv2rgb*_line routines.
 */
97 #define mL3 .text
98 #ifdef __FDPIC__
99 #define mL1 .l1.text
100 #else
101 #define mL1 mL3
102 #endif
103 #define MEM mL1
/* DEFUN(fname, where, interface)
 *   Opens a function definition: switches to section `where`, exports the
 *   symbol _ff_bfin_<fname>, marks it as a function, aligns it and emits the
 *   symbol name.  The caller supplies the trailing ':' that turns the name
 *   into a label.  `interface` does not appear in the expansion; it only
 *   carries the C prototype for the reader.
 *
 * DEFUN_END(fname)
 *   Closes the definition by recording the symbol size as the distance from
 *   the label to the current location.
 */
105 #define DEFUN(fname,where,interface) \
106         .section where;              \
107         .global _ff_bfin_ ## fname;  \
108         .type _ff_bfin_ ## fname, STT_FUNC; \
109         .align 8;                    \
110         _ff_bfin_ ## fname
112 #define DEFUN_END(fname) \
113         .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
116 .text
/* Coefficient table geometry used by the yuv2rgb*_line routines.
 *   COEFF_LEN        - table size in bytes: 11 32-bit entries.  It is loaded
 *                      into l1 so that i1 wraps around the table.
 *   COEFF_REL_CY_OFF - 16-byte post-modify loaded into m0.  Applied after the
 *                      last entry (byte offset 40) it wraps i1 to byte offset
 *                      12, i.e. the fourth entry (cy).
 */
118 #define COEFF_LEN        11*4
119 #define COEFF_REL_CY_OFF 4*4
/* fp-relative offsets (after `link 0`) of the arguments the yuv2rgb*_line
 * routines read from the stack: out, dW and coeffs.  Y, U and V are taken
 * from r0, r1 and r2.
 */
121 #define ARG_OUT   20
122 #define ARG_W     24
123 #define ARG_COEFF 28
/*
 * _ff_bfin_yuv2rgb565_line
 *
 * Converts one line of planar YUV 4:2:0 to 16-bit packed RGB (5:6:5 layout,
 * see the bit diagram inside the loop).
 *
 *   r0, r1, r2       Y, U, V source pointers (moved to i0, i2, i3)
 *   [fp+ARG_OUT]     destination pointer (p1)
 *   [fp+ARG_W]       dW; the hardware loop runs dW >> 2 times
 *   [fp+ARG_COEFF]   coefficient table, walked circularly through i1
 *
 * Each loop iteration consumes 4 Y, 2 U and 2 V bytes and stores two 32-bit
 * words (two 16-bit pixels each).  The first half of the loop body handles
 * y1,y0 with u0/v0, the second half y3,y2 with u1/v1.
 *
 * The loop is software pipelined: the Y/U/V group and the `oy` coefficient
 * for the next iteration are loaded at the end of the current one, so one
 * extra group is read after the last group that is converted.
 */
125 DEFUN(yuv2rgb565_line,MEM,
126    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
127         link 0;
128         [--sp] = (r7:4);
129         p1 = [fp+ARG_OUT];
130         r3 = [fp+ARG_W];
132         i0 = r0;
133         i2 = r1;
134         i3 = r2;
        // i1 walks the coefficient table; b1/l1 make it wrap after COEFF_LEN bytes.
136         r0 = [fp+ARG_COEFF];
137         i1 = r0;
138         b1 = i1;
139         l1 = COEFF_LEN;
        // m0 is the post-modify used after gmask to land on cy again.
140         m0 = COEFF_REL_CY_OFF;
141         p0 = r3;
        // Preload the first group of samples for the pipelined loop.
143         r0   = [i0++];         // 4Y
144         r1.l = w[i2++];        // 2u
145         r1.h = w[i3++];        // 2v
146         p0 = p0>>2;
148         lsetup (.L0565, .L1565) lc0 = p0;
150         /*
151            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
152            r0 -- used to load 4ys
153            r1 -- used to load 2us,2vs
154            r4 -- y3,y2
155            r5 -- y1,y0
156            r6 -- u1,u0
157            r7 -- v1,v0
158         */
159                                                               r2=[i1++]; // oy
160 .L0565:
161         /*
162         rrrrrrrr gggggggg bbbbbbbb
163          5432109876543210
164                     bbbbb >>3
165               gggggggg    <<3
166          rrrrrrrr         <<8
167          rrrrrggggggbbbbb
168         */
169         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
170         (r7,r6) = byteop16m (r1:0, r3:2) (r);
171         r5 = r5 << 2 (v);                                                // y1,y0
172         r4 = r4 << 2 (v);                                                // y3,y2
173         r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
174         r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
        // ---- first pixel pair: y1,y0 with u0 (r6.l) and v0 (r7.l) ----
175         /* Y' = y*cy */
176         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
178         /* R = Y+ crv*(Cr-128) */
        // The chroma term is added, the result extracted, then subtracted
        // again so that a1/a0 hold Y' for the next channel.
179         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
180                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
181         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
182         r2 = r2 >> 3 (v);
183         r3 = r2 & r5;
185         /* B = Y+ cbu*(Cb-128) */
186         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
187                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
188         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
189         r2 = r2 << 8 (v);
190         r2 = r2 & r5;
191         r3 = r3 | r2;
193         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
194                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
195         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        // The ++m0 post-modify wraps i1 from gmask round to cy for the second pair.
196         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
197         r2 = r2 << 3 (v);
198         r2 = r2 & r5;
199         r3 = r3 | r2;
200         [p1++]=r3                                          || r1=[i1++]; // cy
        // ---- second pixel pair: y3,y2 with u1 (r6.h) and v1 (r7.h) ----
202         /* Y' = y*cy */
204         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
206         /* R = Y+ crv*(Cr-128) */
207         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
208                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
209         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
210         r2 = r2 >> 3 (v);
211         r3 = r2 & r5;
213         /* B = Y+ cbu*(Cb-128) */
214         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
215                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
216         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
217         r2 = r2 << 8 (v);
218         r2 = r2 & r5;
219         r3 = r3 | r2;
221         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
222                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
223         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        // Loads for the next iteration are issued alongside the tail of this one.
224         r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 4Y
225         r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
226         r2 = r2 & r5;
227         r3 = r3 | r2;
228         [p1++]=r3                                          || r1.h = w[i3++];        // 2v
229 .L1565:                                                       r2=[i1++]; // oy
        // Switch circular addressing on i1 off again before returning.
231         l1 = 0;
233         (r7:4) = [sp++];
234         unlink;
235         rts;
236 DEFUN_END(yuv2rgb565_line)
/*
 * _ff_bfin_yuv2rgb555_line
 *
 * Converts one line of planar YUV 4:2:0 to 16-bit packed RGB (x:5:5:5 layout,
 * see the bit diagram inside the loop).  Structure and register use match
 * the 565 routine; only the shift amounts applied before masking differ
 * (<<7 and <<2 here).
 *
 *   r0, r1, r2       Y, U, V source pointers (moved to i0, i2, i3)
 *   [fp+ARG_OUT]     destination pointer (p1)
 *   [fp+ARG_W]       dW; the hardware loop runs dW >> 2 times
 *   [fp+ARG_COEFF]   coefficient table, walked circularly through i1
 *
 * Each loop iteration consumes 4 Y, 2 U and 2 V bytes and stores two 32-bit
 * words (two 16-bit pixels each).  The samples for the next iteration are
 * loaded at the end of the current one, so one extra group is read after the
 * last group that is converted.
 */
238 DEFUN(yuv2rgb555_line,MEM,
239    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
240         link 0;
241         [--sp] = (r7:4);
242         p1 = [fp+ARG_OUT];
243         r3 = [fp+ARG_W];
245         i0 = r0;
246         i2 = r1;
247         i3 = r2;
        // i1 walks the coefficient table; b1/l1 make it wrap after COEFF_LEN bytes.
249         r0 = [fp+ARG_COEFF];
250         i1 = r0;
251         b1 = i1;
252         l1 = COEFF_LEN;
        // m0 is the post-modify used after gmask to land on cy again.
253         m0 = COEFF_REL_CY_OFF;
254         p0 = r3;
        // Preload the first group of samples for the pipelined loop.
256         r0   = [i0++];         // 4Y
257         r1.l = w[i2++];        // 2u
258         r1.h = w[i3++];        // 2v
259         p0 = p0>>2;
261         lsetup (.L0555, .L1555) lc0 = p0;
263         /*
264            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
265            r0 -- used to load 4ys
266            r1 -- used to load 2us,2vs
267            r4 -- y3,y2
268            r5 -- y1,y0
269            r6 -- u1,u0
270            r7 -- v1,v0
271         */
272                                                               r2=[i1++]; // oy
273 .L0555:
274         /*
275         rrrrrrrr gggggggg bbbbbbbb
276          5432109876543210
277                     bbbbb >>3
278                gggggggg   <<2
279           rrrrrrrr        <<7
280          xrrrrrgggggbbbbb
281         */
283         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
284         (r7,r6) = byteop16m (r1:0, r3:2) (r);
285         r5 = r5 << 2 (v);                                                // y1,y0
286         r4 = r4 << 2 (v);                                                // y3,y2
287         r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
288         r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
        // ---- first pixel pair: y1,y0 with u0 (r6.l) and v0 (r7.l) ----
289         /* Y' = y*cy */
290         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
292         /* R = Y+ crv*(Cr-128) */
        // The chroma term is added, the result extracted, then subtracted
        // again so that a1/a0 hold Y' for the next channel.
293         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
294                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
295         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
296         r2 = r2 >> 3 (v);
297         r3 = r2 & r5;
299         /* B = Y+ cbu*(Cb-128) */
300         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
301                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
302         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
303         r2 = r2 << 7 (v);
304         r2 = r2 & r5;
305         r3 = r3 | r2;
307         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
308                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
309         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        // The ++m0 post-modify wraps i1 from gmask round to cy for the second pair.
310         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
311         r2 = r2 << 2 (v);
312         r2 = r2 & r5;
313         r3 = r3 | r2;
314         [p1++]=r3                                          || r1=[i1++]; // cy
        // ---- second pixel pair: y3,y2 with u1 (r6.h) and v1 (r7.h) ----
316         /* Y' = y*cy */
318         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
320         /* R = Y+ crv*(Cr-128) */
321         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
322                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
323         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
324         r2 = r2 >> 3 (v);
325         r3 = r2 & r5;
327         /* B = Y+ cbu*(Cb-128) */
328         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
329                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
330         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
331         r2 = r2 << 7 (v);
332         r2 = r2 & r5;
333         r3 = r3 | r2;
335         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
336                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
337         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        // Loads for the next iteration are issued alongside the tail of this one.
338         r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y
339         r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
340         r2 = r2 & r5;
341         r3 = r3 | r2;
342         [p1++]=r3                                          || r1.h=w[i3++]; // 2v
344 .L1555:                                                       r2=[i1++]; // oy
        // Switch circular addressing on i1 off again before returning.
346         l1 = 0;
348         (r7:4) = [sp++];
349         unlink;
350         rts;
351 DEFUN_END(yuv2rgb555_line)
/*
 * _ff_bfin_yuv2rgb24_line
 *
 * Converts one line of planar YUV 4:2:0 to packed 24-bit output, three bytes
 * per pixel.
 *
 *   r0, r1, r2       Y, U, V source pointers (moved to i0, i2, i3)
 *   [fp+ARG_OUT]     destination pointer (p1); p2 = p1 + 3 addresses the
 *                    second pixel of each pair
 *   [fp+ARG_W]       dW; the hardware loop runs dW >> 2 times
 *   [fp+ARG_COEFF]   coefficient table, walked circularly through i1
 *
 * Each channel is computed for two pixels at once (r2.l / r2.h).  The low
 * result goes to [p1], the high one (after >>16) to [p2].  After three bytes
 * both pointers skip the three bytes the other pointer just wrote.  The
 * second computed channel is held in r3 and written after the third, so the
 * bytes land in computation order 1, 3, 2.  Each iteration writes 12 bytes.
 *
 * The samples for the next iteration are loaded at the end of the current
 * one, so one extra group is read after the last group that is converted.
 */
353 DEFUN(yuv2rgb24_line,MEM,
354    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
355         link 0;
356         [--sp] = (r7:4);
357         p1 = [fp+ARG_OUT];
358         r3 = [fp+ARG_W];
359         p2 = p1;
360         p2 += 3;
362         i0 = r0;
363         i2 = r1;
364         i3 = r2;
        // i1 walks the coefficient table; b1/l1 make it wrap after COEFF_LEN bytes.
366         r0 = [fp+ARG_COEFF]; // coeff buffer
367         i1 = r0;
368         b1 = i1;
369         l1 = COEFF_LEN;
        // m0 is the post-modify used after gmask to land on cy again.
370         m0 = COEFF_REL_CY_OFF;
371         p0 = r3;
        // Preload the first group of samples for the pipelined loop.
373         r0   = [i0++];         // 4Y
374         r1.l = w[i2++];        // 2u
375         r1.h = w[i3++];        // 2v
376         p0 = p0>>2;
378         lsetup (.L0888, .L1888) lc0 = p0;
380         /*
381            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
382            r0 -- used to load 4ys
383            r1 -- used to load 2us,2vs
384            r4 -- y3,y2
385            r5 -- y1,y0
386            r6 -- u1,u0
387            r7 -- v1,v0
388         */
389                                                               r2=[i1++]; // oy
390 .L0888:
391         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
392         (r7,r6) = byteop16m (r1:0, r3:2) (r);
393         r5 = r5 << 2 (v);               // y1,y0
394         r4 = r4 << 2 (v);               // y3,y2
395         r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
396         r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy
        // ---- first pixel pair: y1,y0 with u0 (r6.l) and v0 (r7.l) ----
398         /* Y' = y*cy */
399         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
401         /* R = Y+ crv*(Cr-128) */
402         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
403                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
404         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        // Low result byte to the first pixel, then the high half to the second.
405         r2=r2>>16 || B[p1++]=r2;
406                      B[p2++]=r2;
408         /* B = Y+ cbu*(Cb-128) */
409         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
410                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        // Kept in r3; it is stored after G below.
411         r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
413         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
414                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
415         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
416         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, oy,cy,zero
418         r2=r2>>16 || B[p1++]=r2;
419                      B[p2++]=r2;
421         r3=r3>>16 || B[p1++]=r3;
422                      B[p2++]=r3                            || r1=[i1++]; // cy
        // Skip the three bytes the other pointer has just written.
424         p1+=3;
425         p2+=3;
        // ---- second pixel pair: y3,y2 with u1 (r6.h) and v1 (r7.h) ----
426         /* Y' = y*cy */
427         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
429         /* R = Y+ crv*(Cr-128) */
430         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
431                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
432         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
433         r2=r2>>16 || B[p1++]=r2;
434         B[p2++]=r2;
436         /* B = Y+ cbu*(Cb-128) */
437         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
438                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
439         r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
441         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
442                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
443         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
444         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask
        // Stores for this pair overlap with the loads for the next iteration.
445         r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y
446                      B[p2++]=r2 || r1.l = w[i2++]; // 2u
447         r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
448                      B[p2++]=r3 || r2=[i1++];      // oy
450         p1+=3;
451 .L1888: p2+=3;
        // Switch circular addressing on i1 off again before returning.
453         l1 = 0;
455         (r7:4) = [sp++];
456         unlink;
457         rts;
458 DEFUN_END(yuv2rgb24_line)
/* fp-relative offsets (after `link 0`) of the stack arguments of uyvytoyv12
 * and yuyvtoyv12.  src, ydst and udst are taken from r0, r1 and r2; the
 * remaining six arguments are read from the stack in declaration order.
 */
462 #define ARG_vdst        20
463 #define ARG_width       24
464 #define ARG_height      28
465 #define ARG_lumStride   32
466 #define ARG_chromStride 36
467 #define ARG_srcStride   40
/*
 * _ff_bfin_uyvytoyv12
 *
 * Splits packed UYVY (U Y V Y, 2 bytes per pixel) into planar Y, U and V.
 * Two source lines are processed per outer iteration: both luma lines are
 * written out, and the chroma of the two lines is combined byte-wise with
 * byteop1p (quad byte average) into a single chroma line.
 *
 *   r0 = src, r1 = ydst, r2 = udst
 *   stack: vdst, width, height, lumStride, chromStride, srcStride (ARG_*)
 *
 * The outer loop runs height >> 1 times and the inner loop width >> 2 times
 * (4 pixels per pass).  The pointer steps below assume width is a multiple
 * of 4.
 *
 * The inner loop is software pipelined: the two source words for the next
 * pass are loaded during the current one, so every line is read 8 bytes past
 * the data that is actually converted.
 *
 * Pointer stepping between line pairs honours all three strides.  Earlier
 * the source and luma pointers were advanced by srcStride - 8 and lumStride
 * on top of the bytes already consumed in the row, which is only right when
 * srcStride == 2*width and lumStride == width.
 */
469 DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
470                          long width, long height,
471                          long lumStride, long chromStride, long srcStride)):
472         link 0;
473         [--sp] = (r7:4,p5:4);
475         p0 = r1;       // Y top even
477         i2 = r2; // *u
478         r2 = [fp + ARG_vdst];
479         i3 = r2; // *v
481         r1 = [fp + ARG_srcStride];
482         r2 = r0 + r1;
486         i0 = r0;  // uyvy_T even
487         i1 = r2;  // uyvy_B odd
489         p2 = [fp + ARG_lumStride];
490         p1 = p0 + p2;  // Y bot odd
492         p5 = [fp + ARG_width];
493         p4 = [fp + ARG_height];
494         r0 = p5;
        // Source step between line pairs.  A row consumes 2*width bytes plus
        // the 8 bytes of read-ahead, and the next pair starts 2*srcStride
        // after the current one.
        r3 = r0 << 1;  // 2*width
        r1 = r1 << 1;  // 2*srcStride
        r1 = r1 - r3;
        r1 += -8;      // i0,i1 are pre-read, correct for it
        m0 = r1;
        // Luma step between line pairs.  A row writes width bytes, and the
        // next pair starts 2*lumStride after the current one.  p1 has already
        // been derived from the unmodified lumStride above.
        r3 = p2;
        r3 = r3 << 1;  // 2*lumStride
        r3 = r3 - r0;
        p2 = r3;
495         p4 = p4 >> 1;
496         p5 = p5 >> 2;
        // Chroma step: a row writes width/2 bytes to each of U and V.
498         r2 = [fp + ARG_chromStride];
499         r0 = r0 >> 1;
500         r2 = r2 - r0;
501         m1 = r2;
503         /*   I0,I1 - src input line pointers
504          *   p0,p1 - luma output line pointers
505          *   I2    - dstU
506          *   I3    - dstV
507          */
509         lsetup (0f, 1f) lc1 = p4;   // H/2
        // Prime the pipeline: first 4 pixels of the top (r1:0) and bottom
        // (r3:2) line and their byte-wise averages (r4, r5).
510 0:        r0 = [i0++] || r2 = [i1++];
511           r1 = [i0++] || r3 = [i1++];
512           r4 = byteop1p(r1:0, r3:2);
513           r5 = byteop1p(r1:0, r3:2) (r);
514           lsetup (2f, 3f) lc0 = p5; // W/4
        // Y is the high byte of every 16-bit half: shift it down and pack
        // four of them into one word per line.
515 2:          r0 = r0 >> 8(v);
516             r1 = r1 >> 8(v);
517             r2 = r2 >> 8(v);
518             r3 = r3 >> 8(v);
519             r0 = bytepack(r0, r1);
520             r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
        // Gather the averaged chroma: low halves carry U, high halves carry V.
521             r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
522             r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
523             r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
524             r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
525 3:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv
        // Advance every pointer to the start of the next line pair.
527           i0 += m0;
528           i1 += m0;
529           i2 += m1;
530           i3 += m1;
531           p0 = p0 + p2;
532 1:        p1 = p1 + p2;
534         (r7:4,p5:4) = [sp++];
535         unlink;
536         rts;
537 DEFUN_END(uyvytoyv12)
/*
 * _ff_bfin_yuyvtoyv12
 *
 * Splits packed YUYV (Y U Y V, 2 bytes per pixel) into planar Y, U and V.
 * Two source lines are processed per outer iteration: both luma lines are
 * written out, and the chroma of the two lines is combined byte-wise with
 * byteop1p (quad byte average) into a single chroma line.
 *
 *   r0 = src, r1 = ydst, r2 = udst
 *   stack: vdst, width, height, lumStride, chromStride, srcStride (ARG_*)
 *
 * The outer loop runs height >> 1 times and the inner loop width >> 2 times
 * (4 pixels per pass).  The pointer steps below assume width is a multiple
 * of 4.
 *
 * The inner loop is software pipelined: the two source words for the next
 * pass are loaded during the current one, so every line is read 8 bytes past
 * the data that is actually converted.
 *
 * Pointer stepping between line pairs honours all three strides.  Earlier
 * the source and luma pointers were advanced by srcStride - 8 and lumStride
 * on top of the bytes already consumed in the row, which is only right when
 * srcStride == 2*width and lumStride == width.
 */
539 DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
540                          long width, long height,
541                          long lumStride, long chromStride, long srcStride)):
542         link 0;
543         [--sp] = (r7:4,p5:4);
545         p0 = r1;       // Y top even
547         i2 = r2; // *u
548         r2 = [fp + ARG_vdst];
549         i3 = r2; // *v
551         r1 = [fp + ARG_srcStride];
552         r2 = r0 + r1;
556         i0 = r0;  // yuyv_T even
557         i1 = r2;  // yuyv_B odd
559         p2 = [fp + ARG_lumStride];
560         p1 = p0 + p2;  // Y bot odd
562         p5 = [fp + ARG_width];
563         p4 = [fp + ARG_height];
564         r0 = p5;
        // Source step between line pairs.  A row consumes 2*width bytes plus
        // the 8 bytes of read-ahead, and the next pair starts 2*srcStride
        // after the current one.
        r3 = r0 << 1;  // 2*width
        r1 = r1 << 1;  // 2*srcStride
        r1 = r1 - r3;
        r1 += -8;      // i0,i1 are pre-read, correct for it
        m0 = r1;
        // Luma step between line pairs.  A row writes width bytes, and the
        // next pair starts 2*lumStride after the current one.  p1 has already
        // been derived from the unmodified lumStride above.
        r3 = p2;
        r3 = r3 << 1;  // 2*lumStride
        r3 = r3 - r0;
        p2 = r3;
565         p4 = p4 >> 1;
566         p5 = p5 >> 2;
        // Chroma step: a row writes width/2 bytes to each of U and V.
568         r2 = [fp + ARG_chromStride];
569         r0 = r0 >> 1;
570         r2 = r2 - r0;
571         m1 = r2;
573         /*   I0,I1 - src input line pointers
574          *   p0,p1 - luma output line pointers
575          *   I2    - dstU
576          *   I3    - dstV
577          */
579         lsetup (0f, 1f) lc1 = p4;   // H/2
        // Prime the pipeline: first 4 pixels of the top (r1:0) and bottom
        // (r3:2) line.  Y is the low byte of every 16-bit half, so bytepack
        // yields the four luma bytes directly.
580 0:        r0 = [i0++] || r2 = [i1++];
581           r1 = [i0++] || r3 = [i1++];
582           r4 = bytepack(r0, r1);
583           r5 = bytepack(r2, r3);
584           lsetup (2f, 3f) lc0 = p5; // W/4
        // Store the packed luma while shifting the chroma bytes down into
        // the low byte of every half.
585 2:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
586             r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
587             r2 = r2 >> 8(v);
588             r3 = r3 >> 8(v);
        // Average top and bottom chroma, then gather U (low halves) and
        // V (high halves).
589             r4 = byteop1p(r1:0, r3:2);
590             r5 = byteop1p(r1:0, r3:2) (r);
591             r6 = pack(r5.l, r4.l);
592             r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
593             r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
594             r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
595 3:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv
        // Advance every pointer to the start of the next line pair.
597           i0 += m0;
598           i1 += m0;
599           i2 += m1;
600           i3 += m1;
601           p0 = p0 + p2;
602 1:        p1 = p1 + p2;
604         (r7:4,p5:4) = [sp++];
605         unlink;
606         rts;
607 DEFUN_END(yuyvtoyv12)