Merge svn changes up to r28366
[mplayer.git] / libswscale / internal_bfin.S
blobfb7bda7e1214d6d695ca3ab73f9c1d0906da957d
1 /*
2  * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3  *                    April 20, 2007
4  *
5  * Blackfin video color space converter operations
6  * convert I420 YV12 to RGB in various formats
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
27 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28 and converts it to RGB565: R 5 bits, G 6 bits, B 5 bits, packed into shorts.
31 The following calculation is used for the conversion:
33   r = clipz((y-oy)*cy  + crv*(v-128))
34   g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
35   b = clipz((y-oy)*cy  + cbu*(u-128))
37 y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
40 New factorization to eliminate the truncation error which was
41 occurring due to the byteop3p.
44 1) Use byteop16m to subtract quad bytes. It operates on U8 values,
45    so the offsets need to be renormalized to 8 bits.
47 2) Scale operands up by a factor of 4 not 8 because Blackfin
48    multiplies include a shift.
50 3) Compute into the accumulators cy*yx0, cy*yx1.
52 4) Compute each of the linear equations:
53      r = clipz((y - oy) * cy  + crv * (v - 128))
55      g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))
57      b = clipz((y - oy) * cy  + cbu * (u - 128))
59    Reuse of the accumulators requires that we actually multiply
60    twice: once with an addition and a second time with a subtraction.
62    Because of this we need to compute the equations in the order R, B,
63    then G, saving the writes for B in the case of 24/32 bit color
64    formats.
66    API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67                       int dW, uint32_t *coeffs);
69        A          B
70        ---        ---
71        i2 = cb    i3 = cr
72        i1 = coeff i0 = y
74 Where coeffs have the following layout in memory.
76 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
78 coeffs is a pointer to oy.
80 The {rgb} masks are only utilized by the 565 and 555 packing algorithms. Note
81 that data replication is used to simplify the internal algorithms for the
82 dual-MAC architecture of Blackfin.
84 All routines are exported with _ff_bfin_ as a symbol prefix.
86 Rough performance gain compared against -O3:
88 2779809/1484290 187.28%
90 which translates to ~33c/pel to ~57c/pel for the reference vs. 17.5
91 c/pel for the optimized implementations. It is not clear why there is such
92 a huge variation in the reference code on Blackfin; presumably it has
93 to do with the memory system.
/* Section selection.  mL3 is the ordinary .text section.  mL1 is .l1.text
 * when __FDPIC__ is defined and otherwise falls back to mL3.  MEM is the
 * section name handed to DEFUN for the yuv2rgb line converters. */
96 #define mL3 .text
97 #ifdef __FDPIC__
98 #define mL1 .l1.text
99 #else
100 #define mL1 mL3
101 #endif
102 #define MEM mL1
/* DEFUN(fname, where, interface): switch to section `where`, declare
 * _ff_bfin_<fname> as a global symbol of function type, align, and emit the
 * symbol name.  The use site appends the ':' that turns it into a label.
 * `interface` is documentation only; it does not appear in the expansion. */
104 #define DEFUN(fname,where,interface) \
105         .section where;              \
106         .global _ff_bfin_ ## fname;  \
107         .type _ff_bfin_ ## fname, STT_FUNC; \
108         .align 8;                    \
109         _ff_bfin_ ## fname
/* DEFUN_END(fname): set the symbol size to the distance from the
 * _ff_bfin_<fname> label to the current location. */
111 #define DEFUN_END(fname) \
112         .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
115 .text
/* Byte length of the coefficient table (11 words of 4 bytes); loaded into l1
 * as the length of the buffer that i1 walks. */
117 #define COEFF_LEN        11*4
/* Byte step (4 words) loaded into m0 and applied with the first gmask load of
 * each loop pass, so that the next table read is labelled cy. */
118 #define COEFF_REL_CY_OFF 4*4
/* Frame-pointer offsets used to fetch the out, dW and coeffs arguments of the
 * yuv2rgb*_line routines. */
120 #define ARG_OUT   20
121 #define ARG_W     24
122 #define ARG_COEFF 28
/*
 * yuv2rgb565_line: convert one line of planar YUV to packed 16-bit pixels.
 *
 * Y, U and V arrive in r0, r1 and r2; out, dW and coeffs are read from the
 * frame at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.  The loop runs dW>>2
 * times.  Each pass consumes 4 Y bytes, 2 U bytes and 2 V bytes and stores
 * two 32-bit words through p1.
 *
 * The inputs for a pass are loaded at the end of the previous pass (and once
 * before the loop), so one extra group of Y/U/V bytes is read beyond the last
 * group that is converted.
 */
124 DEFUN(yuv2rgb565_line,MEM,
125    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
126         link 0;
            // r4-r7 are used as work registers; save them here, restore before rts.
127         [--sp] = (r7:4);
128         p1 = [fp+ARG_OUT];
129         r3 = [fp+ARG_W];
            // Input pointers: i0 = Y, i2 = U (cb), i3 = V (cr).
131         i0 = r0;
132         i2 = r1;
133         i3 = r2;
            // i1 walks the coefficient table; b1/l1 give it a base and a length of
            // COEFF_LEN bytes so the same table is traversed again on every pass.
135         r0 = [fp+ARG_COEFF];
136         i1 = r0;
137         b1 = i1;
138         l1 = COEFF_LEN;
139         m0 = COEFF_REL_CY_OFF;
140         p0 = r3;
            // Preload the inputs of the first pass.
142         r0   = [i0++];         // 4 Y bytes
143         r1.l = w[i2++];        // 2 U bytes
144         r1.h = w[i3++];        // 2 V bytes
            // Loop count: dW / 4.
145         p0 = p0>>2;
147         lsetup (.L0565, .L1565) lc0 = p0;
149         /*
150            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
151            r0 -- used to load 4ys
152            r1 -- used to load 2us,2vs
153            r4 -- y3,y2
154            r5 -- y1,y0
155            r6 -- u1,u0
156            r7 -- v1,v0
157         */
158                                                               r2=[i1++]; // oy
159 .L0565:
160         /*
161         rrrrrrrr gggggggg bbbbbbbb
162          5432109876543210
163                     bbbbb >>3
164               gggggggg    <<3
165          rrrrrrrr         <<8
166          rrrrrggggggbbbbb
167         */
            // NOTE(review): the code below shifts the value labelled R by >>3 and the
            // value labelled B by <<8, the reverse of the diagram above.  Presumably
            // the caller orders crv/rmask and cbu/bmask in the table to match --
            // confirm against the code that builds the coefficient table.
168         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
169         (r7,r6) = byteop16m (r1:0, r3:2) (r);
170         r5 = r5 << 2 (v);                                                // y1,y0
171         r4 = r4 << 2 (v);                                                // y3,y2
172         r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
173         r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
            // First word: pixels from y1,y0, both using chroma sample 0 (r6.l, r7.l).
174         /* Y' = y*cy */
175         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
177         /* R = Y+ crv*(Cr-128) */
178         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
            // Undo the chroma term so a1/a0 hold Y' again for the next channel.
179                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
180         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
181         r2 = r2 >> 3 (v);
182         r3 = r2 & r5;
184         /* B = Y+ cbu*(Cb-128) */
185         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
186                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
187         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
188         r2 = r2 << 8 (v);
189         r2 = r2 & r5;
190         r3 = r3 | r2;
192         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
193                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
194         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
195         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask; the m0 step makes the next read cy
196         r2 = r2 << 3 (v);
197         r2 = r2 & r5;
198         r3 = r3 | r2;
199         [p1++]=r3                                          || r1=[i1++]; // cy
            // Second word: pixels from y3,y2, both using chroma sample 1 (r6.h, r7.h).
201         /* Y' = y*cy */
203         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
205         /* R = Y+ crv*(Cr-128) */
206         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
207                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
208         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
209         r2 = r2 >> 3 (v);
210         r3 = r2 & r5;
212         /* B = Y+ cbu*(Cb-128) */
213         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
214                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
215         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
216         r2 = r2 << 8 (v);
217         r2 = r2 & r5;
218         r3 = r3 | r2;
220         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
221                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
222         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
            // Inputs for the next pass are fetched alongside the final packing steps.
223         r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 4 Y bytes
224         r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2 U bytes
225         r2 = r2 & r5;
226         r3 = r3 | r2;
227         [p1++]=r3                                          || r1.h = w[i3++];        // 2 V bytes
228 .L1565:                                                       r2=[i1++]; // oy
            // Clear the i1 buffer length again before returning.
230         l1 = 0;
232         (r7:4) = [sp++];
233         unlink;
234         rts;
235 DEFUN_END(yuv2rgb565_line)
/*
 * yuv2rgb555_line: convert one line of planar YUV to packed 16-bit pixels
 * using the 5/5/5 shifts shown in the diagram below.
 *
 * Y, U and V arrive in r0, r1 and r2; out, dW and coeffs are read from the
 * frame at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.  The loop runs dW>>2
 * times.  Each pass consumes 4 Y bytes, 2 U bytes and 2 V bytes and stores
 * two 32-bit words through p1.
 *
 * The inputs for a pass are loaded at the end of the previous pass (and once
 * before the loop), so one extra group of Y/U/V bytes is read beyond the last
 * group that is converted.
 */
237 DEFUN(yuv2rgb555_line,MEM,
238    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
239         link 0;
            // r4-r7 are used as work registers; save them here, restore before rts.
240         [--sp] = (r7:4);
241         p1 = [fp+ARG_OUT];
242         r3 = [fp+ARG_W];
            // Input pointers: i0 = Y, i2 = U (cb), i3 = V (cr).
244         i0 = r0;
245         i2 = r1;
246         i3 = r2;
            // i1 walks the coefficient table; b1/l1 give it a base and a length of
            // COEFF_LEN bytes so the same table is traversed again on every pass.
248         r0 = [fp+ARG_COEFF];
249         i1 = r0;
250         b1 = i1;
251         l1 = COEFF_LEN;
252         m0 = COEFF_REL_CY_OFF;
253         p0 = r3;
            // Preload the inputs of the first pass.
255         r0   = [i0++];         // 4 Y bytes
256         r1.l = w[i2++];        // 2 U bytes
257         r1.h = w[i3++];        // 2 V bytes
            // Loop count: dW / 4.
258         p0 = p0>>2;
260         lsetup (.L0555, .L1555) lc0 = p0;
262         /*
263            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
264            r0 -- used to load 4ys
265            r1 -- used to load 2us,2vs
266            r4 -- y3,y2
267            r5 -- y1,y0
268            r6 -- u1,u0
269            r7 -- v1,v0
270         */
271                                                               r2=[i1++]; // oy
272 .L0555:
273         /*
274         rrrrrrrr gggggggg bbbbbbbb
275          5432109876543210
276                     bbbbb >>3
277                gggggggg   <<2
278           rrrrrrrr        <<7
279          xrrrrrgggggbbbbb
280         */
            // NOTE(review): the code below shifts the value labelled R by >>3 and the
            // value labelled B by <<7, the reverse of the diagram above.  Presumably
            // the caller orders crv/rmask and cbu/bmask in the table to match --
            // confirm against the code that builds the coefficient table.
282         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
283         (r7,r6) = byteop16m (r1:0, r3:2) (r);
284         r5 = r5 << 2 (v);                                                // y1,y0
285         r4 = r4 << 2 (v);                                                // y3,y2
286         r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
287         r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
            // First word: pixels from y1,y0, both using chroma sample 0 (r6.l, r7.l).
288         /* Y' = y*cy */
289         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
291         /* R = Y+ crv*(Cr-128) */
292         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
            // Undo the chroma term so a1/a0 hold Y' again for the next channel.
293                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
294         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
295         r2 = r2 >> 3 (v);
296         r3 = r2 & r5;
298         /* B = Y+ cbu*(Cb-128) */
299         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
300                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
301         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
302         r2 = r2 << 7 (v);
303         r2 = r2 & r5;
304         r3 = r3 | r2;
306         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
307                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
308         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
309         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask; the m0 step makes the next read cy
310         r2 = r2 << 2 (v);
311         r2 = r2 & r5;
312         r3 = r3 | r2;
313         [p1++]=r3                                          || r1=[i1++]; // cy
            // Second word: pixels from y3,y2, both using chroma sample 1 (r6.h, r7.h).
315         /* Y' = y*cy */
317         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
319         /* R = Y+ crv*(Cr-128) */
320         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
321                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
322         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
323         r2 = r2 >> 3 (v);
324         r3 = r2 & r5;
326         /* B = Y+ cbu*(Cb-128) */
327         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
328                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
329         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
330         r2 = r2 << 7 (v);
331         r2 = r2 & r5;
332         r3 = r3 | r2;
334         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
335                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
336         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
            // Inputs for the next pass are fetched alongside the final packing steps.
337         r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4 Y bytes
338         r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2 U bytes
339         r2 = r2 & r5;
340         r3 = r3 | r2;
341         [p1++]=r3                                          || r1.h=w[i3++]; // 2 V bytes
343 .L1555:                                                       r2=[i1++]; // oy
            // Clear the i1 buffer length again before returning.
345         l1 = 0;
347         (r7:4) = [sp++];
348         unlink;
349         rts;
350 DEFUN_END(yuv2rgb555_line)
/*
 * yuv2rgb24_line: convert one line of planar YUV to 3-byte-per-pixel output.
 *
 * Y, U and V arrive in r0, r1 and r2; out, dW and coeffs are read from the
 * frame at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.  The loop runs dW>>2
 * times.  Each pass consumes 4 Y bytes, 2 U bytes and 2 V bytes and stores
 * 12 output bytes.
 *
 * Two byte pointers are used: p1 for the first pixel of a pair and p2 = p1+3
 * for the second.  Each writes 3 bytes (in the order the code labels R, G, B)
 * and then skips the 3 bytes the other pointer wrote.
 *
 * The inputs for a pass are loaded at the end of the previous pass (and once
 * before the loop), so one extra group of Y/U/V bytes is read beyond the last
 * group that is converted.
 */
352 DEFUN(yuv2rgb24_line,MEM,
353    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
354         link 0;
            // r4-r7 are used as work registers; save them here, restore before rts.
355         [--sp] = (r7:4);
356         p1 = [fp+ARG_OUT];
357         r3 = [fp+ARG_W];
358         p2 = p1;
359         p2 += 3;
            // Input pointers: i0 = Y, i2 = U (cb), i3 = V (cr).
361         i0 = r0;
362         i2 = r1;
363         i3 = r2;
            // i1 walks the coefficient table; b1/l1 give it a base and a length of
            // COEFF_LEN bytes so the same table is traversed again on every pass.
365         r0 = [fp+ARG_COEFF]; // coeff buffer
366         i1 = r0;
367         b1 = i1;
368         l1 = COEFF_LEN;
369         m0 = COEFF_REL_CY_OFF;
370         p0 = r3;
            // Preload the inputs of the first pass.
372         r0   = [i0++];         // 4 Y bytes
373         r1.l = w[i2++];        // 2 U bytes
374         r1.h = w[i3++];        // 2 V bytes
            // Loop count: dW / 4.
375         p0 = p0>>2;
377         lsetup (.L0888, .L1888) lc0 = p0;
379         /*
380            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
381            r0 -- used to load 4ys
382            r1 -- used to load 2us,2vs
383            r4 -- y3,y2
384            r5 -- y1,y0
385            r6 -- u1,u0
386            r7 -- v1,v0
387         */
            // The rmask/bmask/gmask loads into r5 below are never used for masking in
            // this routine; they keep i1 stepping through the table in the same
            // order as the 16-bit converters.
388                                                               r2=[i1++]; // oy
389 .L0888:
390         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
391         (r7,r6) = byteop16m (r1:0, r3:2) (r);
392         r5 = r5 << 2 (v);               // y1,y0
393         r4 = r4 << 2 (v);               // y3,y2
394         r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
395         r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy
            // First pixel pair: y1,y0 with chroma sample 0 (r6.l, r7.l).
397         /* Y' = y*cy */
398         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
400         /* R = Y+ crv*(Cr-128) */
401         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
            // Undo the chroma term so a1/a0 hold Y' again for the next channel.
402                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
403         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
            // One byte goes to the first pixel (p1); after the 16-bit shift the other
            // half goes to the second pixel (p2).
404         r2=r2>>16 || B[p1++]=r2;
405                      B[p2++]=r2;
407         /* B = Y+ cbu*(Cb-128) */
408         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
409                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
            // B is kept in r3 and written after G.
410         r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
412         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
413                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
414         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
415         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask; the m0 step makes the next read cy
417         r2=r2>>16 || B[p1++]=r2;
418                      B[p2++]=r2;
420         r3=r3>>16 || B[p1++]=r3;
421                      B[p2++]=r3                            || r1=[i1++]; // cy
            // Step both pointers over the 3 bytes the other one has just written.
423         p1+=3;
424         p2+=3;
            // Second pixel pair: y3,y2 with chroma sample 1 (r6.h, r7.h).
425         /* Y' = y*cy */
426         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
428         /* R = Y+ crv*(Cr-128) */
429         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
430                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
431         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
432         r2=r2>>16 || B[p1++]=r2;
433         B[p2++]=r2;
435         /* B = Y+ cbu*(Cb-128) */
436         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
437                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
438         r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
440         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
441                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
442         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
443         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask
            // Inputs for the next pass are fetched alongside the last four byte stores.
444         r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4 Y bytes
445                      B[p2++]=r2 || r1.l = w[i2++]; // 2 U bytes
446         r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2 V bytes
447                      B[p2++]=r3 || r2=[i1++];      // oy
449         p1+=3;
450 .L1888: p2+=3;
            // Clear the i1 buffer length again before returning.
452         l1 = 0;
454         (r7:4) = [sp++];
455         unlink;
456         rts;
457 DEFUN_END(yuv2rgb24_line)
/* Frame-pointer offsets used by uyvytoyv12 and yuyvtoyv12 to fetch the
 * arguments from vdst onwards; src, ydst and udst are taken from r0, r1
 * and r2 instead. */
461 #define ARG_vdst        20
462 #define ARG_width       24
463 #define ARG_height      28
464 #define ARG_lumStride   32
465 #define ARG_chromStride 36
466 #define ARG_srcStride   40
/*
 * uyvytoyv12: split packed UYVY into separate Y, U and V planes.
 *
 * src, ydst and udst arrive in r0, r1 and r2; vdst, width, height and the
 * three strides are read from the frame.  Two source lines are handled per
 * outer pass (height>>1 passes).  Each inner pass (width>>2 of them) reads
 * 8 bytes from both lines and stores 4 Y bytes per line plus 2 U and 2 V
 * bytes, the chroma being the byteop1p combination of the two lines.
 *
 * The end-of-pair pointer steps subtract what the inner loop already
 * consumed, so srcStride may exceed 2*width and lumStride may exceed width.
 * (Previously the steps were srcStride-8 and lumStride, which is only right
 * for tightly packed buffers; the new values are identical in that case.)
 * width is assumed to be a multiple of 4, matching the inner loop count.
 *
 * Each source line is read 8 bytes ahead of what has been converted, so 8
 * bytes beyond the last converted group of each line pair are loaded.
 */
468 DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
469                          long width, long height,
470                          long lumStride, long chromStride, long srcStride)):
471         link 0;
472         [--sp] = (r7:4,p5:4);
474         p0 = r1;       // Y top even
476         i2 = r2; // *u
477         r2 = [fp + ARG_vdst];
478         i3 = r2; // *v
480         r1 = [fp + ARG_srcStride];
481         r2 = r0 + r1;  // start of the second (odd) source line
            // m0 = 2*srcStride - 2*width - 8: from the end of this line pair to the
            // start of the next, allowing for the 2*width bytes the inner loop
            // consumed and the 8 bytes read ahead.
            r3 = [fp + ARG_width];
            r3 = r3 << 1;
            r1 = r1 << 1;
            r1 = r1 - r3;
482         r1 += -8;  // i0,i1 are read 8 bytes ahead; correct for it
483         m0 = r1;
485         i0 = r0;  // uyvy_T even
486         i1 = r2;  // uyvy_B odd
488         p2 = [fp + ARG_lumStride];
489         p1 = p0 + p2;  // Y bot odd
491         p5 = [fp + ARG_width];
            // p2 = 2*lumStride - width: from the end of a luma row to the start of
            // the row two lines further down.
            p2 = p2 << 1;
            p2 -= p5;
492         p4 = [fp + ARG_height];
493         r0 = p5;
494         p4 = p4 >> 1;  // outer count: height / 2
495         p5 = p5 >> 2;  // inner count: width / 4
            // m1 = chromStride - width/2: from the end of a chroma row to the next.
497         r2 = [fp + ARG_chromStride];
498         r0 = r0 >> 1;
499         r2 = r2 - r0;
500         m1 = r2;
502         /*   I0,I1 - src input line pointers
503          *   p0,p1 - luma output line pointers
504          *   I2    - dstU
505          *   I3    - dstV
506          */
508         lsetup (0f, 1f) lc1 = p4;   // H/2
            // Read the first 8 bytes of both lines and combine them for the chroma.
509 0:        r0 = [i0++] || r2 = [i1++];
510           r1 = [i0++] || r3 = [i1++];
511           r4 = byteop1p(r1:0, r3:2);
512           r5 = byteop1p(r1:0, r3:2) (r);
513           lsetup (2f, 3f) lc0 = p5; // W/4
              // Y sits in the upper byte of every 16-bit half: shift it down and pack.
514 2:          r0 = r0 >> 8(v);
515             r1 = r1 >> 8(v);
516             r2 = r2 >> 8(v);
517             r3 = r3 >> 8(v);
518             r0 = bytepack(r0, r1);
519             r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
520             r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
              // Gather U and V from r4/r5 while loading the next 8 bytes of each line.
521             r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
522             r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
523             r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
524 3:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv
            // Advance every pointer to the next line pair / chroma row.
526           i0 += m0;
527           i1 += m0;
528           i2 += m1;
529           i3 += m1;
530           p0 = p0 + p2;
531 1:        p1 = p1 + p2;
533         (r7:4,p5:4) = [sp++];
534         unlink;
535         rts;
536 DEFUN_END(uyvytoyv12)
/*
 * yuyvtoyv12: split packed YUYV into separate Y, U and V planes.
 *
 * src, ydst and udst arrive in r0, r1 and r2; vdst, width, height and the
 * three strides are read from the frame.  Two source lines are handled per
 * outer pass (height>>1 passes).  Each inner pass (width>>2 of them) reads
 * 8 bytes from both lines and stores 4 Y bytes per line plus 2 U and 2 V
 * bytes, the chroma being the byteop1p combination of the two lines.
 *
 * The end-of-pair pointer steps subtract what the inner loop already
 * consumed, so srcStride may exceed 2*width and lumStride may exceed width.
 * (Previously the steps were srcStride-8 and lumStride, which is only right
 * for tightly packed buffers; the new values are identical in that case.)
 * width is assumed to be a multiple of 4, matching the inner loop count.
 *
 * Each source line is read 8 bytes ahead of what has been converted, so 8
 * bytes beyond the last converted group of each line pair are loaded.
 */
538 DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
539                          long width, long height,
540                          long lumStride, long chromStride, long srcStride)):
541         link 0;
542         [--sp] = (r7:4,p5:4);
544         p0 = r1;       // Y top even
546         i2 = r2; // *u
547         r2 = [fp + ARG_vdst];
548         i3 = r2; // *v
550         r1 = [fp + ARG_srcStride];
551         r2 = r0 + r1;  // start of the second (odd) source line
            // m0 = 2*srcStride - 2*width - 8: from the end of this line pair to the
            // start of the next, allowing for the 2*width bytes the inner loop
            // consumed and the 8 bytes read ahead.
            r3 = [fp + ARG_width];
            r3 = r3 << 1;
            r1 = r1 << 1;
            r1 = r1 - r3;
552         r1 += -8;  // i0,i1 are read 8 bytes ahead; correct for it
553         m0 = r1;
555         i0 = r0;  // yuyv top (even) line
556         i1 = r2;  // yuyv bottom (odd) line
558         p2 = [fp + ARG_lumStride];
559         p1 = p0 + p2;  // Y bot odd
561         p5 = [fp + ARG_width];
            // p2 = 2*lumStride - width: from the end of a luma row to the start of
            // the row two lines further down.
            p2 = p2 << 1;
            p2 -= p5;
562         p4 = [fp + ARG_height];
563         r0 = p5;
564         p4 = p4 >> 1;  // outer count: height / 2
565         p5 = p5 >> 2;  // inner count: width / 4
            // m1 = chromStride - width/2: from the end of a chroma row to the next.
567         r2 = [fp + ARG_chromStride];
568         r0 = r0 >> 1;
569         r2 = r2 - r0;
570         m1 = r2;
572         /*   I0,I1 - src input line pointers
573          *   p0,p1 - luma output line pointers
574          *   I2    - dstU
575          *   I3    - dstV
576          */
578         lsetup (0f, 1f) lc1 = p4;   // H/2
            // Read the first 8 bytes of both lines; Y sits in the low byte of every
            // 16-bit half, so bytepack gathers the four Y values directly.
579 0:        r0 = [i0++] || r2 = [i1++];
580           r1 = [i0++] || r3 = [i1++];
581           r4 = bytepack(r0, r1);
582           r5 = bytepack(r2, r3);
583           lsetup (2f, 3f) lc0 = p5; // W/4
              // Store the packed Y words, then shift the chroma bytes down and combine
              // the two lines.
584 2:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
585             r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
586             r2 = r2 >> 8(v);
587             r3 = r3 >> 8(v);
588             r4 = byteop1p(r1:0, r3:2);
589             r5 = byteop1p(r1:0, r3:2) (r);
590             r6 = pack(r5.l, r4.l);
              // Gather U and V while loading the next 8 bytes of each line.
591             r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
592             r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
593             r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
594 3:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv
            // Advance every pointer to the next line pair / chroma row.
596           i0 += m0;
597           i1 += m0;
598           i2 += m1;
599           i3 += m1;
600           p0 = p0 + p2;
601 1:        p1 = p1 + p2;
603         (r7:4,p5:4) = [sp++];
604         unlink;
605         rts;
606 DEFUN_END(yuyvtoyv12)