demux: Reset demux stream 'eof' flag after packet buffer overflow
[mplayer.git] / libswscale / swscale_template.c
blob74f4c57285fa29dde6cb447e41be4425f51f78a1
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
/* CPU-capability instruction selection.
 * Undefines and then redefines the instruction-name macros used by the asm
 * templates below according to HAVE_3DNOW / HAVE_MMX2.  Fallbacks expand to
 * an assembler no-op comment (" # nop") or to a plain movq.
 * NOTE(review): the leading #undef run suggests this template is included
 * multiple times with different capability defines — confirm against the
 * including file.  Also note PAVGB has no fallback branch: it stays
 * undefined when neither HAVE_MMX2 nor HAVE_3DNOW is set. */
24 #undef REAL_MOVNTQ
25 #undef MOVNTQ
26 #undef PAVGB
27 #undef PREFETCH
28 #undef PREFETCHW
29 #undef EMMS
30 #undef SFENCE
32 #ifdef HAVE_3DNOW
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
34 #define EMMS "femms"
35 #else
36 #define EMMS "emms"
37 #endif
39 #ifdef HAVE_3DNOW
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
42 #elif defined (HAVE_MMX2)
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
45 #else
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
48 #endif
50 #ifdef HAVE_MMX2
51 #define SFENCE "sfence"
52 #else
53 #define SFENCE " # nop"
54 #endif
56 #ifdef HAVE_MMX2
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58 #elif defined (HAVE_3DNOW)
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60 #endif
62 #ifdef HAVE_MMX2
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
64 #else
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
66 #endif
67 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
69 #ifdef HAVE_ALTIVEC
70 #include "swscale_altivec_template.c"
71 #endif
/* Vertical scale to planar YV12 (fast path): walks the filter list at
 * 'offset'(%0) — pairs of {src pointer, coefficient}, 16 bytes apart,
 * terminated by a NULL pointer (the test/jnz) — multiply-accumulating
 * (pmulhw) 16-bit source samples into mm3/mm4, then >>3, packs to
 * unsigned bytes and stores 8 output pixels per iteration via MOVNTQ.
 * VROUNDER_OFFSET(%0) pre-loads the rounding constant.
 * NOTE(review): the closing ");" of this asm statement is on a line not
 * visible in this extract. */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
74 asm volatile(\
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
81 "1: \n\t"\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
92 " jnz 1b \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
103 "jb 1b \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
/* Higher-precision variant of YSCALEYUV2YV12X: processes the filter list
 * in APCK_SIZE-sized entries holding two source pointers and a coefficient
 * pair (APCK_PTR2 / APCK_COEF), interleaving samples with punpckl/hwd and
 * accumulating 32-bit dwords via pmaddwd into mm4..mm7; results are
 * >>16, packed back to words, rounded (VROUNDER_OFFSET), >>3, and stored
 * 8 bytes at a time with MOVNTQ.  The list is NULL-pointer terminated.
 * NOTE(review): the closing ");" of this asm statement is on a line not
 * visible in this extract. */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
110 asm volatile(\
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
118 ASMALIGN(4) \
119 "1: \n\t"\
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
143 " jnz 1b \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
165 "jb 1b \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
/* 1:1 vertical path (single source row, no filtering): load 16-bit
 * samples, >>7 to 8-bit range, pack and store 8 bytes per iteration.
 * %2 holds a negative start index; the loop ends when the add wraps
 * past zero (jnc). */
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
174 "1: \n\t"\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
182 "jnc 1b \n\t"
/* Same as YSCALEYUV2YV121 but with rounding: mm7 is built to hold the
 * constant 64 in each word (pcmpeqw/psrlw/psllw), added (saturating)
 * before the >>7 so the shift rounds to nearest instead of truncating. */
184 #define YSCALEYUV2YV121_ACCURATE \
185 "mov %2, %%"REG_a" \n\t"\
186 "pcmpeqw %%mm7, %%mm7 \n\t"\
187 "psrlw $15, %%mm7 \n\t"\
188 "psllw $6, %%mm7 \n\t"\
189 ASMALIGN(4) /* FIXME Unroll? */\
190 "1: \n\t"\
191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
193 "paddsw %%mm7, %%mm0 \n\t"\
194 "paddsw %%mm7, %%mm1 \n\t"\
195 "psraw $7, %%mm0 \n\t"\
196 "psraw $7, %%mm1 \n\t"\
197 "packuswb %%mm1, %%mm0 \n\t"\
198 MOVNTQ(%%mm0, (%1, %%REGa))\
199 "add $8, %%"REG_a" \n\t"\
200 "jnc 1b \n\t"
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* Head of the packed-output vertical scaler: opens the asm statement and
 * runs two NULL-terminated multiply-accumulate loops over the filter
 * lists at CHR_MMX_FILTER_OFFSET (chroma, results in mm3=U / mm4=V) and
 * LUM_MMX_FILTER_OFFSET (luma, results in mm1=Y1 / mm7=Y2).  The asm is
 * closed by a companion _END macro supplying operands and clobbers. */
209 #define YSCALEYUV2PACKEDX \
210 asm volatile(\
211 "xor %%"REG_a", %%"REG_a" \n\t"\
212 ASMALIGN(4)\
213 "nop \n\t"\
214 "1: \n\t"\
215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
218 "movq %%mm3, %%mm4 \n\t"\
219 ASMALIGN(4)\
220 "2: \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm3 \n\t"\
229 "paddw %%mm5, %%mm4 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
231 " jnz 2b \n\t"\
233 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
234 "mov (%%"REG_d"), %%"REG_S" \n\t"\
235 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
236 "movq %%mm1, %%mm7 \n\t"\
237 ASMALIGN(4)\
238 "2: \n\t"\
239 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
240 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
241 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
242 "add $16, %%"REG_d" \n\t"\
243 "mov (%%"REG_d"), %%"REG_S" \n\t"\
244 "pmulhw %%mm0, %%mm2 \n\t"\
245 "pmulhw %%mm0, %%mm5 \n\t"\
246 "paddw %%mm2, %%mm1 \n\t"\
247 "paddw %%mm5, %%mm7 \n\t"\
248 "test %%"REG_S", %%"REG_S" \n\t"\
249 " jnz 2b \n\t"\
/* Operand/clobber tail shared by the YSCALEYUV2PACKEDX users: closes the
 * asm statement opened by YSCALEYUV2PACKEDX(_ACCURATE).  NOTE(review):
 * the final ");" line is not visible in this extract. */
251 #define YSCALEYUV2PACKEDX_END \
252 :: "r" (&c->redDither), \
253 "m" (dummy), "m" (dummy), "m" (dummy),\
254 "r" (dest), "m" (dstW) \
255 : "%"REG_a, "%"REG_d, "%"REG_S \
/* Higher-precision head for the packed-output scaler: like
 * YSCALEYUV2PACKEDX but accumulates 32-bit dwords via pmaddwd over
 * APCK_SIZE-entry filter lists (APCK_PTR2/APCK_COEF), first for chroma
 * (results parked at U_TEMP/V_TEMP in the context), then for luma
 * (mm1=Y1, mm7=Y2); chroma is reloaded into mm3/mm4 at the end so the
 * register layout matches the non-accurate variant. */
258 #define YSCALEYUV2PACKEDX_ACCURATE \
259 asm volatile(\
260 "xor %%"REG_a", %%"REG_a" \n\t"\
261 ASMALIGN(4)\
262 "nop \n\t"\
263 "1: \n\t"\
264 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
265 "mov (%%"REG_d"), %%"REG_S" \n\t"\
266 "pxor %%mm4, %%mm4 \n\t"\
267 "pxor %%mm5, %%mm5 \n\t"\
268 "pxor %%mm6, %%mm6 \n\t"\
269 "pxor %%mm7, %%mm7 \n\t"\
270 ASMALIGN(4)\
271 "2: \n\t"\
272 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
273 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
274 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
275 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
276 "movq %%mm0, %%mm3 \n\t"\
277 "punpcklwd %%mm1, %%mm0 \n\t"\
278 "punpckhwd %%mm1, %%mm3 \n\t"\
279 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
280 "pmaddwd %%mm1, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm3 \n\t"\
282 "paddd %%mm0, %%mm4 \n\t"\
283 "paddd %%mm3, %%mm5 \n\t"\
284 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
285 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
286 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
287 "test %%"REG_S", %%"REG_S" \n\t"\
288 "movq %%mm2, %%mm0 \n\t"\
289 "punpcklwd %%mm3, %%mm2 \n\t"\
290 "punpckhwd %%mm3, %%mm0 \n\t"\
291 "pmaddwd %%mm1, %%mm2 \n\t"\
292 "pmaddwd %%mm1, %%mm0 \n\t"\
293 "paddd %%mm2, %%mm6 \n\t"\
294 "paddd %%mm0, %%mm7 \n\t"\
295 " jnz 2b \n\t"\
296 "psrad $16, %%mm4 \n\t"\
297 "psrad $16, %%mm5 \n\t"\
298 "psrad $16, %%mm6 \n\t"\
299 "psrad $16, %%mm7 \n\t"\
300 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
301 "packssdw %%mm5, %%mm4 \n\t"\
302 "packssdw %%mm7, %%mm6 \n\t"\
303 "paddw %%mm0, %%mm4 \n\t"\
304 "paddw %%mm0, %%mm6 \n\t"\
305 "movq %%mm4, "U_TEMP"(%0) \n\t"\
306 "movq %%mm6, "V_TEMP"(%0) \n\t"\
308 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
309 "mov (%%"REG_d"), %%"REG_S" \n\t"\
310 "pxor %%mm1, %%mm1 \n\t"\
311 "pxor %%mm5, %%mm5 \n\t"\
312 "pxor %%mm7, %%mm7 \n\t"\
313 "pxor %%mm6, %%mm6 \n\t"\
314 ASMALIGN(4)\
315 "2: \n\t"\
316 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
317 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
318 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
319 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
320 "movq %%mm0, %%mm3 \n\t"\
321 "punpcklwd %%mm4, %%mm0 \n\t"\
322 "punpckhwd %%mm4, %%mm3 \n\t"\
323 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
324 "pmaddwd %%mm4, %%mm0 \n\t"\
325 "pmaddwd %%mm4, %%mm3 \n\t"\
326 "paddd %%mm0, %%mm1 \n\t"\
327 "paddd %%mm3, %%mm5 \n\t"\
328 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
329 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
330 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
331 "test %%"REG_S", %%"REG_S" \n\t"\
332 "movq %%mm2, %%mm0 \n\t"\
333 "punpcklwd %%mm3, %%mm2 \n\t"\
334 "punpckhwd %%mm3, %%mm0 \n\t"\
335 "pmaddwd %%mm4, %%mm2 \n\t"\
336 "pmaddwd %%mm4, %%mm0 \n\t"\
337 "paddd %%mm2, %%mm7 \n\t"\
338 "paddd %%mm0, %%mm6 \n\t"\
339 " jnz 2b \n\t"\
340 "psrad $16, %%mm1 \n\t"\
341 "psrad $16, %%mm5 \n\t"\
342 "psrad $16, %%mm7 \n\t"\
343 "psrad $16, %%mm6 \n\t"\
344 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
345 "packssdw %%mm5, %%mm1 \n\t"\
346 "packssdw %%mm6, %%mm7 \n\t"\
347 "paddw %%mm0, %%mm1 \n\t"\
348 "paddw %%mm0, %%mm7 \n\t"\
349 "movq "U_TEMP"(%0), %%mm3 \n\t"\
350 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* YUV->RGB conversion stage for the packed-X scalers: expects mm1=Y1,
 * mm7=Y2, mm3=U, mm4=V (as left by YSCALEYUV2PACKEDX*) and the
 * coefficient/offset table reachable from (%0).  Produces byte-packed
 * B in mm2, G in mm4, R in mm5 (two 4-pixel halves: B1/B2, G1/G2,
 * R1/R2 per the inline comments) with mm7 zeroed for the WRITE* stage. */
352 #define YSCALEYUV2RGBX \
353 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
354 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
355 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
356 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
357 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
358 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
359 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
360 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
361 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
362 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
363 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
364 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
365 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
366 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
367 "paddw %%mm3, %%mm4 \n\t"\
368 "movq %%mm2, %%mm0 \n\t"\
369 "movq %%mm5, %%mm6 \n\t"\
370 "movq %%mm4, %%mm3 \n\t"\
371 "punpcklwd %%mm2, %%mm2 \n\t"\
372 "punpcklwd %%mm5, %%mm5 \n\t"\
373 "punpcklwd %%mm4, %%mm4 \n\t"\
374 "paddw %%mm1, %%mm2 \n\t"\
375 "paddw %%mm1, %%mm5 \n\t"\
376 "paddw %%mm1, %%mm4 \n\t"\
377 "punpckhwd %%mm0, %%mm0 \n\t"\
378 "punpckhwd %%mm6, %%mm6 \n\t"\
379 "punpckhwd %%mm3, %%mm3 \n\t"\
380 "paddw %%mm7, %%mm0 \n\t"\
381 "paddw %%mm7, %%mm6 \n\t"\
382 "paddw %%mm7, %%mm3 \n\t"\
383 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
384 "packuswb %%mm0, %%mm2 \n\t"\
385 "packuswb %%mm6, %%mm5 \n\t"\
386 "packuswb %%mm3, %%mm4 \n\t"\
387 "pxor %%mm7, %%mm7 \n\t"
/* Dead code: FULL_YSCALEYUV2RGB is compiled out with #if 0 and nothing
 * in this extract references it.  Kept as-is (uses MANGLE()d global
 * constants instead of the per-context tables used by the live macros). */
388 #if 0
389 #define FULL_YSCALEYUV2RGB \
390 "pxor %%mm7, %%mm7 \n\t"\
391 "movd %6, %%mm6 \n\t" /*yalpha1*/\
392 "punpcklwd %%mm6, %%mm6 \n\t"\
393 "punpcklwd %%mm6, %%mm6 \n\t"\
394 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
395 "punpcklwd %%mm5, %%mm5 \n\t"\
396 "punpcklwd %%mm5, %%mm5 \n\t"\
397 "xor %%"REG_a", %%"REG_a" \n\t"\
398 ASMALIGN(4)\
399 "1: \n\t"\
400 "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
401 "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
402 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
403 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
404 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
405 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
406 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
408 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
409 "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
410 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
411 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
412 "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
413 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
414 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
415 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
416 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
417 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
420 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
421 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
422 "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
423 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
424 "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
425 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
426 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
429 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
430 "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
431 "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
432 "paddw %%mm1, %%mm3 \n\t" /* B*/\
433 "paddw %%mm1, %%mm0 \n\t" /* R*/\
434 "packuswb %%mm3, %%mm3 \n\t"\
436 "packuswb %%mm0, %%mm0 \n\t"\
437 "paddw %%mm4, %%mm2 \n\t"\
438 "paddw %%mm2, %%mm1 \n\t" /* G*/\
440 "packuswb %%mm1, %%mm1 \n\t"
441 #endif
/* 2-row packed-YUV path: pre-shifts the stored chroma/luma filter
 * coefficients >>3 (written back into the context), then per iteration
 * bilinearly interpolates between uvbuf0/uvbuf1 (chroma, >>7) and
 * buf0/buf1 (luma, >>7) leaving mm3/mm4 = U/V and mm1/mm7 = Y1/Y2. */
443 #define REAL_YSCALEYUV2PACKED(index, c) \
444 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
445 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
446 "psraw $3, %%mm0 \n\t"\
447 "psraw $3, %%mm1 \n\t"\
448 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
449 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
450 "xor "#index", "#index" \n\t"\
451 ASMALIGN(4)\
452 "1: \n\t"\
453 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
454 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
455 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
456 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
457 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
458 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
459 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
460 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
461 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
462 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
463 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
464 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
465 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
466 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
467 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
468 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
469 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
470 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
471 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
472 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
473 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
474 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
475 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
476 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
477 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
479 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* 2-row YUV->RGB path: vertically interpolates luma and chroma between
 * the two input rows (pmulhw by the stored filter coefficients, >>4),
 * then applies the per-context offset/coefficient tables to produce
 * byte-packed B/G/R in mm2/mm4/mm5 (two 4-pixel halves), mm7 zeroed. */
481 #define REAL_YSCALEYUV2RGB(index, c) \
482 "xor "#index", "#index" \n\t"\
483 ASMALIGN(4)\
484 "1: \n\t"\
485 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
486 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
487 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
488 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
489 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
490 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
491 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
492 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
493 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
494 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
495 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
496 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
497 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
498 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
499 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
500 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
501 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
502 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
503 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
504 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
505 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
506 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
507 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
508 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
509 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
510 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
511 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
512 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
513 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
514 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
515 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
516 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
517 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
518 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
519 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
520 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
521 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
522 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
523 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
524 "paddw %%mm3, %%mm4 \n\t"\
525 "movq %%mm2, %%mm0 \n\t"\
526 "movq %%mm5, %%mm6 \n\t"\
527 "movq %%mm4, %%mm3 \n\t"\
528 "punpcklwd %%mm2, %%mm2 \n\t"\
529 "punpcklwd %%mm5, %%mm5 \n\t"\
530 "punpcklwd %%mm4, %%mm4 \n\t"\
531 "paddw %%mm1, %%mm2 \n\t"\
532 "paddw %%mm1, %%mm5 \n\t"\
533 "paddw %%mm1, %%mm4 \n\t"\
534 "punpckhwd %%mm0, %%mm0 \n\t"\
535 "punpckhwd %%mm6, %%mm6 \n\t"\
536 "punpckhwd %%mm3, %%mm3 \n\t"\
537 "paddw %%mm7, %%mm0 \n\t"\
538 "paddw %%mm7, %%mm6 \n\t"\
539 "paddw %%mm7, %%mm3 \n\t"\
540 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
541 "packuswb %%mm0, %%mm2 \n\t"\
542 "packuswb %%mm6, %%mm5 \n\t"\
543 "packuswb %%mm3, %%mm4 \n\t"\
544 "pxor %%mm7, %%mm7 \n\t"
545 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/* 1-row packed-YUV path (no vertical interpolation): loads one chroma
 * and one luma row, >>7 each, leaving mm3/mm4 = U/V and mm1/mm7 = Y1/Y2. */
547 #define REAL_YSCALEYUV2PACKED1(index, c) \
548 "xor "#index", "#index" \n\t"\
549 ASMALIGN(4)\
550 "1: \n\t"\
551 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
552 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
553 "psraw $7, %%mm3 \n\t" \
554 "psraw $7, %%mm4 \n\t" \
555 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
556 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
557 "psraw $7, %%mm1 \n\t" \
558 "psraw $7, %%mm7 \n\t" \
560 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* 1-row YUV->RGB path (no vertical interpolation): single luma/chroma
 * row >>4, then the same offset/coefficient conversion as
 * REAL_YSCALEYUV2RGB, producing byte-packed B/G/R in mm2/mm4/mm5. */
562 #define REAL_YSCALEYUV2RGB1(index, c) \
563 "xor "#index", "#index" \n\t"\
564 ASMALIGN(4)\
565 "1: \n\t"\
566 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
567 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
568 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
569 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
570 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
571 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
572 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
573 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
574 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
575 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
576 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
577 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
578 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
579 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
580 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
581 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
582 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
583 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
584 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
585 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
586 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
587 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
588 "paddw %%mm3, %%mm4 \n\t"\
589 "movq %%mm2, %%mm0 \n\t"\
590 "movq %%mm5, %%mm6 \n\t"\
591 "movq %%mm4, %%mm3 \n\t"\
592 "punpcklwd %%mm2, %%mm2 \n\t"\
593 "punpcklwd %%mm5, %%mm5 \n\t"\
594 "punpcklwd %%mm4, %%mm4 \n\t"\
595 "paddw %%mm1, %%mm2 \n\t"\
596 "paddw %%mm1, %%mm5 \n\t"\
597 "paddw %%mm1, %%mm4 \n\t"\
598 "punpckhwd %%mm0, %%mm0 \n\t"\
599 "punpckhwd %%mm6, %%mm6 \n\t"\
600 "punpckhwd %%mm3, %%mm3 \n\t"\
601 "paddw %%mm7, %%mm0 \n\t"\
602 "paddw %%mm7, %%mm6 \n\t"\
603 "paddw %%mm7, %%mm3 \n\t"\
604 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
605 "packuswb %%mm0, %%mm2 \n\t"\
606 "packuswb %%mm6, %%mm5 \n\t"\
607 "packuswb %%mm3, %%mm4 \n\t"\
608 "pxor %%mm7, %%mm7 \n\t"
609 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/* Packed-YUV path combining both chroma rows: adds uvbuf0+uvbuf1 and
 * scales with psrlw $8 (chroma midway between the two rows), luma from
 * a single row >>7.  Leaves mm3/mm4 = U/V and mm1/mm7 = Y1/Y2. */
611 #define REAL_YSCALEYUV2PACKED1b(index, c) \
612 "xor "#index", "#index" \n\t"\
613 ASMALIGN(4)\
614 "1: \n\t"\
615 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
616 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
617 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
618 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
619 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
620 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
621 "psrlw $8, %%mm3 \n\t" \
622 "psrlw $8, %%mm4 \n\t" \
623 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
624 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
625 "psraw $7, %%mm1 \n\t" \
626 "psraw $7, %%mm7 \n\t"
627 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
629 // do vertical chrominance interpolation
/* YUV->RGB path averaging the two chroma rows (paddw then psrlw $5 —
 * flagged below as possibly overflowing) with single-row luma >>4,
 * followed by the standard conversion: byte-packed B/G/R in
 * mm2/mm4/mm5, mm7 zeroed for the WRITE* stage. */
630 #define REAL_YSCALEYUV2RGB1b(index, c) \
631 "xor "#index", "#index" \n\t"\
632 ASMALIGN(4)\
633 "1: \n\t"\
634 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
635 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
636 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
637 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
638 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
639 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
640 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
641 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
642 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
643 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
644 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
645 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
646 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
647 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
648 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
649 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
650 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
651 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
652 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
653 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
654 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
655 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
656 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
657 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
658 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
659 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
660 "paddw %%mm3, %%mm4 \n\t"\
661 "movq %%mm2, %%mm0 \n\t"\
662 "movq %%mm5, %%mm6 \n\t"\
663 "movq %%mm4, %%mm3 \n\t"\
664 "punpcklwd %%mm2, %%mm2 \n\t"\
665 "punpcklwd %%mm5, %%mm5 \n\t"\
666 "punpcklwd %%mm4, %%mm4 \n\t"\
667 "paddw %%mm1, %%mm2 \n\t"\
668 "paddw %%mm1, %%mm5 \n\t"\
669 "paddw %%mm1, %%mm4 \n\t"\
670 "punpckhwd %%mm0, %%mm0 \n\t"\
671 "punpckhwd %%mm6, %%mm6 \n\t"\
672 "punpckhwd %%mm3, %%mm3 \n\t"\
673 "paddw %%mm7, %%mm0 \n\t"\
674 "paddw %%mm7, %%mm6 \n\t"\
675 "paddw %%mm7, %%mm3 \n\t"\
676 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
677 "packuswb %%mm0, %%mm2 \n\t"\
678 "packuswb %%mm6, %%mm5 \n\t"\
679 "packuswb %%mm3, %%mm4 \n\t"\
680 "pxor %%mm7, %%mm7 \n\t"
681 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* Output stage: interleaves the byte-packed B (mm2), G (mm4), R (mm5)
 * with the zero register mm7 into 0RGB dwords and stores 8 pixels
 * (32 bytes) per iteration, then advances 'index' and loops to label 1
 * while index < dstw. */
683 #define REAL_WRITEBGR32(dst, dstw, index) \
684 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
685 "movq %%mm2, %%mm1 \n\t" /* B */\
686 "movq %%mm5, %%mm6 \n\t" /* R */\
687 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
688 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
689 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
690 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
691 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
692 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
693 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
694 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
695 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
696 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
698 MOVNTQ(%%mm0, (dst, index, 4))\
699 MOVNTQ(%%mm2, 8(dst, index, 4))\
700 MOVNTQ(%%mm1, 16(dst, index, 4))\
701 MOVNTQ(%%mm3, 24(dst, index, 4))\
703 "add $8, "#index" \n\t"\
704 "cmp "#dstw", "#index" \n\t"\
705 " jb 1b \n\t"
706 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/* Output stage for 16bpp 5-6-5: masks B/R to 5 bits (bF8) and G to
 * 6 bits (bFC), shifts the components into their fields and ORs them
 * together, storing 8 pixels (16 bytes) per iteration. */
708 #define REAL_WRITERGB16(dst, dstw, index) \
709 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
710 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
711 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
712 "psrlq $3, %%mm2 \n\t"\
714 "movq %%mm2, %%mm1 \n\t"\
715 "movq %%mm4, %%mm3 \n\t"\
717 "punpcklbw %%mm7, %%mm3 \n\t"\
718 "punpcklbw %%mm5, %%mm2 \n\t"\
719 "punpckhbw %%mm7, %%mm4 \n\t"\
720 "punpckhbw %%mm5, %%mm1 \n\t"\
722 "psllq $3, %%mm3 \n\t"\
723 "psllq $3, %%mm4 \n\t"\
725 "por %%mm3, %%mm2 \n\t"\
726 "por %%mm4, %%mm1 \n\t"\
728 MOVNTQ(%%mm2, (dst, index, 2))\
729 MOVNTQ(%%mm1, 8(dst, index, 2))\
731 "add $8, "#index" \n\t"\
732 "cmp "#dstw", "#index" \n\t"\
733 " jb 1b \n\t"
734 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/* Output stage for 15bpp 5-5-5: like WRITERGB16 but all three
 * components are masked to 5 bits (bF8) with the extra >>1 on R and
 * <<2 on G to land in the 5-5-5 field layout. */
736 #define REAL_WRITERGB15(dst, dstw, index) \
737 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
738 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
739 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
740 "psrlq $3, %%mm2 \n\t"\
741 "psrlq $1, %%mm5 \n\t"\
743 "movq %%mm2, %%mm1 \n\t"\
744 "movq %%mm4, %%mm3 \n\t"\
746 "punpcklbw %%mm7, %%mm3 \n\t"\
747 "punpcklbw %%mm5, %%mm2 \n\t"\
748 "punpckhbw %%mm7, %%mm4 \n\t"\
749 "punpckhbw %%mm5, %%mm1 \n\t"\
751 "psllq $2, %%mm3 \n\t"\
752 "psllq $2, %%mm4 \n\t"\
754 "por %%mm3, %%mm2 \n\t"\
755 "por %%mm4, %%mm1 \n\t"\
757 MOVNTQ(%%mm2, (dst, index, 2))\
758 MOVNTQ(%%mm1, 8(dst, index, 2))\
760 "add $8, "#index" \n\t"\
761 "cmp "#dstw", "#index" \n\t"\
762 " jb 1b \n\t"
763 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/* Legacy 24bpp output stage: expands B/G/R into 0RGB dwords, then
 * squeezes out the padding bytes with shift/mask/or sequences
 * (bm0000* constants) to emit three packed 8-byte groups (24 bytes =
 * 8 pixels) per iteration.  Superseded by WRITEBGR24MMX/MMX2 below. */
765 #define WRITEBGR24OLD(dst, dstw, index) \
766 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
767 "movq %%mm2, %%mm1 \n\t" /* B */\
768 "movq %%mm5, %%mm6 \n\t" /* R */\
769 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
770 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
771 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
772 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
773 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
774 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
775 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
776 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
777 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
778 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
780 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
781 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
782 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
783 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
784 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
785 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
786 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
787 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
789 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
790 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
791 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
792 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
793 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
794 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
795 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
796 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
797 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
798 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
799 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
800 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
801 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
803 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
804 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
805 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
806 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
807 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
808 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
809 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
810 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
812 MOVNTQ(%%mm0, (dst))\
813 MOVNTQ(%%mm2, 8(dst))\
814 MOVNTQ(%%mm3, 16(dst))\
815 "add $24, "#dst" \n\t"\
817 "add $8, "#index" \n\t"\
818 "cmp "#dstw", "#index" \n\t"\
819 " jb 1b \n\t"
/* MMX 24bpp output stage: expands B/G/R into 0RGB dwords, compacts each
 * to 0RGBRGB0 via psllq/punpckhdq, then shifts/ORs neighbouring pixels
 * together to emit three contiguous 8-byte stores (24 bytes = 8 pixels)
 * per iteration; advances both dst and index. */
821 #define WRITEBGR24MMX(dst, dstw, index) \
822 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
823 "movq %%mm2, %%mm1 \n\t" /* B */\
824 "movq %%mm5, %%mm6 \n\t" /* R */\
825 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
826 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
827 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
828 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
829 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
830 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
831 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
832 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
833 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
834 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
836 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
837 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
838 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
839 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
841 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
842 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
843 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
844 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
846 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
847 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
848 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
849 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
851 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
852 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
853 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
854 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
855 MOVNTQ(%%mm0, (dst))\
857 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
858 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
859 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
860 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
861 MOVNTQ(%%mm6, 8(dst))\
863 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
864 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
865 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
866 MOVNTQ(%%mm5, 16(dst))\
868 "add $24, "#dst" \n\t"\
870 "add $8, "#index" \n\t"\
871 "cmp "#dstw", "#index" \n\t"\
872 " jb 1b \n\t"
/* Same contract as WRITEBGR24MMX (mm2=B, mm4=G, mm5=R, mm7=0 -> 24 bytes of
 * BGR24 at dst, dst += 24, index += 8, loop to label 1 while index < dstw),
 * but uses the MMX2 'pshufw' shuffle plus the ff_M24A/B/C byte masks to
 * build the three output quadwords directly — fewer instructions than the
 * plain-MMX version. Clobbers mm0-mm7; mm4 (G) is shifted in place. */
874 #define WRITEBGR24MMX2(dst, dstw, index) \
875 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
876 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
877 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
878 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
879 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
880 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
882 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
883 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
884 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
886 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
887 "por %%mm1, %%mm6 \n\t"\
888 "por %%mm3, %%mm6 \n\t"\
889 MOVNTQ(%%mm6, (dst))\
891 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
892 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
893 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
894 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
896 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
897 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
898 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
900 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
901 "por %%mm3, %%mm6 \n\t"\
902 MOVNTQ(%%mm6, 8(dst))\
904 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
905 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
906 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
908 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
909 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
910 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
912 "por %%mm1, %%mm3 \n\t"\
913 "por %%mm3, %%mm6 \n\t"\
914 MOVNTQ(%%mm6, 16(dst))\
916 "add $24, "#dst" \n\t"\
918 "add $8, "#index" \n\t"\
919 "cmp "#dstw", "#index" \n\t"\
920 " jb 1b \n\t"
/* Select the BGR24 writer for this template instantiation: the pshufw-based
 * variant when MMX2 is available, otherwise the plain-MMX one. */
922 #ifdef HAVE_MMX2
923 #undef WRITEBGR24
924 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
925 #else
926 #undef WRITEBGR24
927 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
928 #endif
/* Pack mm3=Y(even?), mm4=U?, mm1/mm7=Y/V words into interleaved YUYV bytes
 * and store 16 bytes at dst + index*2; index += 8, loop to label 1 while
 * index < dstw.  NOTE(review): exact per-register Y/U/V assignment is set up
 * by the YSCALEYUV2PACKED* callers — confirm against those macros. */
930 #define REAL_WRITEYUY2(dst, dstw, index) \
931 "packuswb %%mm3, %%mm3 \n\t"\
932 "packuswb %%mm4, %%mm4 \n\t"\
933 "packuswb %%mm7, %%mm1 \n\t"\
934 "punpcklbw %%mm4, %%mm3 \n\t"\
935 "movq %%mm1, %%mm7 \n\t"\
936 "punpcklbw %%mm3, %%mm1 \n\t"\
937 "punpckhbw %%mm3, %%mm7 \n\t"\
939 MOVNTQ(%%mm1, (dst, index, 2))\
940 MOVNTQ(%%mm7, 8(dst, index, 2))\
942 "add $8, "#index" \n\t"\
943 "cmp "#dstw", "#index" \n\t"\
944 " jb 1b \n\t"
/* Indirection so that macro arguments are expanded before pasting. */
945 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* Vertical scaler, planar YUV output: apply the luma filter (lumFilter x
 * lumSrc, lumFilterSize taps) to produce dest[0..dstW) and the chroma filter
 * to produce uDest/vDest[0..chrDstW).  uDest==NULL skips chroma.  Uses the
 * YSCALEYUV2YV12X MMX kernels (accurate-rounding variant under
 * SWS_ACCURATE_RND) unless SWS_BITEXACT forces the C/AltiVec path. */
948 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
949 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
950 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
952 #ifdef HAVE_MMX
953 if(!(c->flags & SWS_BITEXACT)){
954 if (c->flags & SWS_ACCURATE_RND){
955 if (uDest){
956 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
957 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
960 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
961 }else{
962 if (uDest){
963 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
964 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
967 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
969 return;
971 #endif
972 #ifdef HAVE_ALTIVEC
973 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
974 chrFilter, chrSrc, chrFilterSize,
975 dest, uDest, vDest, dstW, chrDstW);
976 #else //HAVE_ALTIVEC
/* Portable C fallback. */
977 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
978 chrFilter, chrSrc, chrFilterSize,
979 dest, uDest, vDest, dstW, chrDstW);
980 #endif //!HAVE_ALTIVEC
/* Vertical scaler, NV12/NV21 output: no SIMD variant here, always delegates
 * to the generic C implementation (interleaved-chroma layout chosen by
 * dstFormat). */
983 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
984 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
985 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
987 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
988 chrFilter, chrSrc, chrFilterSize,
989 dest, uDest, dstW, chrDstW, dstFormat);
/* 1-tap (unfiltered) vertical pass, planar YUV output: convert the 16-bit
 * intermediate lumSrc/chrSrc rows back to 8-bit with rounding ((x+64)>>7)
 * and clipping to [0,255].  uDest==NULL means luma only (p==1 below).  The
 * MMX path processes the three planes via YSCALEYUV2YV121* kernels unless
 * SWS_BITEXACT is set; the C fallback follows. */
992 static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
993 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
995 int i;
996 #ifdef HAVE_MMX
997 if(!(c->flags & SWS_BITEXACT)){
998 long p= uDest ? 3 : 1;
999 uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
1000 uint8_t *dst[3]= {dest, uDest, vDest};
1001 long counter[3] = {dstW, chrDstW, chrDstW};
1003 if (c->flags & SWS_ACCURATE_RND){
1004 while(p--){
1005 asm volatile(
1006 YSCALEYUV2YV121_ACCURATE
1007 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1008 "g" (-counter[p])
1009 : "%"REG_a
1012 }else{
1013 while(p--){
1014 asm volatile(
1015 YSCALEYUV2YV121
1016 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1017 "g" (-counter[p])
1018 : "%"REG_a
1022 return;
1024 #endif
/* C fallback: round, then clip via the sign/overflow bit (val&256). */
1025 for (i=0; i<dstW; i++)
1027 int val= (lumSrc[i]+64)>>7;
1029 if (val&256){
1030 if (val<0) val=0;
1031 else val=255;
1034 dest[i]= val;
1037 if (uDest)
1038 for (i=0; i<chrDstW; i++)
1040 int u=(chrSrc[i ]+64)>>7;
1041 int v=(chrSrc[i + VOFW]+64)>>7;
1043 if ((u|v)&256){
1044 if (u<0) u=0;
1045 else if (u>255) u=255;
1046 if (v<0) v=0;
1047 else if (v>255) v=255;
1050 uDest[i]= u;
1051 vDest[i]= v;
1057 * vertical scale YV12 to RGB
/* Vertical scaler with packed output: filter lumSrc/chrSrc vertically and
 * convert to the packed c->dstFormat in one pass.  MMX kernels cover RGB32,
 * BGR24, RGB555, RGB565 and YUYV422 (accurate-rounding variants under
 * SWS_ACCURATE_RND; optional ordered dither for 15/16 bpp under
 * DITHER1XBPP); anything else falls through to AltiVec (for the formats it
 * supports) or the generic C path. */
1059 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1060 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1061 uint8_t *dest, long dstW, long dstY)
1063 #ifdef HAVE_MMX
1064 long dummy=0;
1065 if(!(c->flags & SWS_BITEXACT)){
1066 if (c->flags & SWS_ACCURATE_RND){
1067 switch(c->dstFormat){
1068 case PIX_FMT_RGB32:
1069 YSCALEYUV2PACKEDX_ACCURATE
1070 YSCALEYUV2RGBX
1071 WRITEBGR32(%4, %5, %%REGa)
1073 YSCALEYUV2PACKEDX_END
1074 return;
1075 case PIX_FMT_BGR24:
1076 YSCALEYUV2PACKEDX_ACCURATE
1077 YSCALEYUV2RGBX
1078 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1079 "add %4, %%"REG_c" \n\t"
1080 WRITEBGR24(%%REGc, %5, %%REGa)
1083 :: "r" (&c->redDither),
1084 "m" (dummy), "m" (dummy), "m" (dummy),
1085 "r" (dest), "m" (dstW)
1086 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1088 return;
1089 case PIX_FMT_RGB555:
1090 YSCALEYUV2PACKEDX_ACCURATE
1091 YSCALEYUV2RGBX
1092 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1093 #ifdef DITHER1XBPP
1094 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1095 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1096 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1097 #endif
1099 WRITERGB15(%4, %5, %%REGa)
1100 YSCALEYUV2PACKEDX_END
1101 return;
1102 case PIX_FMT_RGB565:
1103 YSCALEYUV2PACKEDX_ACCURATE
1104 YSCALEYUV2RGBX
1105 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1106 #ifdef DITHER1XBPP
1107 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1108 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1109 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1110 #endif
1112 WRITERGB16(%4, %5, %%REGa)
1113 YSCALEYUV2PACKEDX_END
1114 return;
1115 case PIX_FMT_YUYV422:
1116 YSCALEYUV2PACKEDX_ACCURATE
1117 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1119 "psraw $3, %%mm3 \n\t"
1120 "psraw $3, %%mm4 \n\t"
1121 "psraw $3, %%mm1 \n\t"
1122 "psraw $3, %%mm7 \n\t"
1123 WRITEYUY2(%4, %5, %%REGa)
1124 YSCALEYUV2PACKEDX_END
1125 return;
1127 }else{
/* Fast-rounding variants of the same per-format kernels. */
1128 switch(c->dstFormat)
1130 case PIX_FMT_RGB32:
1131 YSCALEYUV2PACKEDX
1132 YSCALEYUV2RGBX
1133 WRITEBGR32(%4, %5, %%REGa)
1134 YSCALEYUV2PACKEDX_END
1135 return;
1136 case PIX_FMT_BGR24:
1137 YSCALEYUV2PACKEDX
1138 YSCALEYUV2RGBX
1139 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1140 "add %4, %%"REG_c" \n\t"
1141 WRITEBGR24(%%REGc, %5, %%REGa)
1143 :: "r" (&c->redDither),
1144 "m" (dummy), "m" (dummy), "m" (dummy),
1145 "r" (dest), "m" (dstW)
1146 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1148 return;
1149 case PIX_FMT_RGB555:
1150 YSCALEYUV2PACKEDX
1151 YSCALEYUV2RGBX
1152 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1153 #ifdef DITHER1XBPP
1154 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1155 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1156 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1157 #endif
1159 WRITERGB15(%4, %5, %%REGa)
1160 YSCALEYUV2PACKEDX_END
1161 return;
1162 case PIX_FMT_RGB565:
1163 YSCALEYUV2PACKEDX
1164 YSCALEYUV2RGBX
1165 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1166 #ifdef DITHER1XBPP
1167 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1168 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1169 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1170 #endif
1172 WRITERGB16(%4, %5, %%REGa)
1173 YSCALEYUV2PACKEDX_END
1174 return;
1175 case PIX_FMT_YUYV422:
1176 YSCALEYUV2PACKEDX
1177 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1179 "psraw $3, %%mm3 \n\t"
1180 "psraw $3, %%mm4 \n\t"
1181 "psraw $3, %%mm1 \n\t"
1182 "psraw $3, %%mm7 \n\t"
1183 WRITEYUY2(%4, %5, %%REGa)
1184 YSCALEYUV2PACKEDX_END
1185 return;
1189 #endif /* HAVE_MMX */
1190 #ifdef HAVE_ALTIVEC
1191 /* The following list of supported dstFormat values should
1192 match what's found in the body of altivec_yuv2packedX() */
1193 if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1194 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1195 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1196 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1197 chrFilter, chrSrc, chrFilterSize,
1198 dest, dstW, dstY);
1199 else
1200 #endif
1201 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1202 chrFilter, chrSrc, chrFilterSize,
1203 dest, dstW, dstY);
1207 * vertical bilinear scale YV12 to RGB
/* 2-tap vertical pass with packed output: blend rows buf0/buf1 (luma weight
 * yalpha, 0..4095) and uvbuf0/uvbuf1 (weight uvalpha), then convert to
 * c->dstFormat.  The large '#if 0' region below is a dead full-chroma-
 * interpolation path kept for reference; the live MMX path handles RGB32,
 * BGR24, RGB555/565 (with optional dither) and YUYV422, saving/restoring
 * REG_b/REG_BP around the kernels because the kernels use all GP registers.
 * Everything else goes through YSCALE_YUV_2_ANYRGB_C. */
1209 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1210 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1212 int yalpha1=4095- yalpha;
1213 int uvalpha1=4095-uvalpha;
1214 int i;
1216 #if 0 //isn't used
1217 if (flags&SWS_FULL_CHR_H_INT)
1219 switch(dstFormat)
1221 #ifdef HAVE_MMX
1222 case PIX_FMT_RGB32:
1223 asm volatile(
1226 FULL_YSCALEYUV2RGB
1227 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1228 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1230 "movq %%mm3, %%mm1 \n\t"
1231 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1232 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1234 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1235 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1237 "add $4, %%"REG_a" \n\t"
1238 "cmp %5, %%"REG_a" \n\t"
1239 " jb 1b \n\t"
1241 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1242 "m" (yalpha1), "m" (uvalpha1)
1243 : "%"REG_a
1245 break;
1246 case PIX_FMT_BGR24:
1247 asm volatile(
1249 FULL_YSCALEYUV2RGB
1251 // lsb ... msb
1252 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1253 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1255 "movq %%mm3, %%mm1 \n\t"
1256 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1257 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1259 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1260 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1261 "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
1262 "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
1263 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1264 "movq %%mm1, %%mm2 \n\t"
1265 "psllq $48, %%mm1 \n\t" // 000000BG
1266 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1268 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1269 "psrld $16, %%mm2 \n\t" // R000R000
1270 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1271 "por %%mm2, %%mm1 \n\t" // RBGRR000
1273 "mov %4, %%"REG_b" \n\t"
1274 "add %%"REG_a", %%"REG_b" \n\t"
1276 #ifdef HAVE_MMX2
1277 //FIXME Alignment
1278 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1279 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1280 #else
1281 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1282 "psrlq $32, %%mm3 \n\t"
1283 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1284 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1285 #endif
1286 "add $4, %%"REG_a" \n\t"
1287 "cmp %5, %%"REG_a" \n\t"
1288 " jb 1b \n\t"
1290 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1291 "m" (yalpha1), "m" (uvalpha1)
1292 : "%"REG_a, "%"REG_b
1294 break;
1295 case PIX_FMT_BGR555:
1296 asm volatile(
1298 FULL_YSCALEYUV2RGB
1299 #ifdef DITHER1XBPP
1300 "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
1301 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1302 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1303 #endif
1304 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1305 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1306 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1308 "psrlw $3, %%mm3 \n\t"
1309 "psllw $2, %%mm1 \n\t"
1310 "psllw $7, %%mm0 \n\t"
1311 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1312 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1314 "por %%mm3, %%mm1 \n\t"
1315 "por %%mm1, %%mm0 \n\t"
1317 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1319 "add $4, %%"REG_a" \n\t"
1320 "cmp %5, %%"REG_a" \n\t"
1321 " jb 1b \n\t"
1323 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1324 "m" (yalpha1), "m" (uvalpha1)
1325 : "%"REG_a
1327 break;
1328 case PIX_FMT_BGR565:
1329 asm volatile(
1331 FULL_YSCALEYUV2RGB
1332 #ifdef DITHER1XBPP
1333 "paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
1334 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1335 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1336 #endif
1337 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1338 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1339 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1341 "psrlw $3, %%mm3 \n\t"
1342 "psllw $3, %%mm1 \n\t"
1343 "psllw $8, %%mm0 \n\t"
1344 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1345 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1347 "por %%mm3, %%mm1 \n\t"
1348 "por %%mm1, %%mm0 \n\t"
1350 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1352 "add $4, %%"REG_a" \n\t"
1353 "cmp %5, %%"REG_a" \n\t"
1354 " jb 1b \n\t"
1356 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1357 "m" (yalpha1), "m" (uvalpha1)
1358 : "%"REG_a
1360 break;
1361 #endif /* HAVE_MMX */
1362 case PIX_FMT_BGR32:
1363 #ifndef HAVE_MMX
1364 case PIX_FMT_RGB32:
1365 #endif
1366 if (dstFormat==PIX_FMT_RGB32)
1368 int i;
1369 #ifdef WORDS_BIGENDIAN
1370 dest++;
1371 #endif
1372 for (i=0;i<dstW;i++){
1373 // vertical linear interpolation && yuv2rgb in a single step:
1374 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1375 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1376 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1377 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1378 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1379 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1380 dest+= 4;
1383 else if (dstFormat==PIX_FMT_BGR24)
1385 int i;
1386 for (i=0;i<dstW;i++){
1387 // vertical linear interpolation && yuv2rgb in a single step:
1388 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1389 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1390 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1391 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1392 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1393 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1394 dest+= 3;
1397 else if (dstFormat==PIX_FMT_BGR565)
1399 int i;
1400 for (i=0;i<dstW;i++){
1401 // vertical linear interpolation && yuv2rgb in a single step:
1402 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1403 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1404 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1406 ((uint16_t*)dest)[i] =
1407 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1408 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1409 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1412 else if (dstFormat==PIX_FMT_BGR555)
1414 int i;
1415 for (i=0;i<dstW;i++){
1416 // vertical linear interpolation && yuv2rgb in a single step:
1417 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1418 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1419 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1421 ((uint16_t*)dest)[i] =
1422 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1423 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1424 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1427 }//FULL_UV_IPOL
1428 else
1430 #endif // if 0
/* Live MMX path starts here; kernels read filter state via &c->redDither. */
1431 #ifdef HAVE_MMX
1432 if(!(c->flags & SWS_BITEXACT)){
1433 switch(c->dstFormat)
1435 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1436 case PIX_FMT_RGB32:
1437 asm volatile(
1438 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1439 "mov %4, %%"REG_b" \n\t"
1440 "push %%"REG_BP" \n\t"
1441 YSCALEYUV2RGB(%%REGBP, %5)
1442 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1443 "pop %%"REG_BP" \n\t"
1444 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1446 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1447 "a" (&c->redDither)
1449 return;
1450 case PIX_FMT_BGR24:
1451 asm volatile(
1452 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1453 "mov %4, %%"REG_b" \n\t"
1454 "push %%"REG_BP" \n\t"
1455 YSCALEYUV2RGB(%%REGBP, %5)
1456 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1457 "pop %%"REG_BP" \n\t"
1458 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1459 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1460 "a" (&c->redDither)
1462 return;
1463 case PIX_FMT_RGB555:
1464 asm volatile(
1465 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1466 "mov %4, %%"REG_b" \n\t"
1467 "push %%"REG_BP" \n\t"
1468 YSCALEYUV2RGB(%%REGBP, %5)
1469 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1470 #ifdef DITHER1XBPP
1471 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1472 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1473 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1474 #endif
1476 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1477 "pop %%"REG_BP" \n\t"
1478 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1480 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1481 "a" (&c->redDither)
1483 return;
1484 case PIX_FMT_RGB565:
1485 asm volatile(
1486 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1487 "mov %4, %%"REG_b" \n\t"
1488 "push %%"REG_BP" \n\t"
1489 YSCALEYUV2RGB(%%REGBP, %5)
1490 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1491 #ifdef DITHER1XBPP
1492 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1493 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1494 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1495 #endif
1497 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1498 "pop %%"REG_BP" \n\t"
1499 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1500 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1501 "a" (&c->redDither)
1503 return;
1504 case PIX_FMT_YUYV422:
1505 asm volatile(
1506 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1507 "mov %4, %%"REG_b" \n\t"
1508 "push %%"REG_BP" \n\t"
1509 YSCALEYUV2PACKED(%%REGBP, %5)
1510 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1511 "pop %%"REG_BP" \n\t"
1512 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1513 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514 "a" (&c->redDither)
1516 return;
1517 default: break;
1520 #endif //HAVE_MMX
/* Generic C fallback for every other packed format. */
1521 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1525 * YV12 to RGB without scaling or interpolating
/* 1-tap packed-output pass: single luma row buf0, chroma either from uvbuf0
 * alone (uvalpha < 2048 — fast, shifts chroma by 0.5 px as noted below) or
 * averaged uvbuf0/uvbuf1 (the '1b' kernel variants).  SWS_FULL_CHR_H_INT is
 * handled by delegating to yuv2packed2 with yalpha=0.  Same register
 * save/restore convention as yuv2packed2. */
1527 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1528 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1530 const int yalpha1=0;
1531 int i;
1533 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1534 const int yalpha= 4096; //FIXME ...
1536 if (flags&SWS_FULL_CHR_H_INT)
1538 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1539 return;
1542 #ifdef HAVE_MMX
1543 if(!(flags & SWS_BITEXACT)){
1544 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1546 switch(dstFormat)
1548 case PIX_FMT_RGB32:
1549 asm volatile(
1550 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1551 "mov %4, %%"REG_b" \n\t"
1552 "push %%"REG_BP" \n\t"
1553 YSCALEYUV2RGB1(%%REGBP, %5)
1554 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1555 "pop %%"REG_BP" \n\t"
1556 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1558 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1559 "a" (&c->redDither)
1561 return;
1562 case PIX_FMT_BGR24:
1563 asm volatile(
1564 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1565 "mov %4, %%"REG_b" \n\t"
1566 "push %%"REG_BP" \n\t"
1567 YSCALEYUV2RGB1(%%REGBP, %5)
1568 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1569 "pop %%"REG_BP" \n\t"
1570 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1572 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1573 "a" (&c->redDither)
1575 return;
1576 case PIX_FMT_RGB555:
1577 asm volatile(
1578 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1579 "mov %4, %%"REG_b" \n\t"
1580 "push %%"REG_BP" \n\t"
1581 YSCALEYUV2RGB1(%%REGBP, %5)
1582 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1583 #ifdef DITHER1XBPP
1584 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1585 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1586 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1587 #endif
1588 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1589 "pop %%"REG_BP" \n\t"
1590 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1592 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1593 "a" (&c->redDither)
1595 return;
1596 case PIX_FMT_RGB565:
1597 asm volatile(
1598 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1599 "mov %4, %%"REG_b" \n\t"
1600 "push %%"REG_BP" \n\t"
1601 YSCALEYUV2RGB1(%%REGBP, %5)
1602 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1603 #ifdef DITHER1XBPP
1604 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1605 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1606 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1607 #endif
1609 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1610 "pop %%"REG_BP" \n\t"
1611 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1613 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1614 "a" (&c->redDither)
1616 return;
1617 case PIX_FMT_YUYV422:
1618 asm volatile(
1619 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1620 "mov %4, %%"REG_b" \n\t"
1621 "push %%"REG_BP" \n\t"
1622 YSCALEYUV2PACKED1(%%REGBP, %5)
1623 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1624 "pop %%"REG_BP" \n\t"
1625 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1627 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1628 "a" (&c->redDither)
1630 return;
1633 else
/* uvalpha >= 2048: average the two chroma rows ('1b' kernel variants). */
1635 switch(dstFormat)
1637 case PIX_FMT_RGB32:
1638 asm volatile(
1639 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1640 "mov %4, %%"REG_b" \n\t"
1641 "push %%"REG_BP" \n\t"
1642 YSCALEYUV2RGB1b(%%REGBP, %5)
1643 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1644 "pop %%"REG_BP" \n\t"
1645 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1647 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1648 "a" (&c->redDither)
1650 return;
1651 case PIX_FMT_BGR24:
1652 asm volatile(
1653 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1654 "mov %4, %%"REG_b" \n\t"
1655 "push %%"REG_BP" \n\t"
1656 YSCALEYUV2RGB1b(%%REGBP, %5)
1657 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1658 "pop %%"REG_BP" \n\t"
1659 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1661 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1662 "a" (&c->redDither)
1664 return;
1665 case PIX_FMT_RGB555:
1666 asm volatile(
1667 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1668 "mov %4, %%"REG_b" \n\t"
1669 "push %%"REG_BP" \n\t"
1670 YSCALEYUV2RGB1b(%%REGBP, %5)
1671 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1672 #ifdef DITHER1XBPP
1673 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1674 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1675 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1676 #endif
1677 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1678 "pop %%"REG_BP" \n\t"
1679 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1681 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1682 "a" (&c->redDither)
1684 return;
1685 case PIX_FMT_RGB565:
1686 asm volatile(
1687 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1688 "mov %4, %%"REG_b" \n\t"
1689 "push %%"REG_BP" \n\t"
1690 YSCALEYUV2RGB1b(%%REGBP, %5)
1691 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1692 #ifdef DITHER1XBPP
1693 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1694 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1695 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1696 #endif
1698 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1699 "pop %%"REG_BP" \n\t"
1700 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1702 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1703 "a" (&c->redDither)
1705 return;
1706 case PIX_FMT_YUYV422:
1707 asm volatile(
1708 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1709 "mov %4, %%"REG_b" \n\t"
1710 "push %%"REG_BP" \n\t"
1711 YSCALEYUV2PACKED1b(%%REGBP, %5)
1712 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1713 "pop %%"REG_BP" \n\t"
1714 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1716 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1717 "a" (&c->redDither)
1719 return;
1723 #endif /* HAVE_MMX */
1724 if (uvalpha < 2048)
1726 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1727 }else{
1728 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1732 //FIXME yuy2* can read up to 7 samples too much
/* Extract the Y plane from packed YUYV: dst[i] = src[2*i] for i in [0,width).
 * MMX path masks even bytes with bm01010101 and packs 16 source bytes per
 * iteration; counts up from -width so 'js' terminates at zero. */
1734 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1736 #ifdef HAVE_MMX
1737 asm volatile(
1738 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1739 "mov %0, %%"REG_a" \n\t"
1740 "1: \n\t"
1741 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1742 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1743 "pand %%mm2, %%mm0 \n\t"
1744 "pand %%mm2, %%mm1 \n\t"
1745 "packuswb %%mm1, %%mm0 \n\t"
1746 "movq %%mm0, (%2, %%"REG_a") \n\t"
1747 "add $8, %%"REG_a" \n\t"
1748 " js 1b \n\t"
1749 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1750 : "%"REG_a
1752 #else
1753 int i;
1754 for (i=0; i<width; i++)
1755 dst[i]= src[2*i];
1756 #endif
/* Extract U and V planes from packed YUYV: dstU[i]=src1[4i+1],
 * dstV[i]=src1[4i+3].  src2 must alias src1 (asserted).  MMX path shifts out
 * the Y bytes, then splits the UVUV stream into U (masked) and V (shifted),
 * 4 output pixels per iteration. */
1759 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1761 #ifdef HAVE_MMX
1762 asm volatile(
1763 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1764 "mov %0, %%"REG_a" \n\t"
1765 "1: \n\t"
1766 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1767 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1768 "psrlw $8, %%mm0 \n\t"
1769 "psrlw $8, %%mm1 \n\t"
1770 "packuswb %%mm1, %%mm0 \n\t"
1771 "movq %%mm0, %%mm1 \n\t"
1772 "psrlw $8, %%mm0 \n\t"
1773 "pand %%mm4, %%mm1 \n\t"
1774 "packuswb %%mm0, %%mm0 \n\t"
1775 "packuswb %%mm1, %%mm1 \n\t"
1776 "movd %%mm0, (%3, %%"REG_a") \n\t"
1777 "movd %%mm1, (%2, %%"REG_a") \n\t"
1778 "add $4, %%"REG_a" \n\t"
1779 " js 1b \n\t"
1780 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1781 : "%"REG_a
1783 #else
1784 int i;
1785 for (i=0; i<width; i++)
1787 dstU[i]= src1[4*i + 1];
1788 dstV[i]= src1[4*i + 3];
1790 #endif
1791 assert(src1 == src2);
1794 /* This is almost identical to the previous, end exists only because
1795 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
/* Extract Y from packed UYVY: dst[i] = src[2*i+1]; MMX path shifts the odd
 * (Y) bytes down instead of masking. */
1796 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1798 #ifdef HAVE_MMX
1799 asm volatile(
1800 "mov %0, %%"REG_a" \n\t"
1801 "1: \n\t"
1802 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1803 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1804 "psrlw $8, %%mm0 \n\t"
1805 "psrlw $8, %%mm1 \n\t"
1806 "packuswb %%mm1, %%mm0 \n\t"
1807 "movq %%mm0, (%2, %%"REG_a") \n\t"
1808 "add $8, %%"REG_a" \n\t"
1809 " js 1b \n\t"
1810 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1811 : "%"REG_a
1813 #else
1814 int i;
1815 for (i=0; i<width; i++)
1816 dst[i]= src[2*i+1];
1817 #endif
/* Extract U and V from packed UYVY: dstU[i]=src1[4i+0], dstV[i]=src1[4i+2].
 * src2 must alias src1 (asserted).  Mirror of yuy2ToUV with the initial
 * shift replaced by a mask, since chroma sits in the even bytes here. */
1820 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1822 #ifdef HAVE_MMX
1823 asm volatile(
1824 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1825 "mov %0, %%"REG_a" \n\t"
1826 "1: \n\t"
1827 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1828 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1829 "pand %%mm4, %%mm0 \n\t"
1830 "pand %%mm4, %%mm1 \n\t"
1831 "packuswb %%mm1, %%mm0 \n\t"
1832 "movq %%mm0, %%mm1 \n\t"
1833 "psrlw $8, %%mm0 \n\t"
1834 "pand %%mm4, %%mm1 \n\t"
1835 "packuswb %%mm0, %%mm0 \n\t"
1836 "packuswb %%mm1, %%mm1 \n\t"
1837 "movd %%mm0, (%3, %%"REG_a") \n\t"
1838 "movd %%mm1, (%2, %%"REG_a") \n\t"
1839 "add $4, %%"REG_a" \n\t"
1840 " js 1b \n\t"
1841 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1842 : "%"REG_a
1844 #else
1845 int i;
1846 for (i=0; i<width; i++)
1848 dstU[i]= src1[4*i + 0];
1849 dstV[i]= src1[4*i + 2];
1851 #endif
1852 assert(src1 == src2);
/* Generate a packed-RGB/BGR -> luma converter: for each pixel, unpack the
 * r/g/b fields via the given shifts/masks and compute the weighted RY/GY/BY
 * sum with rounding bias (33<<(S-1)), scaled down by S bits.  The six
 * instantiations below cover 32-bit and 15/16-bit RGB and BGR layouts. */
1855 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1856 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
1858 int i;\
1859 for (i=0; i<width; i++)\
1861 int b= (((type*)src)[i]>>shb)&maskb;\
1862 int g= (((type*)src)[i]>>shg)&maskg;\
1863 int r= (((type*)src)[i]>>shr)&maskr;\
1865 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1869 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1870 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1871 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1872 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1873 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1874 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
/* Generate two packed-RGB/BGR -> chroma converters per layout:
 *  - RENAME(name): full-resolution U/V per pixel (mask-then-shift field
 *    extraction, rounding bias 257<<(S-1)).
 *  - RENAME(name_half): horizontally-decimated variant that sums each pair
 *    of adjacent pixels' fields before the weighted sum, dividing by an
 *    extra bit (>> (S+1)); the (mask|2*mask) trick keeps the pairwise sums
 *    from bleeding across fields. */
1876 #define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1877 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1879 int i;\
1880 for (i=0; i<width; i++)\
1882 int b= (((type*)src)[i]&maskb)>>shb;\
1883 int g= (((type*)src)[i]&maskg)>>shg;\
1884 int r= (((type*)src)[i]&maskr)>>shr;\
1886 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1887 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1890 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1892 int i;\
1893 for (i=0; i<width; i++)\
1895 int pix0= ((type*)src)[2*i+0];\
1896 int pix1= ((type*)src)[2*i+1];\
1897 int g= (pix0&maskg)+(pix1&maskg);\
1898 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1899 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1901 g>>=shg;\
1903 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1904 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1908 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1909 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1910 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1911 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1912 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1913 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1915 #ifdef HAVE_MMX
/* MMX path: convert one line of 24-bit BGR or RGB pixels to 8-bit luma.
 * First loads the pmaddwd coefficient pair matching srcFormat into mm5/mm6,
 * then the main loop consumes 4 pixels (12 bytes of src) per iteration:
 * unpack bytes to words, multiply-accumulate against the coefficients, add
 * the rounding offset (mm4), shift, and pack 4 result bytes to dst.
 * dst is addressed as dst+width with a negative counter in REG_a so the
 * loop ends when the index reaches zero ("js 1b").
 * NOTE(review): the function's brace lines are missing from this
 * extraction; only comments were added here. */
1916 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
1919     if(srcFormat == PIX_FMT_BGR24){
1920         asm volatile(
1921             "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1922             "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1925     }else{
1926         asm volatile(
1927             "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1928             "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1933     asm volatile(
1934         "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1935         "mov %2, %%"REG_a" \n\t"
1936         "pxor %%mm7, %%mm7 \n\t"
1937         "1: \n\t"
1938         PREFETCH" 64(%0) \n\t"
1939         "movd (%0), %%mm0 \n\t"
1940         "movd 2(%0), %%mm1 \n\t"
1941         "movd 6(%0), %%mm2 \n\t"
1942         "movd 8(%0), %%mm3 \n\t"
1943         "add $12, %0 \n\t"
1944         "punpcklbw %%mm7, %%mm0 \n\t"
1945         "punpcklbw %%mm7, %%mm1 \n\t"
1946         "punpcklbw %%mm7, %%mm2 \n\t"
1947         "punpcklbw %%mm7, %%mm3 \n\t"
1948         "pmaddwd %%mm5, %%mm0 \n\t"
1949         "pmaddwd %%mm6, %%mm1 \n\t"
1950         "pmaddwd %%mm5, %%mm2 \n\t"
1951         "pmaddwd %%mm6, %%mm3 \n\t"
1952         "paddd %%mm1, %%mm0 \n\t"
1953         "paddd %%mm3, %%mm2 \n\t"
1954         "paddd %%mm4, %%mm0 \n\t"
1955         "paddd %%mm4, %%mm2 \n\t"
1956         "psrad $15, %%mm0 \n\t"
1957         "psrad $15, %%mm2 \n\t"
1958         "packssdw %%mm2, %%mm0 \n\t"
1959         "packuswb %%mm0, %%mm0 \n\t"
1960         "movd %%mm0, (%1, %%"REG_a") \n\t"
1961         "add $4, %%"REG_a" \n\t"
1962         " js 1b \n\t"
1963         : "+r" (src)
1964         : "r" (dst+width), "g" (-width)
1965         : "%"REG_a
/* MMX path: convert one line of 24-bit BGR/RGB pixels to 8-bit U and V
 * planes simultaneously.  The coefficient table is selected per format via
 * the "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0]) operand (%4); mm6
 * caches the table entry at offset 24.  Each iteration converts 4 pixels
 * (12 bytes): two pmaddwd passes accumulate U into mm0/mm1 and V into
 * mm2/mm4, then the shared chroma offset is added, results are shifted,
 * packed, and 4 bytes each are stored to dstU and dstV.  As in the Y
 * routine, a negative counter in REG_a indexes dstU+width/dstV+width.
 * NOTE(review): brace lines are missing from this extraction. */
1969 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
1971     asm volatile(
1972         "movq 24+%4, %%mm6 \n\t"
1973         "mov %3, %%"REG_a" \n\t"
1974         "pxor %%mm7, %%mm7 \n\t"
1975         "1: \n\t"
1976         PREFETCH" 64(%0) \n\t"
1977         "movd (%0), %%mm0 \n\t"
1978         "movd 2(%0), %%mm1 \n\t"
1979         "punpcklbw %%mm7, %%mm0 \n\t"
1980         "punpcklbw %%mm7, %%mm1 \n\t"
1981         "movq %%mm0, %%mm2 \n\t"
1982         "movq %%mm1, %%mm3 \n\t"
1983         "pmaddwd %4, %%mm0 \n\t"
1984         "pmaddwd 8+%4, %%mm1 \n\t"
1985         "pmaddwd 16+%4, %%mm2 \n\t"
1986         "pmaddwd %%mm6, %%mm3 \n\t"
1987         "paddd %%mm1, %%mm0 \n\t"
1988         "paddd %%mm3, %%mm2 \n\t"
1990         "movd 6(%0), %%mm1 \n\t"
1991         "movd 8(%0), %%mm3 \n\t"
1992         "add $12, %0 \n\t"
1993         "punpcklbw %%mm7, %%mm1 \n\t"
1994         "punpcklbw %%mm7, %%mm3 \n\t"
1995         "movq %%mm1, %%mm4 \n\t"
1996         "movq %%mm3, %%mm5 \n\t"
1997         "pmaddwd %4, %%mm1 \n\t"
1998         "pmaddwd 8+%4, %%mm3 \n\t"
1999         "pmaddwd 16+%4, %%mm4 \n\t"
2000         "pmaddwd %%mm6, %%mm5 \n\t"
2001         "paddd %%mm3, %%mm1 \n\t"
2002         "paddd %%mm5, %%mm4 \n\t"
2004         "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
2005         "paddd %%mm3, %%mm0 \n\t"
2006         "paddd %%mm3, %%mm2 \n\t"
2007         "paddd %%mm3, %%mm1 \n\t"
2008         "paddd %%mm3, %%mm4 \n\t"
2009         "psrad $15, %%mm0 \n\t"
2010         "psrad $15, %%mm2 \n\t"
2011         "psrad $15, %%mm1 \n\t"
2012         "psrad $15, %%mm4 \n\t"
2013         "packssdw %%mm1, %%mm0 \n\t"
2014         "packssdw %%mm4, %%mm2 \n\t"
2015         "packuswb %%mm0, %%mm0 \n\t"
2016         "packuswb %%mm2, %%mm2 \n\t"
2017         "movd %%mm0, (%1, %%"REG_a") \n\t"
2018         "movd %%mm2, (%2, %%"REG_a") \n\t"
2019         "add $4, %%"REG_a" \n\t"
2020         " js 1b \n\t"
2021         : "+r" (src)
2022         : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
2023         : "%"REG_a
2026 #endif
/* Convert one line of packed BGR24 (B,G,R byte order) to 8-bit luma.
 * Dispatches to the MMX routine when available, otherwise uses the scalar
 * RY/GY/BY weighted sum with a (33<<(RGB2YUV_SHIFT-1)) rounding/offset term. */
2028 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2030 #ifdef HAVE_MMX
2031     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
2032 #else
2033     int i;
2034     for (i=0; i<width; i++)
2036         int b= src[i*3+0];
2037         int g= src[i*3+1];
2038         int r= src[i*3+2];
2040         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2042 #endif /* HAVE_MMX */
/* Convert one line of packed BGR24 to 8-bit U and V.  src2 must equal src1
 * (asserted below); the second pointer exists only to match the common
 * chroma-input function signature. */
2045 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2047 #ifdef HAVE_MMX
2048     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
2049 #else
2050     int i;
2051     for (i=0; i<width; i++)
2053         int b= src1[3*i + 0];
2054         int g= src1[3*i + 1];
2055         int r= src1[3*i + 2];
2057         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2058         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2060 #endif /* HAVE_MMX */
2061     assert(src1 == src2);
/* Convert one line of packed BGR24 to U/V while averaging each pair of
 * horizontally adjacent pixels (for horizontally subsampled chroma):
 * channel sums of two pixels feed the weighted sum, and the extra +1 in
 * the final shift divides the pair sum by two.  src2 must equal src1. */
2064 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2066     int i;
2067     for (i=0; i<width; i++)
2069         int b= src1[6*i + 0] + src1[6*i + 3];
2070         int g= src1[6*i + 1] + src1[6*i + 4];
2071         int r= src1[6*i + 2] + src1[6*i + 5];
2073         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2074         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2076     assert(src1 == src2);
/* Convert one line of packed RGB24 (R,G,B byte order) to 8-bit luma.
 * Same as bgr24ToY with the channel byte order swapped; the MMX routine
 * is shared and selects its coefficients via the PIX_FMT_RGB24 argument. */
2079 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2081 #ifdef HAVE_MMX
2082     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
2083 #else
2084     int i;
2085     for (i=0; i<width; i++)
2087         int r= src[i*3+0];
2088         int g= src[i*3+1];
2089         int b= src[i*3+2];
2091         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2093 #endif
/* Convert one line of packed RGB24 to 8-bit U and V (RGB byte-order twin
 * of bgr24ToUV).  src2 must equal src1. */
2096 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2098     int i;
2099     assert(src1==src2);
2100 #ifdef HAVE_MMX
2101     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2102 #else
2103     for (i=0; i<width; i++)
2105         int r= src1[3*i + 0];
2106         int g= src1[3*i + 1];
2107         int b= src1[3*i + 2];
2109         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2110         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2112 #endif
/* Convert one line of packed RGB24 to U/V, averaging horizontally adjacent
 * pixel pairs (RGB byte-order twin of bgr24ToUV_half).  src2 must equal src1. */
2115 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2117     int i;
2118     assert(src1==src2);
2119     for (i=0; i<width; i++)
2121         int r= src1[6*i + 0] + src1[6*i + 3];
2122         int g= src1[6*i + 1] + src1[6*i + 4];
2123         int b= src1[6*i + 2] + src1[6*i + 5];
2125         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2126         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
/* Convert a line of 8-bit palettized pixels to luma: look each index up in
 * pal[] and keep the low byte (the palette entries' Y component). */
2131 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
2133     int i;
2134     for (i=0; i<width; i++)
2136         int d= src[i];
2138         dst[i]= pal[d] & 0xFF;
/* Convert a line of 8-bit palettized pixels to chroma: U is taken from
 * bits 8-15 and V from bits 16-23 of each palette entry.  src2 must
 * equal src1. */
2142 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
2144     int i;
2145     assert(src1 == src2);
2146     for (i=0; i<width; i++)
2148         int p= pal[src1[i]];
2150         dstU[i]= p>>8;
2151         dstV[i]= p>>16;
/* Expand 1-bit-per-pixel "white is 0" data to 8-bit luma: invert each
 * byte, then fan its 8 bits (MSB first) out to 0 or 255.  Only full
 * 8-pixel groups are processed (width/8). */
2155 static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2157     int i, j;
2158     for (i=0; i<width/8; i++){
2159         int d= ~src[i];
2160         for(j=0; j<8; j++)
2161             dst[8*i+j]= ((d>>(7-j))&1)*255;
/* Expand 1-bit-per-pixel "black is 0" data to 8-bit luma: fan each byte's
 * 8 bits (MSB first) out to 0 or 255, without the inversion used by
 * monowhite2Y. */
2165 static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2167     int i, j;
2168     for (i=0; i<width/8; i++){
2169         int d= src[i];
2170         for(j=0; j<8; j++)
2171             dst[8*i+j]= ((d>>(7-j))&1)*255;
2175 // bilinear / bicubic scaling
/* Horizontal FIR scaler: for each of dstW output samples, multiply
 * filterSize consecutive src bytes starting at filterPos[i] by the
 * int16 taps filter[filterSize*i .. +filterSize-1], shift right by 7,
 * and store the int16 result in dst.
 * MMX builds use three asm variants (filterSize 4, 8, and generic loop);
 * all drive a negative counter so "jnc 1b" exits when it wraps to >= 0,
 * and the pointers are pre-biased by that counter before the loop.
 * Altivec builds call hScale_altivec_real; otherwise the plain C loop
 * runs, clamping via FFMIN since bicubic taps can overflow int16 range.
 * NOTE(review): brace lines are missing from this extraction; only
 * comments were added. */
2176 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2177                                   int16_t *filter, int16_t *filterPos, long filterSize)
2179 #ifdef HAVE_MMX
2180     assert(filterSize % 4 == 0 && filterSize>0);
2181     if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2183         long counter= -2*dstW;
2184         filter-= counter*2;
2185         filterPos-= counter/2;
2186         dst-= counter/2;
2187         asm volatile(
2188 #if defined(PIC)
2189             "push %%"REG_b" \n\t"
2190 #endif
2191             "pxor %%mm7, %%mm7 \n\t"
2192             "push %%"REG_BP" \n\t" // we use 7 regs here ...
2193             "mov %%"REG_a", %%"REG_BP" \n\t"
2194             ASMALIGN(4)
2195             "1: \n\t"
2196             "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2197             "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2198             "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2199             "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2200             "movd (%3, %%"REG_a"), %%mm0 \n\t"
2201             "movd (%3, %%"REG_b"), %%mm2 \n\t"
2202             "punpcklbw %%mm7, %%mm0 \n\t"
2203             "punpcklbw %%mm7, %%mm2 \n\t"
2204             "pmaddwd %%mm1, %%mm0 \n\t"
2205             "pmaddwd %%mm2, %%mm3 \n\t"
2206             "movq %%mm0, %%mm4 \n\t"
2207             "punpckldq %%mm3, %%mm0 \n\t"
2208             "punpckhdq %%mm3, %%mm4 \n\t"
2209             "paddd %%mm4, %%mm0 \n\t"
2210             "psrad $7, %%mm0 \n\t"
2211             "packssdw %%mm0, %%mm0 \n\t"
2212             "movd %%mm0, (%4, %%"REG_BP") \n\t"
2213             "add $4, %%"REG_BP" \n\t"
2214             " jnc 1b \n\t"
2216             "pop %%"REG_BP" \n\t"
2217 #if defined(PIC)
2218             "pop %%"REG_b" \n\t"
2219 #endif
2220             : "+a" (counter)
2221             : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2222 #if !defined(PIC)
2223             : "%"REG_b
2224 #endif
2227     else if (filterSize==8)
2229         long counter= -2*dstW;
2230         filter-= counter*4;
2231         filterPos-= counter/2;
2232         dst-= counter/2;
2233         asm volatile(
2234 #if defined(PIC)
2235             "push %%"REG_b" \n\t"
2236 #endif
2237             "pxor %%mm7, %%mm7 \n\t"
2238             "push %%"REG_BP" \n\t" // we use 7 regs here ...
2239             "mov %%"REG_a", %%"REG_BP" \n\t"
2240             ASMALIGN(4)
2241             "1: \n\t"
2242             "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2243             "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2244             "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2245             "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2246             "movd (%3, %%"REG_a"), %%mm0 \n\t"
2247             "movd (%3, %%"REG_b"), %%mm2 \n\t"
2248             "punpcklbw %%mm7, %%mm0 \n\t"
2249             "punpcklbw %%mm7, %%mm2 \n\t"
2250             "pmaddwd %%mm1, %%mm0 \n\t"
2251             "pmaddwd %%mm2, %%mm3 \n\t"
2253             "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2254             "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2255             "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2256             "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2257             "punpcklbw %%mm7, %%mm4 \n\t"
2258             "punpcklbw %%mm7, %%mm2 \n\t"
2259             "pmaddwd %%mm1, %%mm4 \n\t"
2260             "pmaddwd %%mm2, %%mm5 \n\t"
2261             "paddd %%mm4, %%mm0 \n\t"
2262             "paddd %%mm5, %%mm3 \n\t"
2263             "movq %%mm0, %%mm4 \n\t"
2264             "punpckldq %%mm3, %%mm0 \n\t"
2265             "punpckhdq %%mm3, %%mm4 \n\t"
2266             "paddd %%mm4, %%mm0 \n\t"
2267             "psrad $7, %%mm0 \n\t"
2268             "packssdw %%mm0, %%mm0 \n\t"
2269             "movd %%mm0, (%4, %%"REG_BP") \n\t"
2270             "add $4, %%"REG_BP" \n\t"
2271             " jnc 1b \n\t"
2273             "pop %%"REG_BP" \n\t"
2274 #if defined(PIC)
2275             "pop %%"REG_b" \n\t"
2276 #endif
2277             : "+a" (counter)
2278             : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2279 #if !defined(PIC)
2280             : "%"REG_b
2281 #endif
2284     else
2286         uint8_t *offset = src+filterSize;
2287         long counter= -2*dstW;
2288         //filter-= counter*filterSize/2;
2289         filterPos-= counter/2;
2290         dst-= counter/2;
2291         asm volatile(
2292             "pxor %%mm7, %%mm7 \n\t"
2293             ASMALIGN(4)
2294             "1: \n\t"
2295             "mov %2, %%"REG_c" \n\t"
2296             "movzwl (%%"REG_c", %0), %%eax \n\t"
2297             "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2298             "mov %5, %%"REG_c" \n\t"
2299             "pxor %%mm4, %%mm4 \n\t"
2300             "pxor %%mm5, %%mm5 \n\t"
2301             "2: \n\t"
2302             "movq (%1), %%mm1 \n\t"
2303             "movq (%1, %6), %%mm3 \n\t"
2304             "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2305             "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2306             "punpcklbw %%mm7, %%mm0 \n\t"
2307             "punpcklbw %%mm7, %%mm2 \n\t"
2308             "pmaddwd %%mm1, %%mm0 \n\t"
2309             "pmaddwd %%mm2, %%mm3 \n\t"
2310             "paddd %%mm3, %%mm5 \n\t"
2311             "paddd %%mm0, %%mm4 \n\t"
2312             "add $8, %1 \n\t"
2313             "add $4, %%"REG_c" \n\t"
2314             "cmp %4, %%"REG_c" \n\t"
2315             " jb 2b \n\t"
2316             "add %6, %1 \n\t"
2317             "movq %%mm4, %%mm0 \n\t"
2318             "punpckldq %%mm5, %%mm4 \n\t"
2319             "punpckhdq %%mm5, %%mm0 \n\t"
2320             "paddd %%mm0, %%mm4 \n\t"
2321             "psrad $7, %%mm4 \n\t"
2322             "packssdw %%mm4, %%mm4 \n\t"
2323             "mov %3, %%"REG_a" \n\t"
2324             "movd %%mm4, (%%"REG_a", %0) \n\t"
2325             "add $4, %0 \n\t"
2326             " jnc 1b \n\t"
2328             : "+r" (counter), "+r" (filter)
2329             : "m" (filterPos), "m" (dst), "m"(offset),
2330             "m" (src), "r" (filterSize*2)
2331             : "%"REG_a, "%"REG_c, "%"REG_d
2334 #else
2335 #ifdef HAVE_ALTIVEC
2336     hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2337 #else
2338     int i;
2339     for (i=0; i<dstW; i++)
2341         int j;
2342         int srcPos= filterPos[i];
2343         int val=0;
2344         //printf("filterPos: %d\n", filterPos[i]);
2345         for (j=0; j<filterSize; j++)
2347             //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2348             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2350         //filter += hFilterSize;
2351         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2352         //dst[i] = val>>7;
2354 #endif /* HAVE_ALTIVEC */
2355 #endif /* HAVE_MMX */
2357 // *** horizontal scale Y line to temp buffer
/* hyscale: produce one horizontally scaled 16-bit luma line in dst.
 * Stage 1: if srcFormat is not already 8-bit gray-like, convert the packed
 * input line to 8-bit luma in formatConvBuffer using the per-format
 * helpers above and retarget src at it.
 * Stage 2: scale src to dstWidth samples, choosing between
 *   - the generic FIR RENAME(hScale) (always when !SWS_FAST_BILINEAR, and
 *     on MMX builds also when the MMX2 path can't be used),
 *   - the MMX2 "funny" code blocks (runtime-generated scaler called via
 *     FUNNY_Y_CODE, 8 unrolled segments; trailing edge pixels are then
 *     patched with src[srcW-1]*128),
 *   - a plain x86 asm bilinear loop using 16.16 fixed-point stepping
 *     (xInc split into xInc_shr16 + xInc_mask, advanced with add/adc),
 *   - or a portable C bilinear loop on non-x86.
 * Stage 3: if src and dst ranges differ (and dst is not RGB/BGR), remap
 * the luma range in place with fixed-point multiplies.
 * NOTE(review): brace lines are missing from this extraction; only
 * comments were added. */
2358 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2359                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2360                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2361                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2362                                    int32_t *mmx2FilterPos, uint32_t *pal)
2364     if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2366         RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
2367         src= formatConvBuffer;
2369     else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2371         RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
2372         src= formatConvBuffer;
2374     else if (srcFormat==PIX_FMT_RGB32)
2376         RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
2377         src= formatConvBuffer;
2379     else if (srcFormat==PIX_FMT_RGB32_1)
2381         RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2382         src= formatConvBuffer;
2384     else if (srcFormat==PIX_FMT_BGR24)
2386         RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
2387         src= formatConvBuffer;
2389     else if (srcFormat==PIX_FMT_BGR565)
2391         RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
2392         src= formatConvBuffer;
2394     else if (srcFormat==PIX_FMT_BGR555)
2396         RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
2397         src= formatConvBuffer;
2399     else if (srcFormat==PIX_FMT_BGR32)
2401         RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
2402         src= formatConvBuffer;
2404     else if (srcFormat==PIX_FMT_BGR32_1)
2406         RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2407         src= formatConvBuffer;
2409     else if (srcFormat==PIX_FMT_RGB24)
2411         RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
2412         src= formatConvBuffer;
2414     else if (srcFormat==PIX_FMT_RGB565)
2416         RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
2417         src= formatConvBuffer;
2419     else if (srcFormat==PIX_FMT_RGB555)
2421         RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
2422         src= formatConvBuffer;
2424     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2426         RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2427         src= formatConvBuffer;
2429     else if (srcFormat==PIX_FMT_MONOBLACK)
2431         RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
2432         src= formatConvBuffer;
2434     else if (srcFormat==PIX_FMT_MONOWHITE)
2436         RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
2437         src= formatConvBuffer;
2440 #ifdef HAVE_MMX
2441     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2442     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2443 #else
2444     if (!(flags&SWS_FAST_BILINEAR))
2445 #endif
2447         RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2449     else // fast bilinear upscale / crap downscale
2451 #if defined(ARCH_X86)
2452 #ifdef HAVE_MMX2
2453         int i;
2454 #if defined(PIC)
2455         uint64_t ebxsave __attribute__((aligned(8)));
2456 #endif
2457         if (canMMX2BeUsed)
2459             asm volatile(
2460 #if defined(PIC)
2461                 "mov %%"REG_b", %5 \n\t"
2462 #endif
2463                 "pxor %%mm7, %%mm7 \n\t"
2464                 "mov %0, %%"REG_c" \n\t"
2465                 "mov %1, %%"REG_D" \n\t"
2466                 "mov %2, %%"REG_d" \n\t"
2467                 "mov %3, %%"REG_b" \n\t"
2468                 "xor %%"REG_a", %%"REG_a" \n\t" // i
2469                 PREFETCH" (%%"REG_c") \n\t"
2470                 PREFETCH" 32(%%"REG_c") \n\t"
2471                 PREFETCH" 64(%%"REG_c") \n\t"
2473 #ifdef ARCH_X86_64
2475 #define FUNNY_Y_CODE \
2476                 "movl (%%"REG_b"), %%esi \n\t"\
2477                 "call *%4 \n\t"\
2478                 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2479                 "add %%"REG_S", %%"REG_c" \n\t"\
2480                 "add %%"REG_a", %%"REG_D" \n\t"\
2481                 "xor %%"REG_a", %%"REG_a" \n\t"\
2483 #else
2485 #define FUNNY_Y_CODE \
2486                 "movl (%%"REG_b"), %%esi \n\t"\
2487                 "call *%4 \n\t"\
2488                 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2489                 "add %%"REG_a", %%"REG_D" \n\t"\
2490                 "xor %%"REG_a", %%"REG_a" \n\t"\
2492 #endif /* ARCH_X86_64 */
2494 FUNNY_Y_CODE
2495 FUNNY_Y_CODE
2496 FUNNY_Y_CODE
2497 FUNNY_Y_CODE
2498 FUNNY_Y_CODE
2499 FUNNY_Y_CODE
2500 FUNNY_Y_CODE
2501 FUNNY_Y_CODE
2503 #if defined(PIC)
2504                 "mov %5, %%"REG_b" \n\t"
2505 #endif
2506                 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2507                 "m" (funnyYCode)
2508 #if defined(PIC)
2509                 ,"m" (ebxsave)
2510 #endif
2511                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2512 #if !defined(PIC)
2513                 ,"%"REG_b
2514 #endif
2516             for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2518         else
2520 #endif /* HAVE_MMX2 */
2521         long xInc_shr16 = xInc >> 16;
2522         uint16_t xInc_mask = xInc & 0xffff;
2523         //NO MMX just normal asm ...
2524         asm volatile(
2525             "xor %%"REG_a", %%"REG_a" \n\t" // i
2526             "xor %%"REG_d", %%"REG_d" \n\t" // xx
2527             "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2528             ASMALIGN(4)
2529             "1: \n\t"
2530             "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2531             "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2532             "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2533             "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2534             "shll $16, %%edi \n\t"
2535             "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2536             "mov %1, %%"REG_D" \n\t"
2537             "shrl $9, %%esi \n\t"
2538             "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2539             "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2540             "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2542             "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2543             "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2544             "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2545             "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2546             "shll $16, %%edi \n\t"
2547             "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2548             "mov %1, %%"REG_D" \n\t"
2549             "shrl $9, %%esi \n\t"
2550             "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2551             "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2552             "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2555             "add $2, %%"REG_a" \n\t"
2556             "cmp %2, %%"REG_a" \n\t"
2557             " jb 1b \n\t"
2560             :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2561             : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2563 #ifdef HAVE_MMX2
2564         } //if MMX2 can't be used
2565 #endif
2566 #else
2567         int i;
2568         unsigned int xpos=0;
2569         for (i=0;i<dstWidth;i++)
2571             register unsigned int xx=xpos>>16;
2572             register unsigned int xalpha=(xpos&0xFFFF)>>9;
2573             dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2574             xpos+=xInc;
2576 #endif /* defined(ARCH_X86) */
2579     if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2580         int i;
2581         //FIXME all pal and rgb srcFormats could do this convertion as well
2582         //FIXME all scalers more complex than bilinear could do half of this transform
2583         if(c->srcRange){
2584             for (i=0; i<dstWidth; i++)
2585                 dst[i]= (dst[i]*14071 + 33561947)>>14;
2586         }else{
2587             for (i=0; i<dstWidth; i++)
2588                 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
/* hcscale: chroma counterpart of hyscale.  Produces two horizontally
 * scaled 16-bit chroma lines: U at dst[0..dstWidth) and V at
 * dst[VOFW..VOFW+dstWidth).
 * Stage 1: convert the packed input to planar U/V in formatConvBuffer /
 * formatConvBuffer+VOFW when needed; RGB-family inputs use the *_half
 * converters when chroma is horizontally subsampled
 * (c->chrSrcHSubSample).  Gray/mono inputs have no chroma and return
 * immediately.
 * Stage 2: scale both planes, via either two RENAME(hScale) calls, the
 * MMX2 "funny" code (4 FUNNY_UV_CODE segments per plane, V offset by
 * VOF; trailing edge pixels patched afterwards), a plain x86 asm
 * bilinear loop interleaving U and V per iteration, or the portable C
 * bilinear loop.
 * Stage 3: if src and dst ranges differ (and dst is not RGB/BGR), remap
 * both chroma planes in place with fixed-point multiplies.
 * NOTE(review): brace lines are missing from this extraction; only
 * comments were added. */
2593 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2594                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2595                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2596                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2597                                    int32_t *mmx2FilterPos, uint32_t *pal)
2599     if (srcFormat==PIX_FMT_YUYV422)
2601         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2602         src1= formatConvBuffer;
2603         src2= formatConvBuffer+VOFW;
2605     else if (srcFormat==PIX_FMT_UYVY422)
2607         RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2608         src1= formatConvBuffer;
2609         src2= formatConvBuffer+VOFW;
2611     else if (srcFormat==PIX_FMT_RGB32)
2613         if(c->chrSrcHSubSample)
2614             RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2615         else
2616             RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2617         src1= formatConvBuffer;
2618         src2= formatConvBuffer+VOFW;
2620     else if (srcFormat==PIX_FMT_RGB32_1)
2622         if(c->chrSrcHSubSample)
2623             RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2624         else
2625             RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2626         src1= formatConvBuffer;
2627         src2= formatConvBuffer+VOFW;
2629     else if (srcFormat==PIX_FMT_BGR24)
2631         if(c->chrSrcHSubSample)
2632             RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2633         else
2634             RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2635         src1= formatConvBuffer;
2636         src2= formatConvBuffer+VOFW;
2638     else if (srcFormat==PIX_FMT_BGR565)
2640         if(c->chrSrcHSubSample)
2641             RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2642         else
2643             RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2644         src1= formatConvBuffer;
2645         src2= formatConvBuffer+VOFW;
2647     else if (srcFormat==PIX_FMT_BGR555)
2649         if(c->chrSrcHSubSample)
2650             RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2651         else
2652             RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2653         src1= formatConvBuffer;
2654         src2= formatConvBuffer+VOFW;
2656     else if (srcFormat==PIX_FMT_BGR32)
2658         if(c->chrSrcHSubSample)
2659             RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2660         else
2661             RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2662         src1= formatConvBuffer;
2663         src2= formatConvBuffer+VOFW;
2665     else if (srcFormat==PIX_FMT_BGR32_1)
2667         if(c->chrSrcHSubSample)
2668             RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2669         else
2670             RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2671         src1= formatConvBuffer;
2672         src2= formatConvBuffer+VOFW;
2674     else if (srcFormat==PIX_FMT_RGB24)
2676         if(c->chrSrcHSubSample)
2677             RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2678         else
2679             RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2680         src1= formatConvBuffer;
2681         src2= formatConvBuffer+VOFW;
2683     else if (srcFormat==PIX_FMT_RGB565)
2685         if(c->chrSrcHSubSample)
2686             RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2687         else
2688             RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2689         src1= formatConvBuffer;
2690         src2= formatConvBuffer+VOFW;
2692     else if (srcFormat==PIX_FMT_RGB555)
2694         if(c->chrSrcHSubSample)
2695             RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2696         else
2697             RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2698         src1= formatConvBuffer;
2699         src2= formatConvBuffer+VOFW;
2701     else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2703         return;
2705     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2707         RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2708         src1= formatConvBuffer;
2709         src2= formatConvBuffer+VOFW;
2712 #ifdef HAVE_MMX
2713     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2714     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2715 #else
2716     if (!(flags&SWS_FAST_BILINEAR))
2717 #endif
2719         RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2720         RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2722     else // fast bilinear upscale / crap downscale
2724 #if defined(ARCH_X86)
2725 #ifdef HAVE_MMX2
2726         int i;
2727 #if defined(PIC)
2728         uint64_t ebxsave __attribute__((aligned(8)));
2729 #endif
2730         if (canMMX2BeUsed)
2732             asm volatile(
2733 #if defined(PIC)
2734                 "mov %%"REG_b", %6 \n\t"
2735 #endif
2736                 "pxor %%mm7, %%mm7 \n\t"
2737                 "mov %0, %%"REG_c" \n\t"
2738                 "mov %1, %%"REG_D" \n\t"
2739                 "mov %2, %%"REG_d" \n\t"
2740                 "mov %3, %%"REG_b" \n\t"
2741                 "xor %%"REG_a", %%"REG_a" \n\t" // i
2742                 PREFETCH" (%%"REG_c") \n\t"
2743                 PREFETCH" 32(%%"REG_c") \n\t"
2744                 PREFETCH" 64(%%"REG_c") \n\t"
2746 #ifdef ARCH_X86_64
2748 #define FUNNY_UV_CODE \
2749                 "movl (%%"REG_b"), %%esi \n\t"\
2750                 "call *%4 \n\t"\
2751                 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2752                 "add %%"REG_S", %%"REG_c" \n\t"\
2753                 "add %%"REG_a", %%"REG_D" \n\t"\
2754                 "xor %%"REG_a", %%"REG_a" \n\t"\
2756 #else
2758 #define FUNNY_UV_CODE \
2759                 "movl (%%"REG_b"), %%esi \n\t"\
2760                 "call *%4 \n\t"\
2761                 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2762                 "add %%"REG_a", %%"REG_D" \n\t"\
2763                 "xor %%"REG_a", %%"REG_a" \n\t"\
2765 #endif /* ARCH_X86_64 */
2767 FUNNY_UV_CODE
2768 FUNNY_UV_CODE
2769 FUNNY_UV_CODE
2770 FUNNY_UV_CODE
2771                 "xor %%"REG_a", %%"REG_a" \n\t" // i
2772                 "mov %5, %%"REG_c" \n\t" // src
2773                 "mov %1, %%"REG_D" \n\t" // buf1
2774                 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2775                 PREFETCH" (%%"REG_c") \n\t"
2776                 PREFETCH" 32(%%"REG_c") \n\t"
2777                 PREFETCH" 64(%%"REG_c") \n\t"
2779 FUNNY_UV_CODE
2780 FUNNY_UV_CODE
2781 FUNNY_UV_CODE
2782 FUNNY_UV_CODE
2784 #if defined(PIC)
2785                 "mov %6, %%"REG_b" \n\t"
2786 #endif
2787                 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2788                 "m" (funnyUVCode), "m" (src2)
2789 #if defined(PIC)
2790                 ,"m" (ebxsave)
2791 #endif
2792                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2793 #if !defined(PIC)
2794                 ,"%"REG_b
2795 #endif
2797             for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2799                 //printf("%d %d %d\n", dstWidth, i, srcW);
2800                 dst[i] = src1[srcW-1]*128;
2801                 dst[i+VOFW] = src2[srcW-1]*128;
2804         else
2806 #endif /* HAVE_MMX2 */
2807             long xInc_shr16 = (long) (xInc >> 16);
2808             uint16_t xInc_mask = xInc & 0xffff;
2809             asm volatile(
2810                 "xor %%"REG_a", %%"REG_a" \n\t" // i
2811                 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2812                 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2813                 ASMALIGN(4)
2814                 "1: \n\t"
2815                 "mov %0, %%"REG_S" \n\t"
2816                 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2817                 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2818                 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2819                 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2820                 "shll $16, %%edi \n\t"
2821                 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2822                 "mov %1, %%"REG_D" \n\t"
2823                 "shrl $9, %%esi \n\t"
2824                 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2826                 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2827                 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2828                 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2829                 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2830                 "shll $16, %%edi \n\t"
2831                 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2832                 "mov %1, %%"REG_D" \n\t"
2833                 "shrl $9, %%esi \n\t"
2834                 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2836                 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2837                 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2838                 "add $1, %%"REG_a" \n\t"
2839                 "cmp %2, %%"REG_a" \n\t"
2840                 " jb 1b \n\t"
2842 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2843    which is needed to support GCC 4.0. */
2844 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2845                 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2846 #else
2847                 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2848 #endif
2849                 "r" (src2)
2850                 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2852 #ifdef HAVE_MMX2
2853         } //if MMX2 can't be used
2854 #endif
2855 #else
2856         int i;
2857         unsigned int xpos=0;
2858         for (i=0;i<dstWidth;i++)
2860             register unsigned int xx=xpos>>16;
2861             register unsigned int xalpha=(xpos&0xFFFF)>>9;
2862             dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2863             dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2864             /* slower
2865             dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2866             dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2868             xpos+=xInc;
2870 #endif /* defined(ARCH_X86) */
2872     if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2873         int i;
2874         //FIXME all pal and rgb srcFormats could do this convertion as well
2875         //FIXME all scalers more complex than bilinear could do half of this transform
2876         if(c->srcRange){
2877             for (i=0; i<dstWidth; i++){
2878                 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2879                 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2881         }else{
2882             for (i=0; i<dstWidth; i++){
2883                 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2884                 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2890 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2891 int srcSliceH, uint8_t* dst[], int dstStride[]){
2893 /* load a few things into local vars to make the code more readable? and faster */
2894 const int srcW= c->srcW;
2895 const int dstW= c->dstW;
2896 const int dstH= c->dstH;
2897 const int chrDstW= c->chrDstW;
2898 const int chrSrcW= c->chrSrcW;
2899 const int lumXInc= c->lumXInc;
2900 const int chrXInc= c->chrXInc;
2901 const int dstFormat= c->dstFormat;
2902 const int srcFormat= c->srcFormat;
2903 const int flags= c->flags;
2904 const int canMMX2BeUsed= c->canMMX2BeUsed;
2905 int16_t *vLumFilterPos= c->vLumFilterPos;
2906 int16_t *vChrFilterPos= c->vChrFilterPos;
2907 int16_t *hLumFilterPos= c->hLumFilterPos;
2908 int16_t *hChrFilterPos= c->hChrFilterPos;
2909 int16_t *vLumFilter= c->vLumFilter;
2910 int16_t *vChrFilter= c->vChrFilter;
2911 int16_t *hLumFilter= c->hLumFilter;
2912 int16_t *hChrFilter= c->hChrFilter;
2913 int32_t *lumMmxFilter= c->lumMmxFilter;
2914 int32_t *chrMmxFilter= c->chrMmxFilter;
2915 const int vLumFilterSize= c->vLumFilterSize;
2916 const int vChrFilterSize= c->vChrFilterSize;
2917 const int hLumFilterSize= c->hLumFilterSize;
2918 const int hChrFilterSize= c->hChrFilterSize;
2919 int16_t **lumPixBuf= c->lumPixBuf;
2920 int16_t **chrPixBuf= c->chrPixBuf;
2921 const int vLumBufSize= c->vLumBufSize;
2922 const int vChrBufSize= c->vChrBufSize;
2923 uint8_t *funnyYCode= c->funnyYCode;
2924 uint8_t *funnyUVCode= c->funnyUVCode;
2925 uint8_t *formatConvBuffer= c->formatConvBuffer;
2926 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2927 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2928 int lastDstY;
2929 uint32_t *pal=NULL;
2931 /* vars which will change and which we need to store back in the context */
2932 int dstY= c->dstY;
2933 int lumBufIndex= c->lumBufIndex;
2934 int chrBufIndex= c->chrBufIndex;
2935 int lastInLumBuf= c->lastInLumBuf;
2936 int lastInChrBuf= c->lastInChrBuf;
2938 if (isPacked(c->srcFormat)){
2939 pal= (uint32_t *)src[1];
2940 src[0]=
2941 src[1]=
2942 src[2]= src[0];
2943 srcStride[0]=
2944 srcStride[1]=
2945 srcStride[2]= srcStride[0];
2947 srcStride[1]<<= c->vChrDrop;
2948 srcStride[2]<<= c->vChrDrop;
2950 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2951 // (int)dst[0], (int)dst[1], (int)dst[2]);
2953 #if 0 //self test FIXME move to a vfilter or something
2955 static volatile int i=0;
2956 i++;
2957 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2958 selfTest(src, srcStride, c->srcW, c->srcH);
2959 i--;
2961 #endif
2963 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2964 //dstStride[0],dstStride[1],dstStride[2]);
2966 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2968 static int firstTime=1; //FIXME move this into the context perhaps
2969 if (flags & SWS_PRINT_INFO && firstTime)
2971 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2972 " ->cannot do aligned memory accesses anymore\n");
2973 firstTime=0;
2977 /* Note the user might start scaling the picture in the middle so this
2978 will not get executed. This is not really intended but works
2979 currently, so people might do it. */
2980 if (srcSliceY ==0){
2981 lumBufIndex=0;
2982 chrBufIndex=0;
2983 dstY=0;
2984 lastInLumBuf= -1;
2985 lastInChrBuf= -1;
2988 lastDstY= dstY;
2990 for (;dstY < dstH; dstY++){
2991 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2992 const int chrDstY= dstY>>c->chrDstVSubSample;
2993 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2994 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2996 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2997 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2998 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2999 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3001 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3002 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3003 //handle holes (FAST_BILINEAR & weird filters)
3004 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3005 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3006 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3007 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
3008 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
3010 // Do we have enough lines in this slice to output the dstY line
3011 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3013 //Do horizontal scaling
3014 while(lastInLumBuf < lastLumSrcY)
3016 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3017 lumBufIndex++;
3018 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
3019 assert(lumBufIndex < 2*vLumBufSize);
3020 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3021 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3022 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3023 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3024 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3025 funnyYCode, c->srcFormat, formatConvBuffer,
3026 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3027 lastInLumBuf++;
3029 while(lastInChrBuf < lastChrSrcY)
3031 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3032 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3033 chrBufIndex++;
3034 assert(chrBufIndex < 2*vChrBufSize);
3035 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
3036 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3037 //FIXME replace parameters through context struct (some at least)
3039 if (!(isGray(srcFormat) || isGray(dstFormat)))
3040 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3041 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3042 funnyUVCode, c->srcFormat, formatConvBuffer,
3043 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3044 lastInChrBuf++;
3046 //wrap buf index around to stay inside the ring buffer
3047 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3048 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3050 else // not enough lines left in this slice -> load the rest in the buffer
3052 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3053 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3054 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3055 vChrBufSize, vLumBufSize);*/
3057 //Do horizontal scaling
3058 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3060 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3061 lumBufIndex++;
3062 assert(lumBufIndex < 2*vLumBufSize);
3063 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3064 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3065 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3066 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3067 funnyYCode, c->srcFormat, formatConvBuffer,
3068 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3069 lastInLumBuf++;
3071 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3073 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3074 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3075 chrBufIndex++;
3076 assert(chrBufIndex < 2*vChrBufSize);
3077 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3078 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3080 if (!(isGray(srcFormat) || isGray(dstFormat)))
3081 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3082 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3083 funnyUVCode, c->srcFormat, formatConvBuffer,
3084 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3085 lastInChrBuf++;
3087 //wrap buf index around to stay inside the ring buffer
3088 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3089 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3090 break; //we can't output a dstY line so let's try with the next slice
3093 #ifdef HAVE_MMX
3094 b5Dither= ff_dither8[dstY&1];
3095 g6Dither= ff_dither4[dstY&1];
3096 g5Dither= ff_dither8[dstY&1];
3097 r5Dither= ff_dither8[(dstY+1)&1];
3098 #endif
3099 if (dstY < dstH-2)
3101 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3102 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3103 #ifdef HAVE_MMX
3104 int i;
3105 if (flags & SWS_ACCURATE_RND){
3106 int s= APCK_SIZE / 8;
3107 for (i=0; i<vLumFilterSize; i+=2){
3108 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
3109 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
3110 lumMmxFilter[s*i+APCK_COEF/4 ]=
3111 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
3112 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3114 for (i=0; i<vChrFilterSize; i+=2){
3115 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
3116 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
3117 chrMmxFilter[s*i+APCK_COEF/4 ]=
3118 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
3119 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3121 }else{
3122 for (i=0; i<vLumFilterSize; i++)
3124 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3125 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3126 lumMmxFilter[4*i+2]=
3127 lumMmxFilter[4*i+3]=
3128 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3130 for (i=0; i<vChrFilterSize; i++)
3132 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3133 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3134 chrMmxFilter[4*i+2]=
3135 chrMmxFilter[4*i+3]=
3136 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3139 #endif
3140 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3141 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3142 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3143 RENAME(yuv2nv12X)(c,
3144 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3145 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3146 dest, uDest, dstW, chrDstW, dstFormat);
3148 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
3150 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3151 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3152 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3154 int16_t *lumBuf = lumPixBuf[0];
3155 int16_t *chrBuf= chrPixBuf[0];
3156 RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3158 else //General YV12
3160 RENAME(yuv2yuvX)(c,
3161 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3162 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3163 dest, uDest, vDest, dstW, chrDstW);
3166 else
3168 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3169 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3170 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3172 int chrAlpha= vChrFilter[2*dstY+1];
3173 if(flags & SWS_FULL_CHR_H_INT){
3174 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
3175 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3176 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3177 dest, dstW, dstY);
3178 }else{
3179 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3180 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3183 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3185 int lumAlpha= vLumFilter[2*dstY+1];
3186 int chrAlpha= vChrFilter[2*dstY+1];
3187 lumMmxFilter[2]=
3188 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3189 chrMmxFilter[2]=
3190 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3191 if(flags & SWS_FULL_CHR_H_INT){
3192 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
3193 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3194 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3195 dest, dstW, dstY);
3196 }else{
3197 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3198 dest, dstW, lumAlpha, chrAlpha, dstY);
3201 else //general RGB
3203 if(flags & SWS_FULL_CHR_H_INT){
3204 yuv2rgbXinC_full(c,
3205 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3206 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3207 dest, dstW, dstY);
3208 }else{
3209 RENAME(yuv2packedX)(c,
3210 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3211 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3212 dest, dstW, dstY);
3217 else // hmm looks like we can't use MMX here without overwriting this array's tail
3219 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3220 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3221 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3222 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3223 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3224 yuv2nv12XinC(
3225 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3226 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3227 dest, uDest, dstW, chrDstW, dstFormat);
3229 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
3231 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3232 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3233 yuv2yuvXinC(
3234 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3235 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3236 dest, uDest, vDest, dstW, chrDstW);
3238 else
3240 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3241 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3242 if(flags & SWS_FULL_CHR_H_INT){
3243 yuv2rgbXinC_full(c,
3244 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3245 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3246 dest, dstW, dstY);
3247 }else{
3248 yuv2packedXinC(c,
3249 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3250 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3251 dest, dstW, dstY);
3257 #ifdef HAVE_MMX
3258 asm volatile(SFENCE:::"memory");
3259 asm volatile(EMMS:::"memory");
3260 #endif
3261 /* store changed local vars back in the context */
3262 c->dstY= dstY;
3263 c->lumBufIndex= lumBufIndex;
3264 c->chrBufIndex= chrBufIndex;
3265 c->lastInLumBuf= lastInLumBuf;
3266 c->lastInChrBuf= lastInChrBuf;
3268 return dstY - lastDstY;