/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#if HAVE_AMD3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH " # nop"
#define PREFETCHW " # nop"
#endif

#if HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

#if HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif HAVE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#if HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
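
/* The REAL_FOO/FOO wrapper pairs used throughout this file make sure that
 * macro arguments (registers, offsets) are fully expanded before they are
 * pasted into the instruction strings. */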
#if HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
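
/*
 * MMX vertical scaler kernels. YSCALEYUV2YV12X walks a zero-terminated list
 * of (source line, coefficient) pairs, scaling each line with pmulhw and
 * accumulating the sum; the _ACCURATE variant below keeps 32-bit
 * intermediates with pmaddwd for more precise rounding before packing the
 * result back to bytes.
 */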
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) \
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
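
/*
 * Unscaled (single source line) output: the samples carry 7 fractional bits,
 * so the plain variant just shifts them away, while the _ACCURATE variant
 * first adds 64, i.e. one half in this fixed-point format (synthesized below
 * with pcmpeqw/psrlw/psllw), to round to nearest.
 */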
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm1 \n\t"\
    "paddw %%mm5, %%mm7 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
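
/*
 * Two-line (bilinear) kernels: each output sample is formed as
 * buf1 + (buf0 - buf1) * alpha, implemented as a pmulhw on the difference
 * plus the down-shifted buf1 sample; the RGB variants then apply the
 * per-context YUV->RGB coefficients stored at the *_COEFF offsets.
 */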
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c) \
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c) REAL_YSCALEYUV2RGB_YA(index, c)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c) \
    REAL_YSCALEYUV2RGB_COEFF(c)
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
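
/*
 * The *1 kernels above read a single chroma line with no vertical
 * interpolation; the *1b kernels below average two chroma lines instead
 * (paddw plus a shift). yuv2packed1() picks between them based on uvalpha.
 */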
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
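
/*
 * Writer macros: by convention the kernels above leave packed bytes in
 * mm2 (B), mm4 (G) and mm5 (R); each WRITE* macro interleaves them into one
 * destination pixel format, stores with MOVNTQ and advances the loop counter.
 */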
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)

#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)

#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (dst))\
    MOVNTQ(%%mm2, 8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
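
/* Three generations of the 24-bit writer are kept: WRITEBGR24OLD (mask/or
   recombination), WRITEBGR24MMX (unpack based) and WRITEBGR24MMX2 (pshufw
   based); the dispatch above selects between the latter two. */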

#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
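
/*
 * The functions below are declared through RENAME(), so this template can be
 * instantiated once per CPU flavor (plain C, MMX, MMX2, ...) by the file
 * that includes it.
 */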
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            if (uDest){
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
            if (uDest){
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if HAVE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
}

static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
    int i;
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        long p= uDest ? 3 : 1;
        uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[3]= {dest, uDest, vDest};
        long counter[3] = {dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND){
            while(p--){
                __asm__ volatile(
                    YSCALEYUV2YV121_ACCURATE
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                       "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }else{
            while(p--){
                __asm__ volatile(
                    YSCALEYUV2YV121
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                       "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }
        return;
    }
#endif
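    /* C fallback: the intermediate samples carry 7 fractional bits, so
     * (x+64)>>7 rounds to nearest; a set bit 8 in the result means the value
     * fell outside 0..255 and must be clamped. */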
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;

        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
}

/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, long dstW, long dstY)
{
#if HAVE_MMX
    long dummy=0;
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            switch(c->dstFormat){
            case PIX_FMT_RGB32:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX_ACCURATE
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }else{
            switch(c->dstFormat)
            {
            case PIX_FMT_RGB32:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }
    }
#endif /* HAVE_MMX */
#if HAVE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of altivec_yuv2packedX() */
    if (!(c->flags & SWS_BITEXACT) &&
        (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
         c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
            altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                                 chrFilter, chrSrc, chrFilterSize,
                                 dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       dest, dstW, dstY);
}

/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        switch(c->dstFormat)
        {
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        case PIX_FMT_RGB32:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        default: break;
        }
    }
#endif //HAVE_MMX
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}
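
/* YSCALE_YUV_2_ANYRGB_C is the generic C fallback; its arguments select the
   C kernels used for RGB-like, other packed, gray16 and monochrome output
   respectively. */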

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT)
    {
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
        return;
    }

#if HAVE_MMX
    if(!(flags & SWS_BITEXACT)){
        if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        {
            switch(dstFormat)
            {
            case PIX_FMT_RGB32:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            }
        }
        else
        {
            switch(dstFormat)
            {
            case PIX_FMT_RGB32:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* HAVE_MMX */
    if (uvalpha < 2048)
    {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }else{
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}

//FIXME yuy2* can read up to 7 samples too much

static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm2 \n\t"
    "mov %0, %%"REG_a" \n\t"
    "1: \n\t"
    "movq (%1, %%"REG_a",2), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
    "pand %%mm2, %%mm0 \n\t"
    "pand %%mm2, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, (%2, %%"REG_a") \n\t"
    "add $8, %%"REG_a" \n\t"
    " js 1b \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}

static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4 \n\t"
    "mov %0, %%"REG_a" \n\t"
    "1: \n\t"
    "movq (%1, %%"REG_a",4), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "psrlw $8, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "pand %%mm4, %%mm1 \n\t"
    "packuswb %%mm0, %%mm0 \n\t"
    "packuswb %%mm1, %%mm1 \n\t"
    "movd %%mm0, (%3, %%"REG_a") \n\t"
    "movd %%mm1, (%2, %%"REG_a") \n\t"
    "add $4, %%"REG_a" \n\t"
    " js 1b \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}

/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    __asm__ volatile(
    "mov %0, %%"REG_a" \n\t"
    "1: \n\t"
    "movq (%1, %%"REG_a",2), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "psrlw $8, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, (%2, %%"REG_a") \n\t"
    "add $8, %%"REG_a" \n\t"
    " js 1b \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}

static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4 \n\t"
    "mov %0, %%"REG_a" \n\t"
    "1: \n\t"
    "movq (%1, %%"REG_a",4), %%mm0 \n\t"
    "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
    "pand %%mm4, %%mm0 \n\t"
    "pand %%mm4, %%mm1 \n\t"
    "packuswb %%mm1, %%mm0 \n\t"
    "movq %%mm0, %%mm1 \n\t"
    "psrlw $8, %%mm0 \n\t"
    "pand %%mm4, %%mm1 \n\t"
    "packuswb %%mm0, %%mm0 \n\t"
    "packuswb %%mm1, %%mm1 \n\t"
    "movd %%mm0, (%3, %%"REG_a") \n\t"
    "movd %%mm1, (%2, %%"REG_a") \n\t"
    "add $4, %%"REG_a" \n\t"
    " js 1b \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}

#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]>>shb)&maskb;\
        int g= (((type*)src)[i]>>shg)&maskg;\
        int r= (((type*)src)[i]>>shr)&maskr;\
\
        dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
    }\
}

BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
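
/* The bias 33<<((S)-1) equals 16.5<<(S): it folds the +16 luma offset of the
   limited YUV range together with +0.5 for round-to-nearest into one add. */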
1645 #define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1646 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1648 int i;\
1649 for (i=0; i<width; i++)\
1651 int b= (((type*)src)[i]&maskb)>>shb;\
1652 int g= (((type*)src)[i]&maskg)>>shg;\
1653 int r= (((type*)src)[i]&maskr)>>shr;\
1655 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1656 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1659 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1661 int i;\
1662 for (i=0; i<width; i++)\
1664 int pix0= ((type*)src)[2*i+0];\
1665 int pix1= ((type*)src)[2*i+1];\
1666 int g= (pix0&maskg)+(pix1&maskg);\
1667 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1668 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1670 g>>=shg;\
1672 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1673 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
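
/* 24 bpp has a 3-byte pixel stride, which x86 addressing cannot scale by,
 * so the MMX routines below advance the source pointer by 12 bytes (four
 * pixels) per iteration and use pmaddwd with interleaved coefficients. */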
#if HAVE_MMX
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24){
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
            :
        );
    }else{
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
        "mov %2, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "movd 6(%0), %%mm2 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "paddd %%mm4, %%mm2 \n\t"
        "psrad $15, %%mm0 \n\t"
        "psrad $15, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%1, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" (-width)
    : "%"REG_a
    );
}
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
{
    __asm__ volatile(
        "movq 24+%4, %%mm6 \n\t"
        "mov %3, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pmaddwd %4, %%mm0 \n\t"
        "pmaddwd 8+%4, %%mm1 \n\t"
        "pmaddwd 16+%4, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"

        "movd 6(%0), %%mm1 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "pmaddwd %4, %%mm1 \n\t"
        "pmaddwd 8+%4, %%mm3 \n\t"
        "pmaddwd 16+%4, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm5 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm5, %%mm4 \n\t"

        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
        "paddd %%mm3, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm3, %%mm4 \n\t"
        "psrad $15, %%mm0 \n\t"
        "psrad $15, %%mm2 \n\t"
        "psrad $15, %%mm1 \n\t"
        "psrad $15, %%mm4 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm4, %%mm2 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm2, %%mm2 \n\t"
        "movd %%mm0, (%1, %%"REG_a") \n\t"
        "movd %%mm2, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
    : "%"REG_a
    );
}
#endif

static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++)
    {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif /* HAVE_MMX */
}

static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++)
    {
        int b= src1[3*i + 0];
        int g= src1[3*i + 1];
        int r= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif /* HAVE_MMX */
    assert(src1 == src2);
}

static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
    int i;
    for (i=0; i<width; i++)
    {
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
    assert(src1 == src2);
}

static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
#else
    int i;
    for (i=0; i<width; i++)
    {
        int r= src[i*3+0];
        int g= src[i*3+1];
        int b= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif
}

static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
#else
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif
}

static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        int r= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int b= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
}

static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
{
    int i;
    for (i=0; i<width; i++)
    {
        int d= src[i];

        dst[i]= pal[d] & 0xFF;
    }
}

static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
{
    int i;
    assert(src1 == src2);
    for (i=0; i<width; i++)
    {
        int p= pal[src1[i]];

        dstU[i]= p>>8;
        dstV[i]= p>>16;
    }
}
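
/* pal[] entries are packed YUV prepared by the caller: Y in the low byte,
 * U in bits 8-15 and V in bits 16-23, so the 8-bit stores above need no
 * explicit masking. */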

static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
    int i, j;
    for (i=0; i<width/8; i++){
        int d= ~src[i];
        for(j=0; j<8; j++)
            dst[8*i+j]= ((d>>(7-j))&1)*255;
    }
}

static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
    int i, j;
    for (i=0; i<width/8; i++){
        int d= src[i];
        for(j=0; j<8; j++)
            dst[8*i+j]= ((d>>(7-j))&1)*255;
    }
}
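
/* The 1 bpp formats pack eight pixels per byte, MSB first; both expanders
 * above turn each bit into 0 or 255 luma, monowhite2Y inverting the byte
 * first because a 0 bit means white in that format. */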

// bilinear / bicubic scaling
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
{
#if HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
    {
        long counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
        "push %%"REG_b" \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "push %%"REG_BP" \n\t" // we use 7 regs here ...
        "mov %%"REG_a", %%"REG_BP" \n\t"
        ASMALIGN(4)
        "1: \n\t"
        "movzwl (%2, %%"REG_BP"), %%eax \n\t"
        "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
        "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
        "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
        "movd (%3, %%"REG_a"), %%mm0 \n\t"
        "movd (%3, %%"REG_b"), %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "pmaddwd %%mm1, %%mm0 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        "punpckldq %%mm3, %%mm0 \n\t"
        "punpckhdq %%mm3, %%mm4 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "psrad $7, %%mm0 \n\t"
        "packssdw %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%4, %%"REG_BP") \n\t"
        "add $4, %%"REG_BP" \n\t"
        " jnc 1b \n\t"

        "pop %%"REG_BP" \n\t"
#if defined(PIC)
        "pop %%"REG_b" \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else if (filterSize==8)
    {
        long counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
        "push %%"REG_b" \n\t"
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "push %%"REG_BP" \n\t" // we use 7 regs here ...
        "mov %%"REG_a", %%"REG_BP" \n\t"
        ASMALIGN(4)
        "1: \n\t"
        "movzwl (%2, %%"REG_BP"), %%eax \n\t"
        "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
        "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
        "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
        "movd (%3, %%"REG_a"), %%mm0 \n\t"
        "movd (%3, %%"REG_b"), %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "pmaddwd %%mm1, %%mm0 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"

        "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
        "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
        "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
        "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "pmaddwd %%mm1, %%mm4 \n\t"
        "pmaddwd %%mm2, %%mm5 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "paddd %%mm5, %%mm3 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        "punpckldq %%mm3, %%mm0 \n\t"
        "punpckhdq %%mm3, %%mm4 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "psrad $7, %%mm0 \n\t"
        "packssdw %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%4, %%"REG_BP") \n\t"
        "add $4, %%"REG_BP" \n\t"
        " jnc 1b \n\t"

        "pop %%"REG_BP" \n\t"
#if defined(PIC)
        "pop %%"REG_b" \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else
    {
        uint8_t *offset = src+filterSize;
        long counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
        "pxor %%mm7, %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        "mov %2, %%"REG_c" \n\t"
        "movzwl (%%"REG_c", %0), %%eax \n\t"
        "movzwl 2(%%"REG_c", %0), %%edx \n\t"
        "mov %5, %%"REG_c" \n\t"
        "pxor %%mm4, %%mm4 \n\t"
        "pxor %%mm5, %%mm5 \n\t"
        "2: \n\t"
        "movq (%1), %%mm1 \n\t"
        "movq (%1, %6), %%mm3 \n\t"
        "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
        "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "pmaddwd %%mm1, %%mm0 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "paddd %%mm3, %%mm5 \n\t"
        "paddd %%mm0, %%mm4 \n\t"
        "add $8, %1 \n\t"
        "add $4, %%"REG_c" \n\t"
        "cmp %4, %%"REG_c" \n\t"
        " jb 2b \n\t"
        "add %6, %1 \n\t"
        "movq %%mm4, %%mm0 \n\t"
        "punpckldq %%mm5, %%mm4 \n\t"
        "punpckhdq %%mm5, %%mm0 \n\t"
        "paddd %%mm0, %%mm4 \n\t"
        "psrad $7, %%mm4 \n\t"
        "packssdw %%mm4, %%mm4 \n\t"
        "mov %3, %%"REG_a" \n\t"
        "movd %%mm4, (%%"REG_a", %0) \n\t"
        "add $4, %0 \n\t"
        " jnc 1b \n\t"

        : "+r" (counter), "+r" (filter)
        : "m" (filterPos), "m" (dst), "m"(offset),
          "m" (src), "r" (filterSize*2)
        : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    int i;
    for (i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++)
        {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* HAVE_ALTIVEC */
#endif /* HAVE_MMX */
}

// *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint32_t *pal)
{
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32_1)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32_1)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_MONOBLACK)
    {
        RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_MONOWHITE)
    {
        RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
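
    /* Whatever the input format was, src now points at a plain 8-bit luma
     * line (either the original data or the converted copy in
     * formatConvBuffer), so the scaling paths below handle only one layout. */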

#if HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if ARCH_X86
#if HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            __asm__ volatile(
#if defined(PIC)
            "mov %%"REG_b", %5 \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "mov %0, %%"REG_c" \n\t"
            "mov %1, %%"REG_D" \n\t"
            "mov %2, %%"REG_d" \n\t"
            "mov %3, %%"REG_b" \n\t"
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

#if ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
            "add %%"REG_S", %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

#if defined(PIC)
            "mov %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
               "m" (funnyYCode)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
            );
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
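            /* Any output column whose source position falls past srcW-1 is
             * filled with the last input pixel; the *128 matches the
             * horizontal scaler output, which carries 7 fractional bits. */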
        }
        else
        {
#endif /* HAVE_MMX2 */
        long xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        __asm__ volatile(
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "xor %%"REG_d", %%"REG_d" \n\t" // xx
        "xorl %%ecx, %%ecx \n\t" // 2*xalpha
        ASMALIGN(4)
        "1: \n\t"
        "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov %1, %%"REG_D" \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
        "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry

        "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov %1, %%"REG_D" \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
        "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry

        "add $2, %%"REG_a" \n\t"
        "cmp %2, %%"REG_a" \n\t"
        " jb 1b \n\t"

        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++)
    {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
    }
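
    /* xpos above is 16.16 fixed point (xx = integer part, xalpha = top 7
     * bits of the fraction). The conversion below remaps full-range luma to
     * or from the 16..235 MPEG range in the 7-bit fractional domain:
     * 14071/2^14 ~= 219/255 for narrowing, 19077/2^14 ~= 255/219 back. */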
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this conversion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++)
                dst[i]= (dst[i]*14071 + 33561947)>>14;
        }else{
            for (i=0; i<dstWidth; i++)
                dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
        }
    }
}

inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint32_t *pal)
{
    if (srcFormat==PIX_FMT_YUYV422)
    {
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_UYVY422)
    {
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB32_1)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        else
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR32_1)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        else
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
    {
        return;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
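
    /* As with luma, both chroma lines now share a single layout: U at
     * formatConvBuffer and V at formatConvBuffer+VOFW (unless the input was
     * already planar and src1/src2 were left untouched). */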

#if HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if ARCH_X86
#if HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            __asm__ volatile(
#if defined(PIC)
            "mov %%"REG_b", %6 \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "mov %0, %%"REG_c" \n\t"
            "mov %1, %%"REG_D" \n\t"
            "mov %2, %%"REG_d" \n\t"
            "mov %3, %%"REG_b" \n\t"
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

#if ARCH_X86_64

#define FUNNY_UV_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
            "add %%"REG_S", %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_UV_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
            "xor %%"REG_a", %%"REG_a" \n\t" // i
            "mov %5, %%"REG_c" \n\t" // src
            "mov %1, %%"REG_D" \n\t" // buf1
            "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

#if defined(PIC)
            "mov %6, %%"REG_b" \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
               "m" (funnyUVCode), "m" (src2)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
            );
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
            {
                //printf("%d %d %d\n", dstWidth, i, srcW);
                dst[i] = src1[srcW-1]*128;
                dst[i+VOFW] = src2[srcW-1]*128;
            }
        }
        else
        {
#endif /* HAVE_MMX2 */
        long xInc_shr16 = (long) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff;
        __asm__ volatile(
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "xor %%"REG_d", %%"REG_d" \n\t" // xx
        "xorl %%ecx, %%ecx \n\t" // 2*xalpha
        ASMALIGN(4)
        "1: \n\t"
        "mov %0, %%"REG_S" \n\t"
        "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov %1, %%"REG_D" \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"

        "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
        "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
        "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll $16, %%edi \n\t"
        "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov %1, %%"REG_D" \n\t"
        "shrl $9, %%esi \n\t"
        "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"

        "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
        "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
        "add $1, %%"REG_a" \n\t"
        "cmp %2, %%"REG_a" \n\t"
        " jb 1b \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC 4.0. */
#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
        :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
        :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
        "r" (src2)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++)
    {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
    }
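
    /* Chroma counterpart of the luma range conversion, again on 7-bit
     * fractional samples: 1799/2^11 ~= 224/255 and 4663/2^12 ~= 255/224,
     * with the additive constants keeping the result centered on the 128
     * chroma bias. */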
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this conversion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++){
                dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
                dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
            }
        }else{
            for (i=0; i<dstWidth; i++){
                dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
                dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
            }
        }
    }
}

static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

    /* load a few things into local vars to make the code more readable and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const int dstFormat= c->dstFormat;
    const int srcFormat= c->srcFormat;
    const int flags= c->flags;
    const int canMMX2BeUsed= c->canMMX2BeUsed;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *funnyYCode= c->funnyYCode;
    uint8_t *funnyUVCode= c->funnyUVCode;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    if (isPacked(c->srcFormat)){
        src[0]=
        src[1]=
        src[2]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]= srcStride[0];
    }
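    /* Packed input has a single plane, so all three pointers and strides
     * alias plane 0 and the per-plane address arithmetic below stays
     * uniform. */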
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
    //       (int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
    {
    static volatile int i=0;
    i++;
    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
    i--;
    }
#endif

    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
    //       dstStride[0],dstStride[1],dstStride[2]);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
    {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready)
        {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   " ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0){
        lumBufIndex=0;
        chrBufIndex=0;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for (;dstY < dstH; dstY++){
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

        //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
        //       dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        // Do we have enough lines in this slice to output the dstY line?
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
        {
            //Do horizontal scaling
            while(lastInLumBuf < lastLumSrcY)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                //printf("%d %d\n", lumBufIndex, vLumBufSize);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf < lastChrSrcY)
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
                //FIXME replace parameters through context struct (some at least)

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        }
        else // not enough lines left in this slice -> load the rest in the buffer
        {
            /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
               firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
               lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
               vChrBufSize, vLumBufSize);*/

            //Do horizontal scaling
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
            break; //we can't output a dstY line so let's try with the next slice
        }

#if HAVE_MMX
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
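        /* The dither rows alternate per output line; green uses ff_dither8
         * when it only has 5 bits (the 555 formats) and the finer ff_dither4
         * otherwise, matching the extra green bit of 565. */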
        if (dstY < dstH-2)
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#if HAVE_MMX
            int i;
            if (flags & SWS_ACCURATE_RND){
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2){
                    *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
                    *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
                    lumMmxFilter[s*i+APCK_COEF/4 ]=
                    lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                }
                for (i=0; i<vChrFilterSize; i+=2){
                    *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
                    *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
                    chrMmxFilter[s*i+APCK_COEF/4 ]=
                    chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            }else{
                for (i=0; i<vLumFilterSize; i++)
                {
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                }
                for (i=0; i<vChrFilterSize; i++)
                {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
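            /* The packed filter buffers interleave each source-line pointer
             * with its coefficient, the coefficient replicated into both
             * 16-bit halves via *0x10001 so the MMX multiplies see it in
             * every word lane; ACCURATE_RND instead packs two taps per
             * APCK_SIZE block. */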
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                RENAME(yuv2nv12X)(c,
                    vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
                {
                    int16_t *lumBuf = lumPixBuf[0];
                    int16_t *chrBuf= chrPixBuf[0];
                    RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                }
                else //General YV12
                {
                    RENAME(yuv2yuvX)(c,
                        vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, uDest, vDest, dstW, chrDstW);
                }
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                            dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                }
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                            dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                }
                else //general RGB
                {
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packedX)(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }
                }
            }
        }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, vDest, dstW, chrDstW);
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT){
                    yuv2rgbXinC_full(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }else{
                    yuv2packedXinC(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }
            }
        }
    }

#if HAVE_MMX
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
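    /* sfence makes the weakly-ordered MOVNTQ stores globally visible; emms
     * (femms on 3DNow!) restores the x87 register state after MMX code. */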
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;