/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif HAVE_MMX2
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
#if HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

#if HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif HAVE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#if HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
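
/* Note: movntq is a weakly-ordered, non-temporal store that bypasses the
 * cache; code that uses MOVNTQ is expected to execute SFENCE afterwards
 * (see the SFENCE macro above) before the written data may be relied upon.
 * The plain-MMX fallback is an ordinary movq and needs no fence. */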

#if HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) \
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
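
/* YSCALEYUV2RGB chains the three stages above: _UV interpolates the chroma
 * between the two source lines and derives the ug/vg products, _YA
 * interpolates the two luma quads, and _COEFF applies the remaining matrix
 * coefficients and combines everything, leaving B, G and R as packed bytes
 * in mm2, mm4 and mm5 for the WRITE* macros below. */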

#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t"\
    "psraw $7, %%mm4 \n\t"\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t"\
    "psraw $7, %%mm7 \n\t"\

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t"\
    "psrlw $8, %%mm4 \n\t"\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t"\
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index  ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index  ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
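
/* WRITEBGR32 is a straight 8-bit transpose: the first round of
 * punpck{l,h}bw builds GB and AR byte pairs, punpck{l,h}wd then merges them
 * into four ARGB quads that are streamed out with MOVNTQ, 32 bytes
 * (8 pixels) per loop iteration. */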

#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
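
/* RGB565 packing sketch: the bF8/bFC masks keep the top 5/6/5 bits of B/G/R,
 * the psrlq/psllq shifts line them up, and the two por merges yield
 * ((R & 0xF8) << 8) | ((G & 0xFC) << 3) | (B >> 3) per pixel.
 * WRITERGB15 below is identical except G also keeps 5 bits and R moves
 * down one position. */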

#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /*    B2        B1       B0 */\
    "pand %%mm0, %%mm3 \n\t" /*    G2        G1       G0 */\
    "pand %%mm7, %%mm6 \n\t" /*       R1        R0       */\
\
    "psllq $8, %%mm3 \n\t" /* G2        G1       G0    */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand %%mm7, %%mm3 \n\t" /*       G4        G3       */\
    "pand %%mm0, %%mm6 \n\t" /*    R4        R3       R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4     G3 B3    */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /*       B7        B6       */\
    "pand %%mm0, %%mm3 \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif

#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1,  (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
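
/* WRITEYUY2 expects U in mm3, V in mm4 and the two luma quads in mm1/mm7;
 * the packuswb/punpcklbw sequence interleaves them into YUYV byte order,
 * i.e. Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... (one chroma pair per two luma samples). */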

static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            if (uDest){
                YSCALEYUV2YV12X_ACCURATE(  "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest){
                YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
            if (uDest){
                YSCALEYUV2YV12X(  "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest){
                YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if HAVE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
}

static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc, int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        long p= 4;
        uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND){
            while(p--){
                if (dst[p]){
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                           "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }else{
            while(p--){
                if (dst[p]){
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                           "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
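    /* The C fallback below clips with a branch only in the rare out-of-range
     * case: after the +64 rounding and >>7, any result outside 0..255 has
     * bit 8 set (or, for negatives, the sign bits), so (val&256) is a cheap
     * "needs clipping" test. */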
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;

        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256){
                if      (u<0)   u=0;
                else if (u>255) u=255;
                if      (v<0)   v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++){
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}

/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
#if HAVE_MMX
    x86_reg dummy=0;
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            switch(c->dstFormat){
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "movq %%mm2, "U_TEMP"(%0) \n\t"
                    "movq %%mm4, "V_TEMP"(%0) \n\t"
                    "movq %%mm5, "Y_TEMP"(%0) \n\t"
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
                    "movq "Y_TEMP"(%0), %%mm5 \n\t"
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)

                    YSCALEYUV2PACKEDX_END
                }else{
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX_ACCURATE
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }else{
            switch(c->dstFormat)
            {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                }else{
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }
    }
#endif /* HAVE_MMX */
#if HAVE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of ff_yuv2packedX_altivec() */
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
        (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
         c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
        ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
                               chrFilter, chrSrc, chrFilterSize,
                               dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       alpSrc, dest, dstW, dstY);
}

/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint16_t *abuf0, uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int  yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        switch(c->dstFormat)
        {
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
#if ARCH_X86_64
                __asm__ volatile(
                    YSCALEYUV2RGB(%%REGBP, %5)
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
                    "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
                       "a" (&c->redDither)
                       ,"r" (abuf0), "r" (abuf1)
                    : "%"REG_BP
                );
#else
                *(uint16_t **)(&c->u_temp)=abuf0;
                *(uint16_t **)(&c->v_temp)=abuf1;
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "push %0 \n\t"
                    "push %1 \n\t"
                    "mov "U_TEMP"(%5), %0 \n\t"
                    "mov "V_TEMP"(%5), %1 \n\t"
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
                    "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb %%mm7, %%mm1 \n\t"
                    "pop %1 \n\t"
                    "pop %0 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
#endif
            }else{
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
            }
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        default: break;
        }
    }
#endif //HAVE_MMX
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...
1391 if (flags&SWS_FULL_CHR_H_INT)
1393 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1394 return;
1397 #if HAVE_MMX
1398 if(!(flags & SWS_BITEXACT)){
1399 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1401 switch(dstFormat)
1403 case PIX_FMT_RGB32:
1404 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1405 __asm__ volatile(
1406 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1407 "mov %4, %%"REG_b" \n\t"
1408 "push %%"REG_BP" \n\t"
1409 YSCALEYUV2RGB1(%%REGBP, %5)
1410 YSCALEYUV2RGB1_ALPHA(%%REGBP)
1411 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1412 "pop %%"REG_BP" \n\t"
1413 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1415 :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1416 "a" (&c->redDither)
1418 }else{
1419 __asm__ volatile(
1420 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1421 "mov %4, %%"REG_b" \n\t"
1422 "push %%"REG_BP" \n\t"
1423 YSCALEYUV2RGB1(%%REGBP, %5)
1424 "pcmpeqd %%mm7, %%mm7 \n\t"
1425 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1426 "pop %%"REG_BP" \n\t"
1427 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1429 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1430 "a" (&c->redDither)
1433 return;
1434 case PIX_FMT_BGR24:
1435 __asm__ volatile(
1436 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1437 "mov %4, %%"REG_b" \n\t"
1438 "push %%"REG_BP" \n\t"
1439 YSCALEYUV2RGB1(%%REGBP, %5)
1440 "pxor %%mm7, %%mm7 \n\t"
1441 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1442 "pop %%"REG_BP" \n\t"
1443 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1445 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1446 "a" (&c->redDither)
1448 return;
1449 case PIX_FMT_RGB555:
1450 __asm__ volatile(
1451 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1452 "mov %4, %%"REG_b" \n\t"
1453 "push %%"REG_BP" \n\t"
1454 YSCALEYUV2RGB1(%%REGBP, %5)
1455 "pxor %%mm7, %%mm7 \n\t"
1456 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1457 #ifdef DITHER1XBPP
1458 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1459 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1460 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1461 #endif
1462 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1463 "pop %%"REG_BP" \n\t"
1464 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1466 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1467 "a" (&c->redDither)
1469 return;
1470 case PIX_FMT_RGB565:
1471 __asm__ volatile(
1472 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1473 "mov %4, %%"REG_b" \n\t"
1474 "push %%"REG_BP" \n\t"
1475 YSCALEYUV2RGB1(%%REGBP, %5)
1476 "pxor %%mm7, %%mm7 \n\t"
1477 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1478 #ifdef DITHER1XBPP
1479 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1480 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1481 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1482 #endif
1484 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1485 "pop %%"REG_BP" \n\t"
1486 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1488 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1489 "a" (&c->redDither)
1491 return;
1492 case PIX_FMT_YUYV422:
1493 __asm__ volatile(
1494 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1495 "mov %4, %%"REG_b" \n\t"
1496 "push %%"REG_BP" \n\t"
1497 YSCALEYUV2PACKED1(%%REGBP, %5)
1498 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1499 "pop %%"REG_BP" \n\t"
1500 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1502 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1503 "a" (&c->redDither)
1505 return;
        else
        {
            switch(dstFormat)
            {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                }else{
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7 \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* HAVE_MMX */
    if (uvalpha < 2048)
    {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }else{
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}

//FIXME yuy2* can read up to 7 samples too much

static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
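
/* bm01010101 masks out the odd (chroma) bytes of the YUYV stream, keeping the
 * even luma bytes; packuswb then compacts the masked words into 8 contiguous
 * Y bytes per iteration. The index counts up from -width to 0 over pointers
 * pre-biased by +width, so the loop repeats while the sign flag is still set
 * (" js 1b"). The same indexing scheme is used by the readers below. */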
1654 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1656 #if HAVE_MMX
1657 __asm__ volatile(
1658 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1659 "mov %0, %%"REG_a" \n\t"
1660 "1: \n\t"
1661 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1662 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1663 "psrlw $8, %%mm0 \n\t"
1664 "psrlw $8, %%mm1 \n\t"
1665 "packuswb %%mm1, %%mm0 \n\t"
1666 "movq %%mm0, %%mm1 \n\t"
1667 "psrlw $8, %%mm0 \n\t"
1668 "pand %%mm4, %%mm1 \n\t"
1669 "packuswb %%mm0, %%mm0 \n\t"
1670 "packuswb %%mm1, %%mm1 \n\t"
1671 "movd %%mm0, (%3, %%"REG_a") \n\t"
1672 "movd %%mm1, (%2, %%"REG_a") \n\t"
1673 "add $4, %%"REG_a" \n\t"
1674 " js 1b \n\t"
1675 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1676 : "%"REG_a
1678 #else
1679 int i;
1680 for (i=0; i<width; i++)
1682 dstU[i]= src1[4*i + 1];
1683 dstV[i]= src1[4*i + 3];
1685 #endif
1686 assert(src1 == src2);
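/* Deinterleave sketch for yuy2ToUV: psrlw $8 drops the Y bytes of 8 YUYV
 * pairs, and the first packuswb leaves mm0 = UVUVUVUV. From there a second
 * psrlw $8 isolates V while pand with bm01010101 isolates U; the final
 * packuswb/movd pairs emit 4 U bytes to %2 (dstU) and 4 V bytes to %3
 * (dstV) per iteration. uyvyToUV below is the same dance with the initial
 * mask/shift swapped, since UYVY stores chroma in the even bytes. */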
1689 /* This is almost identical to the previous function, and exists only because
1690  * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1691 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1693 #if HAVE_MMX
1694 __asm__ volatile(
1695 "mov %0, %%"REG_a" \n\t"
1696 "1: \n\t"
1697 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1698 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1699 "psrlw $8, %%mm0 \n\t"
1700 "psrlw $8, %%mm1 \n\t"
1701 "packuswb %%mm1, %%mm0 \n\t"
1702 "movq %%mm0, (%2, %%"REG_a") \n\t"
1703 "add $8, %%"REG_a" \n\t"
1704 " js 1b \n\t"
1705 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1706 : "%"REG_a
1708 #else
1709 int i;
1710 for (i=0; i<width; i++)
1711 dst[i]= src[2*i+1];
1712 #endif
1715 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1717 #if HAVE_MMX
1718 __asm__ volatile(
1719 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1720 "mov %0, %%"REG_a" \n\t"
1721 "1: \n\t"
1722 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1723 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1724 "pand %%mm4, %%mm0 \n\t"
1725 "pand %%mm4, %%mm1 \n\t"
1726 "packuswb %%mm1, %%mm0 \n\t"
1727 "movq %%mm0, %%mm1 \n\t"
1728 "psrlw $8, %%mm0 \n\t"
1729 "pand %%mm4, %%mm1 \n\t"
1730 "packuswb %%mm0, %%mm0 \n\t"
1731 "packuswb %%mm1, %%mm1 \n\t"
1732 "movd %%mm0, (%3, %%"REG_a") \n\t"
1733 "movd %%mm1, (%2, %%"REG_a") \n\t"
1734 "add $4, %%"REG_a" \n\t"
1735 " js 1b \n\t"
1736 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1737 : "%"REG_a
1739 #else
1740 int i;
1741 for (i=0; i<width; i++)
1743 dstU[i]= src1[4*i + 0];
1744 dstV[i]= src1[4*i + 2];
1746 #endif
1747 assert(src1 == src2);
1750 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1751 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
1753 int i;\
1754 for (i=0; i<width; i++)\
1756 int b= (((type*)src)[i]>>shb)&maskb;\
1757 int g= (((type*)src)[i]>>shg)&maskg;\
1758 int r= (((type*)src)[i]>>shr)&maskr;\
1760 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1764 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1765 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY , BY<< 8, RGB2YUV_SHIFT+8)
1766 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY , RGB2YUV_SHIFT+8)
1767 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY , RGB2YUV_SHIFT+7)
1768 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1769 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
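/* Editorial sketch (not used by the scaler): the rounding constant
 * 33<<(S-1) equals (16 + 0.5)<<S, i.e. the +16 offset of limited-range
 * luma plus 0.5 for round-to-nearest. For the 15/16-bit formats the
 * channels are not shifted down to bit 0; the coefficient shift and the
 * enlarged S absorb the channel position instead, saving per-pixel work.
 * A scalar equivalent for one RGB555 pixel, with a hypothetical name: */
static inline int RENAME(rgb15ToY_ref)(uint16_t pix)
{
    int r = (pix >> 10) & 0x1F;
    int g = (pix >>  5) & 0x1F;
    int b =  pix        & 0x1F;
    /* 5-bit channels carry 1/8 the weight of 8-bit ones, so both the
       rounding shift and the final shift drop by 3 */
    return (RY*r + GY*g + BY*b + (33 << (RGB2YUV_SHIFT-4))) >> (RGB2YUV_SHIFT-3);
}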
1771 static inline void RENAME(abgrToA)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused){
1772 int i;
1773 for (i=0; i<width; i++){
1774 dst[i]= src[4*i];
1778 #define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1779 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1781 int i;\
1782 for (i=0; i<width; i++)\
1784 int b= (((type*)src)[i]&maskb)>>shb;\
1785 int g= (((type*)src)[i]&maskg)>>shg;\
1786 int r= (((type*)src)[i]&maskr)>>shr;\
1788 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1789 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1792 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1794 int i;\
1795 for (i=0; i<width; i++)\
1797 int pix0= ((type*)src)[2*i+0];\
1798 int pix1= ((type*)src)[2*i+1];\
1799 int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
1800 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1801 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1802 g&= maskg|(2*maskg);\
1804 g>>=shg;\
1806 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1807 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1811 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1812 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8)
1813 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8)
1814 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7)
1815 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1816 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
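/* Why the name ## _half variants above can average two packed pixels
 * without unpacking (sketch): the sum of two 5- or 6-bit fields needs one
 * extra bit, so each summed field is masked with mask|(2*mask) to keep its
 * carry bit. Green is extracted first, g = (pix0&~(maskr|maskb)) +
 * (pix1&~(maskr|maskb)); subtracting g from pix0+pix1 removes green's
 * carry out of its field, after which the red and blue sums can be masked
 * from the same word. The rounding term 257<<S with a final >>(S+1) is the
 * two-pixel form of 257<<(S-1) >> S: the +128 chroma offset plus 0.5. */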
1818 #if HAVE_MMX
1819 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
1822 if(srcFormat == PIX_FMT_BGR24){
1823 __asm__ volatile(
1824 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1825 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1828 }else{
1829 __asm__ volatile(
1830 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1831 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1836 __asm__ volatile(
1837 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1838 "mov %2, %%"REG_a" \n\t"
1839 "pxor %%mm7, %%mm7 \n\t"
1840 "1: \n\t"
1841 PREFETCH" 64(%0) \n\t"
1842 "movd (%0), %%mm0 \n\t"
1843 "movd 2(%0), %%mm1 \n\t"
1844 "movd 6(%0), %%mm2 \n\t"
1845 "movd 8(%0), %%mm3 \n\t"
1846 "add $12, %0 \n\t"
1847 "punpcklbw %%mm7, %%mm0 \n\t"
1848 "punpcklbw %%mm7, %%mm1 \n\t"
1849 "punpcklbw %%mm7, %%mm2 \n\t"
1850 "punpcklbw %%mm7, %%mm3 \n\t"
1851 "pmaddwd %%mm5, %%mm0 \n\t"
1852 "pmaddwd %%mm6, %%mm1 \n\t"
1853 "pmaddwd %%mm5, %%mm2 \n\t"
1854 "pmaddwd %%mm6, %%mm3 \n\t"
1855 "paddd %%mm1, %%mm0 \n\t"
1856 "paddd %%mm3, %%mm2 \n\t"
1857 "paddd %%mm4, %%mm0 \n\t"
1858 "paddd %%mm4, %%mm2 \n\t"
1859 "psrad $15, %%mm0 \n\t"
1860 "psrad $15, %%mm2 \n\t"
1861 "packssdw %%mm2, %%mm0 \n\t"
1862 "packuswb %%mm0, %%mm0 \n\t"
1863 "movd %%mm0, (%1, %%"REG_a") \n\t"
1864 "add $4, %%"REG_a" \n\t"
1865 " js 1b \n\t"
1866 : "+r" (src)
1867 : "r" (dst+width), "g" ((x86_reg)-width)
1868 : "%"REG_a
1872 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
1874 __asm__ volatile(
1875 "movq 24+%4, %%mm6 \n\t"
1876 "mov %3, %%"REG_a" \n\t"
1877 "pxor %%mm7, %%mm7 \n\t"
1878 "1: \n\t"
1879 PREFETCH" 64(%0) \n\t"
1880 "movd (%0), %%mm0 \n\t"
1881 "movd 2(%0), %%mm1 \n\t"
1882 "punpcklbw %%mm7, %%mm0 \n\t"
1883 "punpcklbw %%mm7, %%mm1 \n\t"
1884 "movq %%mm0, %%mm2 \n\t"
1885 "movq %%mm1, %%mm3 \n\t"
1886 "pmaddwd %4, %%mm0 \n\t"
1887 "pmaddwd 8+%4, %%mm1 \n\t"
1888 "pmaddwd 16+%4, %%mm2 \n\t"
1889 "pmaddwd %%mm6, %%mm3 \n\t"
1890 "paddd %%mm1, %%mm0 \n\t"
1891 "paddd %%mm3, %%mm2 \n\t"
1893 "movd 6(%0), %%mm1 \n\t"
1894 "movd 8(%0), %%mm3 \n\t"
1895 "add $12, %0 \n\t"
1896 "punpcklbw %%mm7, %%mm1 \n\t"
1897 "punpcklbw %%mm7, %%mm3 \n\t"
1898 "movq %%mm1, %%mm4 \n\t"
1899 "movq %%mm3, %%mm5 \n\t"
1900 "pmaddwd %4, %%mm1 \n\t"
1901 "pmaddwd 8+%4, %%mm3 \n\t"
1902 "pmaddwd 16+%4, %%mm4 \n\t"
1903 "pmaddwd %%mm6, %%mm5 \n\t"
1904 "paddd %%mm3, %%mm1 \n\t"
1905 "paddd %%mm5, %%mm4 \n\t"
1907 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1908 "paddd %%mm3, %%mm0 \n\t"
1909 "paddd %%mm3, %%mm2 \n\t"
1910 "paddd %%mm3, %%mm1 \n\t"
1911 "paddd %%mm3, %%mm4 \n\t"
1912 "psrad $15, %%mm0 \n\t"
1913 "psrad $15, %%mm2 \n\t"
1914 "psrad $15, %%mm1 \n\t"
1915 "psrad $15, %%mm4 \n\t"
1916 "packssdw %%mm1, %%mm0 \n\t"
1917 "packssdw %%mm4, %%mm2 \n\t"
1918 "packuswb %%mm0, %%mm0 \n\t"
1919 "packuswb %%mm2, %%mm2 \n\t"
1920 "movd %%mm0, (%1, %%"REG_a") \n\t"
1921 "movd %%mm2, (%2, %%"REG_a") \n\t"
1922 "add $4, %%"REG_a" \n\t"
1923 " js 1b \n\t"
1924 : "+r" (src)
1925 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1926 : "%"REG_a
1929 #endif
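/* A note on operand addressing in bgr24ToUV_mmx above: "%4" is a single
 * "m" operand naming the first qword of
 * ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0]; the 8+%4, 16+%4 and 24+%4
 * forms (the last preloaded into mm6) reach the following qwords of that
 * table, which works only because the table is contiguous. The U sums
 * travel through mm0/mm1 and the V sums through mm2/mm4, with
 * ff_bgr24toUVOffset adding the +128 chroma bias before the narrowing
 * chain, analogous to the luma routine. */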
1931 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1933 #if HAVE_MMX
1934 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1935 #else
1936 int i;
1937 for (i=0; i<width; i++)
1939 int b= src[i*3+0];
1940 int g= src[i*3+1];
1941 int r= src[i*3+2];
1943 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1945 #endif /* HAVE_MMX */
1948 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1950 #if HAVE_MMX
1951 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1952 #else
1953 int i;
1954 for (i=0; i<width; i++)
1956 int b= src1[3*i + 0];
1957 int g= src1[3*i + 1];
1958 int r= src1[3*i + 2];
1960 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1961 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1963 #endif /* HAVE_MMX */
1964 assert(src1 == src2);
1967 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1969 int i;
1970 for (i=0; i<width; i++)
1972 int b= src1[6*i + 0] + src1[6*i + 3];
1973 int g= src1[6*i + 1] + src1[6*i + 4];
1974 int r= src1[6*i + 2] + src1[6*i + 5];
1976 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1977 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1979 assert(src1 == src2);
1982 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1984 #if HAVE_MMX
1985 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1986 #else
1987 int i;
1988 for (i=0; i<width; i++)
1990 int r= src[i*3+0];
1991 int g= src[i*3+1];
1992 int b= src[i*3+2];
1994 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1996 #endif
1999 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2001 #if HAVE_MMX
2002 assert(src1==src2);
2003 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2004 #else
2005 int i;
2006 assert(src1==src2);
2007 for (i=0; i<width; i++)
2009 int r= src1[3*i + 0];
2010 int g= src1[3*i + 1];
2011 int b= src1[3*i + 2];
2013 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2014 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2016 #endif
2019 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2021 int i;
2022 assert(src1==src2);
2023 for (i=0; i<width; i++)
2025 int r= src1[6*i + 0] + src1[6*i + 3];
2026 int g= src1[6*i + 1] + src1[6*i + 4];
2027 int b= src1[6*i + 2] + src1[6*i + 5];
2029 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2030 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2035 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
2037 int i;
2038 for (i=0; i<width; i++)
2040 int d= src[i];
2042 dst[i]= pal[d] & 0xFF;
2046 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
2048 int i;
2049 assert(src1 == src2);
2050 for (i=0; i<width; i++)
2052 int p= pal[src1[i]];
2054 dstU[i]= p>>8;
2055 dstV[i]= p>>16;
2059 static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2061 int i, j;
2062 for (i=0; i<width/8; i++){
2063 int d= ~src[i];
2064 for(j=0; j<8; j++)
2065 dst[8*i+j]= ((d>>(7-j))&1)*255;
2069 static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2071 int i, j;
2072 for (i=0; i<width/8; i++){
2073 int d= src[i];
2074 for(j=0; j<8; j++)
2075 dst[8*i+j]= ((d>>(7-j))&1)*255;
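/* Bit-expansion sketch for the two mono routines above: each source byte
 * packs 8 pixels, MSB first, and ((d>>(7-j))&1)*255 maps bit j to 0 or
 * 255. monowhite2Y complements the byte first (d= ~src[i]) because in
 * MONOWHITE a 0 bit means white. Note that widths not divisible by 8
 * leave the tail pixels unwritten here. */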
2079 // bilinear / bicubic scaling
2080 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2081 int16_t *filter, int16_t *filterPos, long filterSize)
2083 #if HAVE_MMX
2084 assert(filterSize % 4 == 0 && filterSize>0);
2085 if (filterSize==4) // Always true for upscaling, sometimes also for downscaling.
2087 x86_reg counter= -2*dstW;
2088 filter-= counter*2;
2089 filterPos-= counter/2;
2090 dst-= counter/2;
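/* Loop-control idiom shared by the MMX branches of hScale: counter starts
 * at -2*dstW and filter/filterPos/dst are biased by the matching amounts,
 * so base+counter addressing begins at element 0 while "add $4 ... jnc 1b"
 * loops until the counter crosses zero - the carry flag of the add doubles
 * as the exit condition, sparing an explicit cmp. */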
2091 __asm__ volatile(
2092 #if defined(PIC)
2093 "push %%"REG_b" \n\t"
2094 #endif
2095 "pxor %%mm7, %%mm7 \n\t"
2096 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2097 "mov %%"REG_a", %%"REG_BP" \n\t"
2098 ASMALIGN(4)
2099 "1: \n\t"
2100 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2101 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2102 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2103 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2104 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2105 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2106 "punpcklbw %%mm7, %%mm0 \n\t"
2107 "punpcklbw %%mm7, %%mm2 \n\t"
2108 "pmaddwd %%mm1, %%mm0 \n\t"
2109 "pmaddwd %%mm2, %%mm3 \n\t"
2110 "movq %%mm0, %%mm4 \n\t"
2111 "punpckldq %%mm3, %%mm0 \n\t"
2112 "punpckhdq %%mm3, %%mm4 \n\t"
2113 "paddd %%mm4, %%mm0 \n\t"
2114 "psrad $7, %%mm0 \n\t"
2115 "packssdw %%mm0, %%mm0 \n\t"
2116 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2117 "add $4, %%"REG_BP" \n\t"
2118 " jnc 1b \n\t"
2120 "pop %%"REG_BP" \n\t"
2121 #if defined(PIC)
2122 "pop %%"REG_b" \n\t"
2123 #endif
2124 : "+a" (counter)
2125 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2126 #if !defined(PIC)
2127 : "%"REG_b
2128 #endif
2131 else if (filterSize==8)
2133 x86_reg counter= -2*dstW;
2134 filter-= counter*4;
2135 filterPos-= counter/2;
2136 dst-= counter/2;
2137 __asm__ volatile(
2138 #if defined(PIC)
2139 "push %%"REG_b" \n\t"
2140 #endif
2141 "pxor %%mm7, %%mm7 \n\t"
2142 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2143 "mov %%"REG_a", %%"REG_BP" \n\t"
2144 ASMALIGN(4)
2145 "1: \n\t"
2146 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2147 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2148 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2149 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2150 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2151 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2152 "punpcklbw %%mm7, %%mm0 \n\t"
2153 "punpcklbw %%mm7, %%mm2 \n\t"
2154 "pmaddwd %%mm1, %%mm0 \n\t"
2155 "pmaddwd %%mm2, %%mm3 \n\t"
2157 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2158 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2159 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2160 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2161 "punpcklbw %%mm7, %%mm4 \n\t"
2162 "punpcklbw %%mm7, %%mm2 \n\t"
2163 "pmaddwd %%mm1, %%mm4 \n\t"
2164 "pmaddwd %%mm2, %%mm5 \n\t"
2165 "paddd %%mm4, %%mm0 \n\t"
2166 "paddd %%mm5, %%mm3 \n\t"
2167 "movq %%mm0, %%mm4 \n\t"
2168 "punpckldq %%mm3, %%mm0 \n\t"
2169 "punpckhdq %%mm3, %%mm4 \n\t"
2170 "paddd %%mm4, %%mm0 \n\t"
2171 "psrad $7, %%mm0 \n\t"
2172 "packssdw %%mm0, %%mm0 \n\t"
2173 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2174 "add $4, %%"REG_BP" \n\t"
2175 " jnc 1b \n\t"
2177 "pop %%"REG_BP" \n\t"
2178 #if defined(PIC)
2179 "pop %%"REG_b" \n\t"
2180 #endif
2181 : "+a" (counter)
2182 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2183 #if !defined(PIC)
2184 : "%"REG_b
2185 #endif
2188 else
2190 uint8_t *offset = src+filterSize;
2191 x86_reg counter= -2*dstW;
2192 //filter-= counter*filterSize/2;
2193 filterPos-= counter/2;
2194 dst-= counter/2;
2195 __asm__ volatile(
2196 "pxor %%mm7, %%mm7 \n\t"
2197 ASMALIGN(4)
2198 "1: \n\t"
2199 "mov %2, %%"REG_c" \n\t"
2200 "movzwl (%%"REG_c", %0), %%eax \n\t"
2201 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2202 "mov %5, %%"REG_c" \n\t"
2203 "pxor %%mm4, %%mm4 \n\t"
2204 "pxor %%mm5, %%mm5 \n\t"
2205 "2: \n\t"
2206 "movq (%1), %%mm1 \n\t"
2207 "movq (%1, %6), %%mm3 \n\t"
2208 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2209 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2210 "punpcklbw %%mm7, %%mm0 \n\t"
2211 "punpcklbw %%mm7, %%mm2 \n\t"
2212 "pmaddwd %%mm1, %%mm0 \n\t"
2213 "pmaddwd %%mm2, %%mm3 \n\t"
2214 "paddd %%mm3, %%mm5 \n\t"
2215 "paddd %%mm0, %%mm4 \n\t"
2216 "add $8, %1 \n\t"
2217 "add $4, %%"REG_c" \n\t"
2218 "cmp %4, %%"REG_c" \n\t"
2219 " jb 2b \n\t"
2220 "add %6, %1 \n\t"
2221 "movq %%mm4, %%mm0 \n\t"
2222 "punpckldq %%mm5, %%mm4 \n\t"
2223 "punpckhdq %%mm5, %%mm0 \n\t"
2224 "paddd %%mm0, %%mm4 \n\t"
2225 "psrad $7, %%mm4 \n\t"
2226 "packssdw %%mm4, %%mm4 \n\t"
2227 "mov %3, %%"REG_a" \n\t"
2228 "movd %%mm4, (%%"REG_a", %0) \n\t"
2229 "add $4, %0 \n\t"
2230 " jnc 1b \n\t"
2232 : "+r" (counter), "+r" (filter)
2233 : "m" (filterPos), "m" (dst), "m"(offset),
2234 "m" (src), "r" ((x86_reg)filterSize*2)
2235 : "%"REG_a, "%"REG_c, "%"REG_d
2238 #else
2239 #if HAVE_ALTIVEC
2240 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2241 #else
2242 int i;
2243 for (i=0; i<dstW; i++)
2245 int j;
2246 int srcPos= filterPos[i];
2247 int val=0;
2248 //printf("filterPos: %d\n", filterPos[i]);
2249 for (j=0; j<filterSize; j++)
2251 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2252 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2254 //filter += hFilterSize;
2255 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2256 //dst[i] = val>>7;
2258 #endif /* HAVE_ALTIVEC */
2259 #endif /* HAVE_MMX */
2261 // *** horizontal scale Y line to temp buffer
2262 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2263 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2264 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2265 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2266 int32_t *mmx2FilterPos, uint32_t *pal, int isAlpha)
2268 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2270 RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
2271 src= formatConvBuffer;
2273 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2275 RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
2276 src= formatConvBuffer;
2278 else if (srcFormat==PIX_FMT_RGB32)
2280 if (isAlpha)
2281 RENAME(abgrToA)(formatConvBuffer, src+3, srcW, pal);
2282 else
2283 RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
2284 src= formatConvBuffer;
2286 else if (srcFormat==PIX_FMT_RGB32_1)
2288 if (isAlpha)
2289 RENAME(abgrToA)(formatConvBuffer, src, srcW, pal);
2290 else
2291 RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2292 src= formatConvBuffer;
2294 else if (srcFormat==PIX_FMT_BGR24)
2296 RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
2297 src= formatConvBuffer;
2299 else if (srcFormat==PIX_FMT_BGR565)
2301 RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
2302 src= formatConvBuffer;
2304 else if (srcFormat==PIX_FMT_BGR555)
2306 RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
2307 src= formatConvBuffer;
2309 else if (srcFormat==PIX_FMT_BGR32)
2311 if (isAlpha)
2312 RENAME(abgrToA)(formatConvBuffer, src+3, srcW, pal);
2313 else
2314 RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
2315 src= formatConvBuffer;
2317 else if (srcFormat==PIX_FMT_BGR32_1)
2319 if (isAlpha)
2320 RENAME(abgrToA)(formatConvBuffer, src, srcW, pal);
2321 else
2322 RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2323 src= formatConvBuffer;
2325 else if (srcFormat==PIX_FMT_RGB24)
2327 RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
2328 src= formatConvBuffer;
2330 else if (srcFormat==PIX_FMT_RGB565)
2332 RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
2333 src= formatConvBuffer;
2335 else if (srcFormat==PIX_FMT_RGB555)
2337 RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
2338 src= formatConvBuffer;
2340 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2342 RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2343 src= formatConvBuffer;
2345 else if (srcFormat==PIX_FMT_MONOBLACK)
2347 RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
2348 src= formatConvBuffer;
2350 else if (srcFormat==PIX_FMT_MONOWHITE)
2352 RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
2353 src= formatConvBuffer;
2356 #if HAVE_MMX
2357 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2358 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2359 #else
2360 if (!(flags&SWS_FAST_BILINEAR))
2361 #endif
2363 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2365 else // fast bilinear upscale / crap downscale
2367 #if ARCH_X86 && CONFIG_GPL
2368 #if HAVE_MMX2
2369 int i;
2370 #if defined(PIC)
2371 uint64_t ebxsave __attribute__((aligned(8)));
2372 #endif
2373 if (canMMX2BeUsed)
2375 __asm__ volatile(
2376 #if defined(PIC)
2377 "mov %%"REG_b", %5 \n\t"
2378 #endif
2379 "pxor %%mm7, %%mm7 \n\t"
2380 "mov %0, %%"REG_c" \n\t"
2381 "mov %1, %%"REG_D" \n\t"
2382 "mov %2, %%"REG_d" \n\t"
2383 "mov %3, %%"REG_b" \n\t"
2384 "xor %%"REG_a", %%"REG_a" \n\t" // i
2385 PREFETCH" (%%"REG_c") \n\t"
2386 PREFETCH" 32(%%"REG_c") \n\t"
2387 PREFETCH" 64(%%"REG_c") \n\t"
2389 #if ARCH_X86_64
2391 #define FUNNY_Y_CODE \
2392 "movl (%%"REG_b"), %%esi \n\t"\
2393 "call *%4 \n\t"\
2394 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2395 "add %%"REG_S", %%"REG_c" \n\t"\
2396 "add %%"REG_a", %%"REG_D" \n\t"\
2397 "xor %%"REG_a", %%"REG_a" \n\t"\
2399 #else
2401 #define FUNNY_Y_CODE \
2402 "movl (%%"REG_b"), %%esi \n\t"\
2403 "call *%4 \n\t"\
2404 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2405 "add %%"REG_a", %%"REG_D" \n\t"\
2406 "xor %%"REG_a", %%"REG_a" \n\t"\
2408 #endif /* ARCH_X86_64 */
2410 FUNNY_Y_CODE
2411 FUNNY_Y_CODE
2412 FUNNY_Y_CODE
2413 FUNNY_Y_CODE
2414 FUNNY_Y_CODE
2415 FUNNY_Y_CODE
2416 FUNNY_Y_CODE
2417 FUNNY_Y_CODE
2419 #if defined(PIC)
2420 "mov %5, %%"REG_b" \n\t"
2421 #endif
2422 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2423 "m" (funnyYCode)
2424 #if defined(PIC)
2425 ,"m" (ebxsave)
2426 #endif
2427 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2428 #if !defined(PIC)
2429 ,"%"REG_b
2430 #endif
2432 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
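/* Two remarks on the block above (sketch): funnyYCode points at scaler
 * code that swscale generates at run time and enters via "call *%4",
 * consuming mmx2Filter / mmx2FilterPos as packed coefficients and
 * per-chunk source offsets. And since this fast path can read src[xx+1]
 * one sample past the end, the for-loop afterwards rewrites every dst[i]
 * whose source position maps at or beyond srcW-1 with the replicated edge
 * value src[srcW-1]*128 (the <<7 scale of the intermediate buffer). */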
2434 else
2436 #endif /* HAVE_MMX2 */
2437 x86_reg xInc_shr16 = xInc >> 16;
2438 uint16_t xInc_mask = xInc & 0xffff;
2439 //NO MMX just normal asm ...
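/* The loop below is a 16.16 fixed-point DDA split across two registers:
 * %%cx accumulates the fractional step (xInc_mask = xInc & 0xffff) and
 * REG_d holds the integer position xx. "addw %4, %%cx" may carry out of
 * 16 bits, and the following "adc %3, REG_d" folds that carry into the
 * integer advance, so together they perform xpos += xInc. %%esi
 * meanwhile builds src[xx]<<16 plus (src[xx+1]-src[xx])*2*xalpha and
 * shifts down by 9, leaving the 15-bit (Y<<7) intermediate format. */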
2440 __asm__ volatile(
2441 "xor %%"REG_a", %%"REG_a" \n\t" // i
2442 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2443 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2444 ASMALIGN(4)
2445 "1: \n\t"
2446 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2447 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2448 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2449 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2450 "shll $16, %%edi \n\t"
2451 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2452 "mov %1, %%"REG_D" \n\t"
2453 "shrl $9, %%esi \n\t"
2454 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2455 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2456 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2458 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2459 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2460 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2461 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2462 "shll $16, %%edi \n\t"
2463 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2464 "mov %1, %%"REG_D" \n\t"
2465 "shrl $9, %%esi \n\t"
2466 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2467 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2468 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2471 "add $2, %%"REG_a" \n\t"
2472 "cmp %2, %%"REG_a" \n\t"
2473 " jb 1b \n\t"
2476 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2477 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2479 #if HAVE_MMX2
2480 } //if MMX2 can't be used
2481 #endif
2482 #else
2483 int i;
2484 unsigned int xpos=0;
2485 for (i=0;i<dstWidth;i++)
2487 register unsigned int xx=xpos>>16;
2488 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2489 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2490 xpos+=xInc;
2492 #endif /* ARCH_X86 */
2495 if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2496 int i;
2497 //FIXME all pal and rgb srcFormats could do this conversion as well
2498 //FIXME all scalers more complex than bilinear could do half of this transform
2499 if(c->srcRange){
2500 for (i=0; i<dstWidth; i++)
2501 dst[i]= (dst[i]*14071 + 33561947)>>14;
2502 }else{
2503 for (i=0; i<dstWidth; i++)
2504 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
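/* Rough derivation of the constants above (approximate, editorial):
 * luma is carried as Y<<7 here. Full-to-limited range is
 * y' = y*219/255 + 16; 14071/2^14 ~= 219/255 and 33561947 ~= ((16<<7)<<14)
 * plus rounding. The inverse direction uses 19077/2^14 ~= 255/219 with the
 * offset folded into -39057361, and FFMIN(dst[i],30189) clamps the input
 * so the expanded result still fits the signed 16-bit intermediate. */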
2509 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2510 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2511 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2512 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2513 int32_t *mmx2FilterPos, uint32_t *pal)
2515 if (srcFormat==PIX_FMT_YUYV422)
2517 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2518 src1= formatConvBuffer;
2519 src2= formatConvBuffer+VOFW;
2521 else if (srcFormat==PIX_FMT_UYVY422)
2523 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2524 src1= formatConvBuffer;
2525 src2= formatConvBuffer+VOFW;
2527 else if (srcFormat==PIX_FMT_RGB32)
2529 if(c->chrSrcHSubSample)
2530 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2531 else
2532 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2533 src1= formatConvBuffer;
2534 src2= formatConvBuffer+VOFW;
2536 else if (srcFormat==PIX_FMT_RGB32_1)
2538 if(c->chrSrcHSubSample)
2539 RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2540 else
2541 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2542 src1= formatConvBuffer;
2543 src2= formatConvBuffer+VOFW;
2545 else if (srcFormat==PIX_FMT_BGR24)
2547 if(c->chrSrcHSubSample)
2548 RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2549 else
2550 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2551 src1= formatConvBuffer;
2552 src2= formatConvBuffer+VOFW;
2554 else if (srcFormat==PIX_FMT_BGR565)
2556 if(c->chrSrcHSubSample)
2557 RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2558 else
2559 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2560 src1= formatConvBuffer;
2561 src2= formatConvBuffer+VOFW;
2563 else if (srcFormat==PIX_FMT_BGR555)
2565 if(c->chrSrcHSubSample)
2566 RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2567 else
2568 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2569 src1= formatConvBuffer;
2570 src2= formatConvBuffer+VOFW;
2572 else if (srcFormat==PIX_FMT_BGR32)
2574 if(c->chrSrcHSubSample)
2575 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2576 else
2577 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2578 src1= formatConvBuffer;
2579 src2= formatConvBuffer+VOFW;
2581 else if (srcFormat==PIX_FMT_BGR32_1)
2583 if(c->chrSrcHSubSample)
2584 RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2585 else
2586 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2587 src1= formatConvBuffer;
2588 src2= formatConvBuffer+VOFW;
2590 else if (srcFormat==PIX_FMT_RGB24)
2592 if(c->chrSrcHSubSample)
2593 RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2594 else
2595 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2596 src1= formatConvBuffer;
2597 src2= formatConvBuffer+VOFW;
2599 else if (srcFormat==PIX_FMT_RGB565)
2601 if(c->chrSrcHSubSample)
2602 RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2603 else
2604 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2605 src1= formatConvBuffer;
2606 src2= formatConvBuffer+VOFW;
2608 else if (srcFormat==PIX_FMT_RGB555)
2610 if(c->chrSrcHSubSample)
2611 RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2612 else
2613 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2614 src1= formatConvBuffer;
2615 src2= formatConvBuffer+VOFW;
2617 else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2619 return;
2621 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2623 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2624 src1= formatConvBuffer;
2625 src2= formatConvBuffer+VOFW;
2628 #if HAVE_MMX
2629 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2630 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2631 #else
2632 if (!(flags&SWS_FAST_BILINEAR))
2633 #endif
2635 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2636 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2638 else // fast bilinear upscale / crap downscale
2640 #if ARCH_X86 && CONFIG_GPL
2641 #if HAVE_MMX2
2642 int i;
2643 #if defined(PIC)
2644 uint64_t ebxsave __attribute__((aligned(8)));
2645 #endif
2646 if (canMMX2BeUsed)
2648 __asm__ volatile(
2649 #if defined(PIC)
2650 "mov %%"REG_b", %6 \n\t"
2651 #endif
2652 "pxor %%mm7, %%mm7 \n\t"
2653 "mov %0, %%"REG_c" \n\t"
2654 "mov %1, %%"REG_D" \n\t"
2655 "mov %2, %%"REG_d" \n\t"
2656 "mov %3, %%"REG_b" \n\t"
2657 "xor %%"REG_a", %%"REG_a" \n\t" // i
2658 PREFETCH" (%%"REG_c") \n\t"
2659 PREFETCH" 32(%%"REG_c") \n\t"
2660 PREFETCH" 64(%%"REG_c") \n\t"
2662 #if ARCH_X86_64
2664 #define FUNNY_UV_CODE \
2665 "movl (%%"REG_b"), %%esi \n\t"\
2666 "call *%4 \n\t"\
2667 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2668 "add %%"REG_S", %%"REG_c" \n\t"\
2669 "add %%"REG_a", %%"REG_D" \n\t"\
2670 "xor %%"REG_a", %%"REG_a" \n\t"\
2672 #else
2674 #define FUNNY_UV_CODE \
2675 "movl (%%"REG_b"), %%esi \n\t"\
2676 "call *%4 \n\t"\
2677 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2678 "add %%"REG_a", %%"REG_D" \n\t"\
2679 "xor %%"REG_a", %%"REG_a" \n\t"\
2681 #endif /* ARCH_X86_64 */
2683 FUNNY_UV_CODE
2684 FUNNY_UV_CODE
2685 FUNNY_UV_CODE
2686 FUNNY_UV_CODE
2687 "xor %%"REG_a", %%"REG_a" \n\t" // i
2688 "mov %5, %%"REG_c" \n\t" // src
2689 "mov %1, %%"REG_D" \n\t" // buf1
2690 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2691 PREFETCH" (%%"REG_c") \n\t"
2692 PREFETCH" 32(%%"REG_c") \n\t"
2693 PREFETCH" 64(%%"REG_c") \n\t"
2695 FUNNY_UV_CODE
2696 FUNNY_UV_CODE
2697 FUNNY_UV_CODE
2698 FUNNY_UV_CODE
2700 #if defined(PIC)
2701 "mov %6, %%"REG_b" \n\t"
2702 #endif
2703 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2704 "m" (funnyUVCode), "m" (src2)
2705 #if defined(PIC)
2706 ,"m" (ebxsave)
2707 #endif
2708 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2709 #if !defined(PIC)
2710 ,"%"REG_b
2711 #endif
2713 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2715 //printf("%d %d %d\n", dstWidth, i, srcW);
2716 dst[i] = src1[srcW-1]*128;
2717 dst[i+VOFW] = src2[srcW-1]*128;
2720 else
2722 #endif /* HAVE_MMX2 */
2723 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2724 uint16_t xInc_mask = xInc & 0xffff;
2725 __asm__ volatile(
2726 "xor %%"REG_a", %%"REG_a" \n\t" // i
2727 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2728 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2729 ASMALIGN(4)
2730 "1: \n\t"
2731 "mov %0, %%"REG_S" \n\t"
2732 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2733 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2734 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2735 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2736 "shll $16, %%edi \n\t"
2737 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2738 "mov %1, %%"REG_D" \n\t"
2739 "shrl $9, %%esi \n\t"
2740 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2742 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2743 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2744 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2745 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2746 "shll $16, %%edi \n\t"
2747 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2748 "mov %1, %%"REG_D" \n\t"
2749 "shrl $9, %%esi \n\t"
2750 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2752 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2753 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2754 "add $1, %%"REG_a" \n\t"
2755 "cmp %2, %%"REG_a" \n\t"
2756 " jb 1b \n\t"
2758 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2759 which is needed to support GCC 4.0. */
2760 #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2761 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2762 #else
2763 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2764 #endif
2765 "r" (src2)
2766 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2768 #if HAVE_MMX2
2769 } //if MMX2 can't be used
2770 #endif
2771 #else
2772 int i;
2773 unsigned int xpos=0;
2774 for (i=0;i<dstWidth;i++)
2776 register unsigned int xx=xpos>>16;
2777 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2778 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2779 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2780 /* slower
2781 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2782 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2784 xpos+=xInc;
2786 #endif /* ARCH_X86 */
2788 if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2789 int i;
2790 //FIXME all pal and rgb srcFormats could do this conversion as well
2791 //FIXME all scalers more complex than bilinear could do half of this transform
2792 if(c->srcRange){
2793 for (i=0; i<dstWidth; i++){
2794 dst[i ]= (dst[i ]*1799 + 4081085)>>11; //1469
2795 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2797 }else{
2798 for (i=0; i<dstWidth; i++){
2799 dst[i ]= (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2800 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
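/* Same scheme as the luma range conversion in hyscale, editorially
 * annotated: 1799/2^11 ~= 224/255 and 4663/2^12 ~= 255/224, with the
 * additive constants placing the pivot at the 128<<7 chroma midpoint;
 * the FFMIN clamp again protects the signed 16-bit intermediate. */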
2806 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2807 int srcSliceH, uint8_t* dst[], int dstStride[]){
2809 /* load a few things into local vars to make the code more readable and faster */
2810 const int srcW= c->srcW;
2811 const int dstW= c->dstW;
2812 const int dstH= c->dstH;
2813 const int chrDstW= c->chrDstW;
2814 const int chrSrcW= c->chrSrcW;
2815 const int lumXInc= c->lumXInc;
2816 const int chrXInc= c->chrXInc;
2817 const int dstFormat= c->dstFormat;
2818 const int srcFormat= c->srcFormat;
2819 const int flags= c->flags;
2820 const int canMMX2BeUsed= c->canMMX2BeUsed;
2821 int16_t *vLumFilterPos= c->vLumFilterPos;
2822 int16_t *vChrFilterPos= c->vChrFilterPos;
2823 int16_t *hLumFilterPos= c->hLumFilterPos;
2824 int16_t *hChrFilterPos= c->hChrFilterPos;
2825 int16_t *vLumFilter= c->vLumFilter;
2826 int16_t *vChrFilter= c->vChrFilter;
2827 int16_t *hLumFilter= c->hLumFilter;
2828 int16_t *hChrFilter= c->hChrFilter;
2829 int32_t *lumMmxFilter= c->lumMmxFilter;
2830 int32_t *chrMmxFilter= c->chrMmxFilter;
2831 int32_t *alpMmxFilter= c->alpMmxFilter;
2832 const int vLumFilterSize= c->vLumFilterSize;
2833 const int vChrFilterSize= c->vChrFilterSize;
2834 const int hLumFilterSize= c->hLumFilterSize;
2835 const int hChrFilterSize= c->hChrFilterSize;
2836 int16_t **lumPixBuf= c->lumPixBuf;
2837 int16_t **chrPixBuf= c->chrPixBuf;
2838 int16_t **alpPixBuf= c->alpPixBuf;
2839 const int vLumBufSize= c->vLumBufSize;
2840 const int vChrBufSize= c->vChrBufSize;
2841 uint8_t *funnyYCode= c->funnyYCode;
2842 uint8_t *funnyUVCode= c->funnyUVCode;
2843 uint8_t *formatConvBuffer= c->formatConvBuffer;
2844 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2845 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
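/* -((-x) >> n) computes ceil(x / 2^n): the chroma slice height must round
 * up, otherwise the last chroma line of an odd-height slice would be
 * dropped (e.g. srcSliceH=5 with 2x vertical subsampling needs 3 lines). */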
2846 int lastDstY;
2847 uint32_t *pal=c->pal_yuv;
2849 /* vars which will change and which we need to store back in the context */
2850 int dstY= c->dstY;
2851 int lumBufIndex= c->lumBufIndex;
2852 int chrBufIndex= c->chrBufIndex;
2853 int lastInLumBuf= c->lastInLumBuf;
2854 int lastInChrBuf= c->lastInChrBuf;
2856 if (isPacked(c->srcFormat)){
2857 src[0]=
2858 src[1]=
2859 src[2]=
2860 src[3]= src[0];
2861 srcStride[0]=
2862 srcStride[1]=
2863 srcStride[2]=
2864 srcStride[3]= srcStride[0];
2866 srcStride[1]<<= c->vChrDrop;
2867 srcStride[2]<<= c->vChrDrop;
2869 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2870 // (int)dst[0], (int)dst[1], (int)dst[2]);
2872 #if 0 //self test FIXME move to a vfilter or something
2874 static volatile int i=0;
2875 i++;
2876 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2877 selfTest(src, srcStride, c->srcW, c->srcH);
2878 i--;
2880 #endif
2882 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2883 //dstStride[0],dstStride[1],dstStride[2]);
2885 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0)
2887 static int warnedAlready=0; //FIXME move this into the context perhaps
2888 if (flags & SWS_PRINT_INFO && !warnedAlready)
2890 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2891 " ->cannot do aligned memory accesses anymore\n");
2892 warnedAlready=1;
2896 /* Note: the user might start scaling in the middle of the picture, so this
2897 will not always get executed. This is not really intended, but it currently
2898 works, so people might do it. */
2899 if (srcSliceY ==0){
2900 lumBufIndex=0;
2901 chrBufIndex=0;
2902 dstY=0;
2903 lastInLumBuf= -1;
2904 lastInChrBuf= -1;
2907 lastDstY= dstY;
2909 for (;dstY < dstH; dstY++){
2910 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2911 const int chrDstY= dstY>>c->chrDstVSubSample;
2912 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2913 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2914 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2916 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2917 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2918 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2919 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2921 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2922 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2923 //handle holes (FAST_BILINEAR & weird filters)
2924 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2925 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2926 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2927 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2928 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2930 // Do we have enough lines in this slice to output the dstY line?
2931 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2933 //Do horizontal scaling
2934 while(lastInLumBuf < lastLumSrcY)
2936 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2937 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2938 lumBufIndex++;
2939 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2940 assert(lumBufIndex < 2*vLumBufSize);
2941 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2942 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2943 //printf("%d %d\n", lumBufIndex, vLumBufSize);
2944 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2945 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2946 funnyYCode, c->srcFormat, formatConvBuffer,
2947 c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 0);
2948 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2949 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2950 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2951 funnyYCode, c->srcFormat, formatConvBuffer,
2952 c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 1);
2953 lastInLumBuf++;
2955 while(lastInChrBuf < lastChrSrcY)
2957 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2958 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2959 chrBufIndex++;
2960 assert(chrBufIndex < 2*vChrBufSize);
2961 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2962 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2963 //FIXME replace parameters through context struct (some at least)
2965 if (!(isGray(srcFormat) || isGray(dstFormat)))
2966 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2967 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2968 funnyUVCode, c->srcFormat, formatConvBuffer,
2969 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
2970 lastInChrBuf++;
2972 //wrap buf index around to stay inside the ring buffer
2973 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2974 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2976 else // not enough lines left in this slice -> load the rest in the buffer
2978 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2979 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2980 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2981 vChrBufSize, vLumBufSize);*/
2983 //Do horizontal scaling
2984 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2986 uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2987 uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2988 lumBufIndex++;
2989 assert(lumBufIndex < 2*vLumBufSize);
2990 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2991 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2992 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2993 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2994 funnyYCode, c->srcFormat, formatConvBuffer,
2995 c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 0);
2996 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2997 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2998 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2999 funnyYCode, c->srcFormat, formatConvBuffer,
3000 c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 1);
3001 lastInLumBuf++;
3003 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3005 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3006 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3007 chrBufIndex++;
3008 assert(chrBufIndex < 2*vChrBufSize);
3009 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3010 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3012 if (!(isGray(srcFormat) || isGray(dstFormat)))
3013 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3014 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3015 funnyUVCode, c->srcFormat, formatConvBuffer,
3016 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3017 lastInChrBuf++;
3019 //wrap buf index around to stay inside the ring buffer
3020 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3021 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3022 break; //we can't output a dstY line so let's try with the next slice
3025 #if HAVE_MMX
3026 c->blueDither= ff_dither8[dstY&1];
3027 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
3028 c->greenDither= ff_dither8[dstY&1];
3029 else
3030 c->greenDither= ff_dither4[dstY&1];
3031 c->redDither= ff_dither8[(dstY+1)&1];
3032 #endif
3033 if (dstY < dstH-2)
3035 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3036 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3037 int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
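/* Ring-buffer indexing sketch: lumPixBuf (and friends) hold 2*vLumBufSize
 * entries with the second half mirroring the first, so the offset above,
 * bufIndex + firstSrcY - lastInBuf + vBufSize, always lands inside
 * [0, 2*vBufSize) without an explicit wrap test; the asserts further
 * down check exactly this invariant. */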
3038 #if HAVE_MMX
3039 int i;
3040 if (flags & SWS_ACCURATE_RND){
3041 int s= APCK_SIZE / 8;
3042 for (i=0; i<vLumFilterSize; i+=2){
3043 *(void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
3044 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
3045 lumMmxFilter[s*i+APCK_COEF/4 ]=
3046 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
3047 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3048 if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
3049 *(void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
3050 *(void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
3051 alpMmxFilter[s*i+APCK_COEF/4 ]=
3052 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
3055 for (i=0; i<vChrFilterSize; i+=2){
3056 *(void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
3057 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
3058 chrMmxFilter[s*i+APCK_COEF/4 ]=
3059 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
3060 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3062 }else{
3063 for (i=0; i<vLumFilterSize; i++)
3065 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3066 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3067 lumMmxFilter[4*i+2]=
3068 lumMmxFilter[4*i+3]=
3069 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3070 if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
3071 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
3072 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
3073 alpMmxFilter[4*i+2]=
3074 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
3077 for (i=0; i<vChrFilterSize; i++)
3079 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3080 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3081 chrMmxFilter[4*i+2]=
3082 chrMmxFilter[4*i+3]=
3083 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3086 #endif
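/* The two packings above feed the vertical-scale asm (sketch; APCK_*
 * semantics inferred from the names): in SWS_ACCURATE_RND mode each
 * APCK_SIZE cell carries two source-line pointers plus a dword holding
 * two adjacent 16-bit coefficients (coeff[i] | coeff[i+1]<<16) so the
 * inner loop can pmaddwd line pairs; the plain layout splits one pointer
 * across [4*i+0..1] (low/high 32 bits, safe on x86_64) and replicates the
 * coefficient into both halves of a dword via *0x10001. */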
3087 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3088 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3089 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3090 RENAME(yuv2nv12X)(c,
3091 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3092 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3093 dest, uDest, dstW, chrDstW, dstFormat);
3095 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
3097 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3098 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3099 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3101 int16_t *lumBuf = lumPixBuf[0];
3102 int16_t *chrBuf= chrPixBuf[0];
3103 int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf[0] : NULL;
3104 RENAME(yuv2yuv1)(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
3106 else //General YV12
3108 RENAME(yuv2yuvX)(c,
3109 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3110 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3111 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
3114 else
3116 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3117 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3118 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3120 int chrAlpha= vChrFilter[2*dstY+1];
3121 if(flags & SWS_FULL_CHR_H_INT){
3122 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
3123 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3124 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3125 alpSrcPtr, dest, dstW, dstY);
3126 }else{
3127 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3128 alpPixBuf ? *alpSrcPtr : NULL,
3129 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3132 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3134 int lumAlpha= vLumFilter[2*dstY+1];
3135 int chrAlpha= vChrFilter[2*dstY+1];
3136 lumMmxFilter[2]=
3137 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3138 chrMmxFilter[2]=
3139 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3140 if(flags & SWS_FULL_CHR_H_INT){
3141 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
3142 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3143 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3144 alpSrcPtr, dest, dstW, dstY);
3145 }else{
3146 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3147 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
3148 dest, dstW, lumAlpha, chrAlpha, dstY);
3151 else //general RGB
3153 if(flags & SWS_FULL_CHR_H_INT){
3154 yuv2rgbXinC_full(c,
3155 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3156 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3157 alpSrcPtr, dest, dstW, dstY);
3158 }else{
3159 RENAME(yuv2packedX)(c,
3160 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3161 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3162 alpSrcPtr, dest, dstW, dstY);
3167 else // hmm looks like we can't use MMX here without overwriting this array's tail
3169 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3170 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3171 int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
3172 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3173 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3174 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3175 yuv2nv12XinC(
3176 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3177 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3178 dest, uDest, dstW, chrDstW, dstFormat);
3180 else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
3182 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3183 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3184 yuv2yuvXinC(
3185 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3186 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3187 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
3189 else
3191 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3192 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3193 if(flags & SWS_FULL_CHR_H_INT){
3194 yuv2rgbXinC_full(c,
3195 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3196 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3197 alpSrcPtr, dest, dstW, dstY);
3198 }else{
3199 yuv2packedXinC(c,
3200 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3201 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3202 alpSrcPtr, dest, dstW, dstY);
3208 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
3209 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
3211 #if HAVE_MMX
3212 __asm__ volatile(SFENCE:::"memory");
3213 __asm__ volatile(EMMS:::"memory");
3214 #endif
3215 /* store changed local vars back in the context */
3216 c->dstY= dstY;
3217 c->lumBufIndex= lumBufIndex;
3218 c->chrBufIndex= chrBufIndex;
3219 c->lastInLumBuf= lastInLumBuf;
3220 c->lastInChrBuf= lastInChrBuf;
3222 return dstY - lastDstY;