/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#if HAVE_AMD3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH " # nop"
#define PREFETCHW " # nop"
#endif

#if HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

#if HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif HAVE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#if HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
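
/* The REAL_/plain pair is the usual two-level stringification trick:
 * MOVNTQ's arguments are macro-expanded first, then REAL_MOVNTQ turns the
 * expanded tokens into strings with #a/#b. A single-level macro would
 * stringify the unexpanded argument tokens instead. */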
#if HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) \
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
#define YSCALEYUV2PACKEDX \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm1 \n\t"\
    "paddw %%mm5, %%mm7 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\
#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );
#define YSCALEYUV2PACKEDX_ACCURATE \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
#define REAL_YSCALEYUV2RGB(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0, (dst, index, 4))\
    MOVNTQ(%%mm2, 8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (dst))\
    MOVNTQ(%%mm2, 8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#if HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
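
/* The dispatch above resolves at compile time: the MMX2 writer relies on
 * pshufw, which plain MMX does not provide, so pre-MMX2 builds fall back to
 * the shift-and-or variant. */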
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            if (uDest){
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
            if (uDest){
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if HAVE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
}
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
    int i;
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        long p= uDest ? 3 : 1;
        uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[3]= {dest, uDest, vDest};
        long counter[3] = {dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND){
            while(p--){
                __asm__ volatile(
                    YSCALEYUV2YV121_ACCURATE
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                       "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }else{
            while(p--){
                __asm__ volatile(
                    YSCALEYUV2YV121
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                       "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;

        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
}
/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, long dstW, long dstY)
{
#if HAVE_MMX
    long dummy=0;
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            switch(c->dstFormat){
            case PIX_FMT_RGB32:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                WRITEBGR32(%4, %5, %%REGa)

                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX_ACCURATE
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }else{
            switch(c->dstFormat)
            {
            case PIX_FMT_RGB32:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                WRITEBGR32(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)
                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }
    }
#endif /* HAVE_MMX */
#if HAVE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of altivec_yuv2packedX() */
    if (!(c->flags & SWS_BITEXACT) &&
        (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
         c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
        altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                             chrFilter, chrSrc, chrFilterSize,
                             dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       dest, dstW, dstY);
}
/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int  yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        switch(c->dstFormat)
        {
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        case PIX_FMT_RGB32:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        default: break;
        }
    }
#endif //HAVE_MMX
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}
/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT)
    {
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
        return;
    }

#if HAVE_MMX
    if(!(flags & SWS_BITEXACT)){
        if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        {
            switch(dstFormat)
            {
            case PIX_FMT_RGB32:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            }
        }
        else
        {
            switch(dstFormat)
            {
            case PIX_FMT_RGB32:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* HAVE_MMX */
    if (uvalpha < 2048)
    {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }else{
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
//FIXME yuy2* can read up to 7 samples too much

static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]>>shb)&maskb;\
        int g= (((type*)src)[i]>>shg)&maskg;\
        int r= (((type*)src)[i]>>shr)&maskr;\
\
        dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
    }\
}

BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
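
/* For instance, bgr32ToY above expands to a reader that extracts the three
 * channels of each 32-bit pixel with the given shifts and masks and forms
 * the weighted sum Y = (RY*r + GY*g + BY*b + rounding) >> S -- the same
 * arithmetic the scalar bgr24ToY() further down applies to packed 24-bit
 * input. */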
#define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]&maskb)>>shb;\
        int g= (((type*)src)[i]&maskg)>>shg;\
        int r= (((type*)src)[i]&maskr)>>shr;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
    }\
}\
static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int pix0= ((type*)src)[2*i+0];\
        int pix1= ((type*)src)[2*i+1];\
        int g= (pix0&maskg)+(pix1&maskg);\
        int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
        int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
\
        g>>=shg;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
    }\
}

BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
#if HAVE_MMX
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24){
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
        );
    }else{
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
        );
    }

    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
        "mov %2, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "movd 6(%0), %%mm2 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "paddd %%mm4, %%mm2 \n\t"
        "psrad $15, %%mm0 \n\t"
        "psrad $15, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%1, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" (-width)
    : "%"REG_a
    );
}
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
{
    __asm__ volatile(
        "movq 24+%4, %%mm6           \n\t"
        "mov %3, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7           \n\t"
        "1:                          \n\t"
        PREFETCH" 64(%0)             \n\t"
        "movd (%0), %%mm0            \n\t"
        "movd 2(%0), %%mm1           \n\t"
        "punpcklbw %%mm7, %%mm0      \n\t"
        "punpcklbw %%mm7, %%mm1      \n\t"
        "movq %%mm0, %%mm2           \n\t"
        "movq %%mm1, %%mm3           \n\t"
        "pmaddwd %4, %%mm0           \n\t"
        "pmaddwd 8+%4, %%mm1         \n\t"
        "pmaddwd 16+%4, %%mm2        \n\t"
        "pmaddwd %%mm6, %%mm3        \n\t"
        "paddd %%mm1, %%mm0          \n\t"
        "paddd %%mm3, %%mm2          \n\t"

        "movd 6(%0), %%mm1           \n\t"
        "movd 8(%0), %%mm3           \n\t"
        "add $12, %0                 \n\t"
        "punpcklbw %%mm7, %%mm1      \n\t"
        "punpcklbw %%mm7, %%mm3      \n\t"
        "movq %%mm1, %%mm4           \n\t"
        "movq %%mm3, %%mm5           \n\t"
        "pmaddwd %4, %%mm1           \n\t"
        "pmaddwd 8+%4, %%mm3         \n\t"
        "pmaddwd 16+%4, %%mm4        \n\t"
        "pmaddwd %%mm6, %%mm5        \n\t"
        "paddd %%mm3, %%mm1          \n\t"
        "paddd %%mm5, %%mm4          \n\t"

        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
        "paddd %%mm3, %%mm0          \n\t"
        "paddd %%mm3, %%mm2          \n\t"
        "paddd %%mm3, %%mm1          \n\t"
        "paddd %%mm3, %%mm4          \n\t"
        "psrad $15, %%mm0            \n\t"
        "psrad $15, %%mm2            \n\t"
        "psrad $15, %%mm1            \n\t"
        "psrad $15, %%mm4            \n\t"
        "packssdw %%mm1, %%mm0       \n\t"
        "packssdw %%mm4, %%mm2       \n\t"
        "packuswb %%mm0, %%mm0       \n\t"
        "packuswb %%mm2, %%mm2       \n\t"
        "movd %%mm0, (%1, %%"REG_a") \n\t"
        "movd %%mm2, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a"           \n\t"
        " js 1b                      \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
        : "%"REG_a
    );
}
#endif /* HAVE_MMX */
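
/* Plain wrappers: dispatch to the MMX routine when available, otherwise run
 * the reference C loop. The C rounding bias (33<<(RGB2YUV_SHIFT-1)) is the
 * luma analogue of the chroma bias above: shifted out it equals 16.5, i.e.
 * the +16 luma offset plus the 0.5 rounding term. */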
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++)
    {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif /* HAVE_MMX */
}
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++)
    {
        int b= src1[3*i + 0];
        int g= src1[3*i + 1];
        int r= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif /* HAVE_MMX */
    assert(src1 == src2);
}
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
    int i;
    for (i=0; i<width; i++)
    {
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
    assert(src1 == src2);
}
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
#else
    int i;
    for (i=0; i<width; i++)
    {
        int r= src[i*3+0];
        int g= src[i*3+1];
        int b= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif /* HAVE_MMX */
}
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
#else
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif /* HAVE_MMX */
}
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        int r= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int b= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
}
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
{
    int i;
    for (i=0; i<width; i++)
    {
        int d= src[i];

        dst[i]= pal[d] & 0xFF;
    }
}

static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
{
    int i;
    assert(src1 == src2);
    for (i=0; i<width; i++)
    {
        int p= pal[src1[i]];

        dstU[i]= p>>8;
        dstV[i]= p>>16;
    }
}
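
/* 1 bpp input: each source byte carries 8 pixels, MSB first. monowhite2Y
 * inverts the byte first (white is 0 in PIX_FMT_MONOWHITE), then both
 * variants fan every bit out to a full 0/255 luma byte. */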
static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
    int i, j;
    for (i=0; i<width/8; i++){
        int d= ~src[i];
        for(j=0; j<8; j++)
            dst[8*i+j]= ((d>>(7-j))&1)*255;
    }
}

static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
    int i, j;
    for (i=0; i<width/8; i++){
        int d= src[i];
        for(j=0; j<8; j++)
            dst[8*i+j]= ((d>>(7-j))&1)*255;
    }
}
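
/* hScale() is the generic horizontal FIR scaler: for every output pixel i it
 * takes filterSize 8-bit source samples starting at filterPos[i], multiplies
 * them by the 16-bit coefficients in filter[] and stores the sum >>7 as a
 * 15-bit intermediate, clipped against overflow. A sketch of the inner loop
 * (this is exactly what the non-MMX fallback below does):
 *
 *     for (i=0; i<dstW; i++){
 *         int j, val= 0;
 *         for (j=0; j<filterSize; j++)
 *             val += src[filterPos[i] + j] * filter[filterSize*i + j];
 *         dst[i]= FFMIN(val>>7, (1<<15)-1);
 *     }
 *
 * The MMX versions special-case filterSize 4 and 8, computing two output
 * pixels per loop iteration with pmaddwd. */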
// bilinear / bicubic scaling
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
{
#if HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
    {
        long counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b"                  \n\t"
#endif
            "pxor %%mm7, %%mm7               \n\t"
            "push %%"REG_BP"                 \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP"       \n\t"
            ASMALIGN(4)
            "1:                              \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax  \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0     \n\t"
            "movd (%3, %%"REG_b"), %%mm2     \n\t"
            "punpcklbw %%mm7, %%mm0          \n\t"
            "punpcklbw %%mm7, %%mm2          \n\t"
            "pmaddwd %%mm1, %%mm0            \n\t"
            "pmaddwd %%mm2, %%mm3            \n\t"
            "movq %%mm0, %%mm4               \n\t"
            "punpckldq %%mm3, %%mm0          \n\t"
            "punpckhdq %%mm3, %%mm4          \n\t"
            "paddd %%mm4, %%mm0              \n\t"
            "psrad $7, %%mm0                 \n\t"
            "packssdw %%mm0, %%mm0           \n\t"
            "movd %%mm0, (%4, %%"REG_BP")    \n\t"
            "add $4, %%"REG_BP"              \n\t"
            " jnc 1b                         \n\t"

            "pop %%"REG_BP"                  \n\t"
#if defined(PIC)
            "pop %%"REG_b"                   \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    }
    else if (filterSize==8)
    {
        long counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b"                  \n\t"
#endif
            "pxor %%mm7, %%mm7               \n\t"
            "push %%"REG_BP"                 \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP"       \n\t"
            ASMALIGN(4)
            "1:                              \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax  \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0     \n\t"
            "movd (%3, %%"REG_b"), %%mm2     \n\t"
            "punpcklbw %%mm7, %%mm0          \n\t"
            "punpcklbw %%mm7, %%mm2          \n\t"
            "pmaddwd %%mm1, %%mm0            \n\t"
            "pmaddwd %%mm2, %%mm3            \n\t"

            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movd 4(%3, %%"REG_a"), %%mm4    \n\t"
            "movd 4(%3, %%"REG_b"), %%mm2    \n\t"
            "punpcklbw %%mm7, %%mm4          \n\t"
            "punpcklbw %%mm7, %%mm2          \n\t"
            "pmaddwd %%mm1, %%mm4            \n\t"
            "pmaddwd %%mm2, %%mm5            \n\t"
            "paddd %%mm4, %%mm0              \n\t"
            "paddd %%mm5, %%mm3              \n\t"
            "movq %%mm0, %%mm4               \n\t"
            "punpckldq %%mm3, %%mm0          \n\t"
            "punpckhdq %%mm3, %%mm4          \n\t"
            "paddd %%mm4, %%mm0              \n\t"
            "psrad $7, %%mm0                 \n\t"
            "packssdw %%mm0, %%mm0           \n\t"
            "movd %%mm0, (%4, %%"REG_BP")    \n\t"
            "add $4, %%"REG_BP"              \n\t"
            " jnc 1b                         \n\t"

            "pop %%"REG_BP"                  \n\t"
#if defined(PIC)
            "pop %%"REG_b"                   \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    }
    else
    {
        uint8_t *offset = src+filterSize;
        long counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor %%mm7, %%mm7               \n\t"
            ASMALIGN(4)
            "1:                              \n\t"
            "mov %2, %%"REG_c"               \n\t"
            "movzwl (%%"REG_c", %0), %%eax   \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx  \n\t"
            "mov %5, %%"REG_c"               \n\t"
            "pxor %%mm4, %%mm4               \n\t"
            "pxor %%mm5, %%mm5               \n\t"
            "2:                              \n\t"
            "movq (%1), %%mm1                \n\t"
            "movq (%1, %6), %%mm3            \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0          \n\t"
            "punpcklbw %%mm7, %%mm2          \n\t"
            "pmaddwd %%mm1, %%mm0            \n\t"
            "pmaddwd %%mm2, %%mm3            \n\t"
            "paddd %%mm3, %%mm5              \n\t"
            "paddd %%mm0, %%mm4              \n\t"
            "add $8, %1                      \n\t"
            "add $4, %%"REG_c"               \n\t"
            "cmp %4, %%"REG_c"               \n\t"
            " jb 2b                          \n\t"
            "add %6, %1                      \n\t"
            "movq %%mm4, %%mm0               \n\t"
            "punpckldq %%mm5, %%mm4          \n\t"
            "punpckhdq %%mm5, %%mm0          \n\t"
            "paddd %%mm0, %%mm4              \n\t"
            "psrad $7, %%mm4                 \n\t"
            "packssdw %%mm4, %%mm4           \n\t"
            "mov %3, %%"REG_a"               \n\t"
            "movd %%mm4, (%%"REG_a", %0)     \n\t"
            "add $4, %0                      \n\t"
            " jnc 1b                         \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" (filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    int i;
    for (i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++)
        {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* HAVE_ALTIVEC */
#endif /* HAVE_MMX */
}
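
/* hyscale() produces one horizontally scaled luma line. It first normalizes
 * any non-planar-YUV input (packed YUV, RGB/BGR in various depths, palette,
 * 1 bpp) into formatConvBuffer via the *ToY converters above, then either
 * runs the exact hScale() filter or, for SWS_FAST_BILINEAR, one of the fast
 * bilinear paths: the runtime-generated MMX2 "funny" code, a plain x86 asm
 * loop, or the C loop. */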
// *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint32_t *pal)
{
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32_1)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32_1)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_MONOBLACK)
    {
        RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_MONOWHITE)
    {
        RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
#if HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if ARCH_X86
#if HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            __asm__ volatile(
#if defined(PIC)
                "mov %%"REG_b", %5        \n\t"
#endif
                "pxor %%mm7, %%mm7        \n\t"
                "mov %0, %%"REG_c"        \n\t"
                "mov %1, %%"REG_D"        \n\t"
                "mov %2, %%"REG_d"        \n\t"
                "mov %3, %%"REG_b"        \n\t"
                "xor %%"REG_a", %%"REG_a" \n\t" // i
                PREFETCH" (%%"REG_c")     \n\t"
                PREFETCH" 32(%%"REG_c")   \n\t"
                PREFETCH" 64(%%"REG_c")   \n\t"

#if ARCH_X86_64

#define FUNNY_Y_CODE \
                "movl (%%"REG_b"), %%esi            \n\t"\
                "call *%4                           \n\t"\
                "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
                "add %%"REG_S", %%"REG_c"           \n\t"\
                "add %%"REG_a", %%"REG_D"           \n\t"\
                "xor %%"REG_a", %%"REG_a"           \n\t"\

#else

#define FUNNY_Y_CODE \
                "movl (%%"REG_b"), %%esi            \n\t"\
                "call *%4                           \n\t"\
                "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
                "add %%"REG_a", %%"REG_D"           \n\t"\
                "xor %%"REG_a", %%"REG_a"           \n\t"\

#endif /* ARCH_X86_64 */

                FUNNY_Y_CODE
                FUNNY_Y_CODE
                FUNNY_Y_CODE
                FUNNY_Y_CODE
                FUNNY_Y_CODE
                FUNNY_Y_CODE
                FUNNY_Y_CODE
                FUNNY_Y_CODE

#if defined(PIC)
                "mov %5, %%"REG_b"        \n\t"
#endif
                :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
                   "m" (funnyYCode)
#if defined(PIC)
                  ,"m" (ebxsave)
#endif
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
                 ,"%"REG_b
#endif
            );
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif /* HAVE_MMX2 */
        long xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"             \n\t" // i
            "xor %%"REG_d", %%"REG_d"             \n\t" // xx
            "xorl %%ecx, %%ecx                    \n\t" // 2*xalpha
            ASMALIGN(4)
            "1:                                   \n\t"
            "movzbl (%0, %%"REG_d"), %%edi        \n\t" //src[xx]
            "movzbl 1(%0, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            "subl %%edi, %%esi                    \n\t" //src[xx+1] - src[xx]
            "imull %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll $16, %%edi                      \n\t"
            "addl %%edi, %%esi                    \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "mov %1, %%"REG_D"                    \n\t"
            "shrl $9, %%esi                       \n\t"
            "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
            "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
            "adc %3, %%"REG_d"                    \n\t" //xx+= xInc>>16 + carry

            "movzbl (%0, %%"REG_d"), %%edi        \n\t" //src[xx]
            "movzbl 1(%0, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            "subl %%edi, %%esi                    \n\t" //src[xx+1] - src[xx]
            "imull %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll $16, %%edi                      \n\t"
            "addl %%edi, %%esi                    \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "mov %1, %%"REG_D"                    \n\t"
            "shrl $9, %%esi                       \n\t"
            "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
            "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
            "adc %3, %%"REG_d"                    \n\t" //xx+= xInc>>16 + carry

            "add $2, %%"REG_a"                    \n\t"
            "cmp %2, %%"REG_a"                    \n\t"
            " jb 1b                               \n\t"

            :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif /* ARCH_X86 */
    }
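
    /* Luma range conversion, applied to the 7-bit-upscaled intermediate as a
     * 14-bit fixed-point affine remap. The constants roughly encode the
     * full<->limited range transform (scale by 219/255 plus the +16 offset,
     * and its inverse); e.g. 14071/16384 is approximately 219/255 and
     * 19077/16384 approximately 255/219. */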
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this conversion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++)
                dst[i]= (dst[i]*14071 + 33561947)>>14;
        }else{
            for (i=0; i<dstWidth; i++)
                dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
        }
    }
}
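
/* hcscale() is the chroma counterpart of hyscale(): it converts/scales two
 * input lines (src1/src2, or the packed U/V of one line) into dst[0..] and
 * dst[VOFW..]. When the source has horizontally subsampled chroma
 * (c->chrSrcHSubSample), the *ToUV_half converters average pixel pairs
 * instead of dropping samples. Gray and 1 bpp sources carry no chroma, so
 * the function returns without writing anything. */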
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint32_t *pal)
{
    if (srcFormat==PIX_FMT_YUYV422)
    {
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_UYVY422)
    {
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB32_1)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        else
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        if(c->chrSrcHSubSample)
            RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_BGR32_1)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        else
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        if(c->chrSrcHSubSample)
            RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        else
            RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
    {
        return;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
#if HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if ARCH_X86
#if HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            __asm__ volatile(
#if defined(PIC)
                "mov %%"REG_b", %6        \n\t"
#endif
                "pxor %%mm7, %%mm7        \n\t"
                "mov %0, %%"REG_c"        \n\t"
                "mov %1, %%"REG_D"        \n\t"
                "mov %2, %%"REG_d"        \n\t"
                "mov %3, %%"REG_b"        \n\t"
                "xor %%"REG_a", %%"REG_a" \n\t" // i
                PREFETCH" (%%"REG_c")     \n\t"
                PREFETCH" 32(%%"REG_c")   \n\t"
                PREFETCH" 64(%%"REG_c")   \n\t"

#if ARCH_X86_64

#define FUNNY_UV_CODE \
                "movl (%%"REG_b"), %%esi            \n\t"\
                "call *%4                           \n\t"\
                "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
                "add %%"REG_S", %%"REG_c"           \n\t"\
                "add %%"REG_a", %%"REG_D"           \n\t"\
                "xor %%"REG_a", %%"REG_a"           \n\t"\

#else

#define FUNNY_UV_CODE \
                "movl (%%"REG_b"), %%esi            \n\t"\
                "call *%4                           \n\t"\
                "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
                "add %%"REG_a", %%"REG_D"           \n\t"\
                "xor %%"REG_a", %%"REG_a"           \n\t"\

#endif /* ARCH_X86_64 */

                FUNNY_UV_CODE
                FUNNY_UV_CODE
                FUNNY_UV_CODE
                FUNNY_UV_CODE
                "xor %%"REG_a", %%"REG_a" \n\t" // i
                "mov %5, %%"REG_c"        \n\t" // src
                "mov %1, %%"REG_D"        \n\t" // buf1
                "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
                PREFETCH" (%%"REG_c")     \n\t"
                PREFETCH" 32(%%"REG_c")   \n\t"
                PREFETCH" 64(%%"REG_c")   \n\t"

                FUNNY_UV_CODE
                FUNNY_UV_CODE
                FUNNY_UV_CODE
                FUNNY_UV_CODE

#if defined(PIC)
                "mov %6, %%"REG_b"        \n\t"
#endif
                :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
                   "m" (funnyUVCode), "m" (src2)
#if defined(PIC)
                  ,"m" (ebxsave)
#endif
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
                 ,"%"REG_b
#endif
            );
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
            {
                //printf("%d %d %d\n", dstWidth, i, srcW);
                dst[i] = src1[srcW-1]*128;
                dst[i+VOFW] = src2[srcW-1]*128;
            }
        }
        else
        {
#endif /* HAVE_MMX2 */
        long xInc_shr16 = (long) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff;
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"             \n\t" // i
            "xor %%"REG_d", %%"REG_d"             \n\t" // xx
            "xorl %%ecx, %%ecx                    \n\t" // 2*xalpha
            ASMALIGN(4)
            "1:                                   \n\t"
            "mov %0, %%"REG_S"                    \n\t"
            "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
            "subl %%edi, %%esi                    \n\t" //src[xx+1] - src[xx]
            "imull %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll $16, %%edi                      \n\t"
            "addl %%edi, %%esi                    \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "mov %1, %%"REG_D"                    \n\t"
            "shrl $9, %%esi                       \n\t"
            "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"

            "movzbl (%5, %%"REG_d"), %%edi        \n\t" //src[xx]
            "movzbl 1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            "subl %%edi, %%esi                    \n\t" //src[xx+1] - src[xx]
            "imull %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll $16, %%edi                      \n\t"
            "addl %%edi, %%esi                    \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "mov %1, %%"REG_D"                    \n\t"
            "shrl $9, %%esi                       \n\t"
            "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"

            "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
            "adc %3, %%"REG_d"                    \n\t" //xx+= xInc>>16 + carry
            "add $1, %%"REG_a"                    \n\t"
            "cmp %2, %%"REG_a"                    \n\t"
            " jb 1b                               \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC 4.0. */
#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
               "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
            dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
            /* slower
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
            dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
            */
            xpos+=xInc;
        }
#endif /* ARCH_X86 */
    }
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this conversion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++){
                dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
                dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
            }
        }else{
            for (i=0; i<dstWidth; i++){
                dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
                dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
            }
        }
    }
}
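
/* swScale(): the template behind sws_scale() for this CPU variant. The design
 * is slice based: the caller may feed the source picture in horizontal bands
 * (srcSliceY/srcSliceH), and the scaler keeps ring buffers of horizontally
 * pre-scaled lines (lumPixBuf/chrPixBuf, vLumBufSize/vChrBufSize entries) so
 * the vertical filter can run as soon as all of its input lines are present.
 * For every output line dstY it
 *   1. horizontally scales any still-missing source lines into the ring buffers,
 *   2. vertically filters vLumFilterSize/vChrFilterSize buffered lines, and
 *   3. converts to the destination format (planar YUV, NV12/21, or packed RGB).
 * Returns the number of output lines written for this slice. */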
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const int dstFormat= c->dstFormat;
    const int srcFormat= c->srcFormat;
    const int flags= c->flags;
    const int canMMX2BeUsed= c->canMMX2BeUsed;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *funnyYCode= c->funnyYCode;
    uint8_t *funnyUVCode= c->funnyUVCode;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;
    if (isPacked(c->srcFormat)){
        src[0]=
        src[1]=
        src[2]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
    //       (int)dst[0], (int)dst[1], (int)dst[2]);
#if 0 //self test FIXME move to a vfilter or something
    {
    static volatile int i=0;
    i++;
    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
    i--;
    }
#endif
    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
    //       dstStride[0],dstStride[1],dstStride[2]);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
    {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready)
        {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }
    /* Note the user might start scaling the picture in the middle, so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0){
        lumBufIndex=0;
        chrBufIndex=0;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;
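
    /* Main per-output-line loop. If the current slice does not yet contain
     * all input lines needed for dstY, the loop buffers what is available,
     * breaks, and resumes on the next sws_scale() call with the next slice. */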
    for (;dstY < dstH; dstY++){
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

        //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
        //       dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        // Do we have enough lines in this slice to output the dstY line?
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
        {
            //Do horizontal scaling
            while(lastInLumBuf < lastLumSrcY)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                //printf("%d %d\n", lumBufIndex, vLumBufSize);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf < lastChrSrcY)
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
                //FIXME replace parameters through context struct (some at least)

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        }
        else // not enough lines left in this slice -> load the rest in the buffer
        {
            /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
               firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
               lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
               vChrBufSize, vLumBufSize);*/

            //Do horizontal scaling
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
            break; //we can't output a dstY line so let's try with the next slice
        }
#if HAVE_MMX
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
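
        /* Only lines up to dstH-2 take the fast (possibly MMX) output path;
         * the last two lines fall through to the pure C versions below,
         * apparently because filling the MMX filter arrays for them could
         * overrun their tails (see the comment on the else branch). */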
        if (dstY < dstH-2)
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#if HAVE_MMX
            int i;
            if (flags & SWS_ACCURATE_RND){
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2){
                    *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                    lumMmxFilter[s*i+APCK_COEF/4  ]=
                    lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                }
                for (i=0; i<vChrFilterSize; i+=2){
                    *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                    chrMmxFilter[s*i+APCK_COEF/4  ]=
                    chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            }else{
                for (i=0; i<vLumFilterSize; i++)
                {
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                }
                for (i=0; i<vChrFilterSize; i++)
                {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                RENAME(yuv2nv12X)(c,
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
                {
                    int16_t *lumBuf = lumPixBuf[0];
                    int16_t *chrBuf= chrPixBuf[0];
                    RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                }
                else //General YV12
                {
                    RENAME(yuv2yuvX)(c,
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, uDest, vDest, dstW, chrDstW);
                }
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                            dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                }
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                            dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                }
                else //general RGB
                {
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packedX)(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }
                }
            }
        }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, vDest, dstW, chrDstW);
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT){
                    yuv2rgbXinC_full(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }else{
                    yuv2packedXinC(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }
            }
        }
    }
#if HAVE_MMX
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
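
/* For reference, a minimal (hypothetical) caller of the public API that ends
 * up in this function; error handling omitted and sizes assumed valid:
 *
 *     struct SwsContext *ctx= sws_getContext(srcW, srcH, PIX_FMT_YUV420P,
 *                                            dstW, dstH, PIX_FMT_RGB24,
 *                                            SWS_BILINEAR, NULL, NULL, NULL);
 *     // whole picture fed as a single slice
 *     sws_scale(ctx, src, srcStride, 0, srcH, dst, dstStride);
 *     sws_freeContext(ctx);
 */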