/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * the C code (not assembly, mmx, ...) of this file can be used
 * under the LGPL license too
 */

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#elif defined (HAVE_MMX2)
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH " # nop"
#define PREFETCHW " # nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
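
/* MOVNTQ is a forwarding wrapper around REAL_MOVNTQ so that arguments such as
 * %%REGa are macro-expanded before "#a"/"#b" stringify them. */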

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
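
/* The vertical-filter macro below is, per output pixel, roughly this scalar
 * sketch (illustrative only; the actual C fallback is yuv2yuvXinC):
 *
 *     int val = rounder;                          // VROUNDER_OFFSET bias
 *     for (j = 0; j < filterSize; j++)
 *         val += (src[j][i] * filter[j]) >> 16;   // pmulhw
 *     dest[i] = av_clip_uint8(val >> 3);          // psraw $3 + packuswb
 *
 * The (srcPtr, coeff) pairs are read starting at "offset"(%0); a NULL source
 * pointer terminates the list (hence the test/jnz pair in the loop). */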
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    asm volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) \
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
    "add $16, %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
185 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
186 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
187 "r" (dest), "m" (dstW),
188 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
189 : "%eax", "%ebx", "%ecx", "%edx", "%esi"

#define YSCALEYUV2PACKEDX \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm1 \n\t"\
    "paddw %%mm5, %%mm7 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );

#define YSCALEYUV2PACKEDX_ACCURATE \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
    "add $16, %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
    "add $16, %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#if 0
#define FULL_YSCALEYUV2RGB \
    "pxor %%mm7, %%mm7 \n\t"\
    "movd %6, %%mm6 \n\t" /*yalpha1*/\
    "punpcklwd %%mm6, %%mm6 \n\t"\
    "punpcklwd %%mm6, %%mm6 \n\t"\
    "movd %7, %%mm5 \n\t" /*uvalpha1*/\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
    "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
\
\
    "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
    "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
    "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
\
\
    "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
    "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
    "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
    "paddw %%mm1, %%mm3 \n\t" /* B*/\
    "paddw %%mm1, %%mm0 \n\t" /* R*/\
    "packuswb %%mm3, %%mm3 \n\t"\
\
    "packuswb %%mm0, %%mm0 \n\t"\
    "paddw %%mm4, %%mm2 \n\t"\
    "paddw %%mm2, %%mm1 \n\t" /* G*/\
\
    "packuswb %%mm1, %%mm1 \n\t"
#endif

#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

#define REAL_YSCALEYUV2RGB(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)

#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0, (dst, index, 4))\
    MOVNTQ(%%mm2, 8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
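
/* WRITEBGR16/WRITEBGR15 below pack the byte channels into 5-6-5 / 5-5-5
 * words; per pixel this is the usual
 *
 *     rgb16 = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11);
 *
 * done eight pixels at a time: bF8/bFC mask off the insignificant low bits
 * and the punpck/psllq/por sequence merges the shifted channels. */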
#define REAL_WRITEBGR16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)

#define REAL_WRITEBGR15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)

#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (dst))\
    MOVNTQ(%%mm2, 8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif

#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
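
/* WRITEYUY2 above interleaves the packed luma bytes (from mm1/mm7) with the
 * chroma pair (mm3 = U, mm4 = V) into YUYV order, i.e. each pair of output
 * pixels is stored as the 4 bytes { Y0, U, Y1, V }. */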

static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if (c->flags & SWS_ACCURATE_RND){
        if (uDest){
            YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }else{
        if (uDest){
            YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }
#else
#ifdef HAVE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif /* HAVE_MMX */
}

static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if (uDest)
    {
        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
               "g" (-chrDstW)
            : "%"REG_a
        );

        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + VOFW + chrDstW), "r" (vDest + chrDstW),
               "g" (-chrDstW)
            : "%"REG_a
        );
    }

    asm volatile(
        YSCALEYUV2YV121
        :: "r" (lumSrc + dstW), "r" (dest + dstW),
           "g" (-dstW)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<dstW; i++)
    {
        int val= lumSrc[i]>>7;
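
        /* Out-of-range results have bit 8 (or the sign bit) set after the
         * >>7, so the single "& 256" test catches both overflow and
         * underflow before the cheap clamp below. */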
        if (val&256){
            if (val<0) val=0;
            else val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=chrSrc[i]>>7;
            int v=chrSrc[i + VOFW]>>7;

            if ((u|v)&256){
                if (u<0) u=0;
                else if (u>255) u=255;
                if (v<0) v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
#endif
}

/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, long dstW, long dstY)
{
#ifdef HAVE_MMX
    long dummy=0;
    if (c->flags & SWS_ACCURATE_RND){
        switch(c->dstFormat){
        case PIX_FMT_RGB32:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            WRITEBGR32(%4, %5, %%REGa)

            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
            "add %4, %%"REG_c" \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)


            :: "r" (&c->redDither),
               "m" (dummy), "m" (dummy), "m" (dummy),
               "r" (dest), "m" (dstW)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_BGR555:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

            WRITEBGR15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR565:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

            WRITEBGR16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX_ACCURATE
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

            "psraw $3, %%mm3 \n\t"
            "psraw $3, %%mm4 \n\t"
            "psraw $3, %%mm1 \n\t"
            "psraw $3, %%mm7 \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        }
    }else{
        switch(c->dstFormat)
        {
        case PIX_FMT_RGB32:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            WRITEBGR32(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
            "add %4, %%"REG_c" \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)

            :: "r" (&c->redDither),
               "m" (dummy), "m" (dummy), "m" (dummy),
               "r" (dest), "m" (dstW)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_BGR555:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITEBGR15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR565:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITEBGR16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

            "psraw $3, %%mm3 \n\t"
            "psraw $3, %%mm4 \n\t"
            "psraw $3, %%mm1 \n\t"
            "psraw $3, %%mm7 \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        }
    }
#endif /* HAVE_MMX */
#ifdef HAVE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of altivec_yuv2packedX() */
    if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
        c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
        altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                             chrFilter, chrSrc, chrFilterSize,
                             dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       dest, dstW, dstY);
}

/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int yalpha1=yalpha^4095;
    int uvalpha1=uvalpha^4095;
    int i;

#if 0 //isn't used
    if (flags&SWS_FULL_CHR_H_INT)
    {
        switch(dstFormat)
        {
#ifdef HAVE_MMX
        case PIX_FMT_RGB32:
            asm volatile(


FULL_YSCALEYUV2RGB
            "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
            "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

            "movq %%mm3, %%mm1 \n\t"
            "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
            "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

            MOVNTQ(%%mm3, (%4, %%REGa, 4))
            MOVNTQ(%%mm1, 8(%4, %%REGa, 4))

            "add $4, %%"REG_a" \n\t"
            "cmp %5, %%"REG_a" \n\t"
            " jb 1b \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
               "m" (yalpha1), "m" (uvalpha1)
            : "%"REG_a
            );
            break;
        case PIX_FMT_BGR24:
            asm volatile(

FULL_YSCALEYUV2RGB

            // lsb ... msb
            "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
            "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

            "movq %%mm3, %%mm1 \n\t"
            "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
            "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

            "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
            "psrlq $8, %%mm3 \n\t" // GR0BGR00
            "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
            "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
            "por %%mm2, %%mm3 \n\t" // BGRBGR00
            "movq %%mm1, %%mm2 \n\t"
            "psllq $48, %%mm1 \n\t" // 000000BG
            "por %%mm1, %%mm3 \n\t" // BGRBGRBG

            "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
            "psrld $16, %%mm2 \n\t" // R000R000
            "psrlq $24, %%mm1 \n\t" // 0BGR0000
            "por %%mm2, %%mm1 \n\t" // RBGRR000

            "mov %4, %%"REG_b" \n\t"
            "add %%"REG_a", %%"REG_b" \n\t"

#ifdef HAVE_MMX2
            //FIXME Alignment
            "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
            "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
#else
            "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
            "psrlq $32, %%mm3 \n\t"
            "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
            "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
#endif
            "add $4, %%"REG_a" \n\t"
            "cmp %5, %%"REG_a" \n\t"
            " jb 1b \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
               "m" (yalpha1), "m" (uvalpha1)
            : "%"REG_a, "%"REG_b
            );
            break;
        case PIX_FMT_BGR555:
            asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
            "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
            "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
#endif
            "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
            "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
            "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

            "psrlw $3, %%mm3 \n\t"
            "psllw $2, %%mm1 \n\t"
            "psllw $7, %%mm0 \n\t"
            "pand "MANGLE(g15Mask)", %%mm1 \n\t"
            "pand "MANGLE(r15Mask)", %%mm0 \n\t"

            "por %%mm3, %%mm1 \n\t"
            "por %%mm1, %%mm0 \n\t"

            MOVNTQ(%%mm0, (%4, %%REGa, 2))

            "add $4, %%"REG_a" \n\t"
            "cmp %5, %%"REG_a" \n\t"
            " jb 1b \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
               "m" (yalpha1), "m" (uvalpha1)
            : "%"REG_a
            );
            break;
        case PIX_FMT_BGR565:
            asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
            "paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
            "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
#endif
            "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
            "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
            "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

            "psrlw $3, %%mm3 \n\t"
            "psllw $3, %%mm1 \n\t"
            "psllw $8, %%mm0 \n\t"
            "pand "MANGLE(g16Mask)", %%mm1 \n\t"
            "pand "MANGLE(r16Mask)", %%mm0 \n\t"

            "por %%mm3, %%mm1 \n\t"
            "por %%mm1, %%mm0 \n\t"

            MOVNTQ(%%mm0, (%4, %%REGa, 2))

            "add $4, %%"REG_a" \n\t"
            "cmp %5, %%"REG_a" \n\t"
            " jb 1b \n\t"

            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
               "m" (yalpha1), "m" (uvalpha1)
            : "%"REG_a
            );
            break;
#endif /* HAVE_MMX */
        case PIX_FMT_BGR32:
#ifndef HAVE_MMX
        case PIX_FMT_RGB32:
#endif
            if (dstFormat==PIX_FMT_RGB32)
            {
                int i;
#ifdef WORDS_BIGENDIAN
                dest++;
#endif
                for (i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                    dest+= 4;
                }
            }
            else if (dstFormat==PIX_FMT_BGR24)
            {
                int i;
                for (i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                    dest+= 3;
                }
            }
            else if (dstFormat==PIX_FMT_BGR565)
            {
                int i;
                for (i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);

                    ((uint16_t*)dest)[i] =
                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
                }
            }
            else if (dstFormat==PIX_FMT_BGR555)
            {
                int i;
                for (i=0;i<dstW;i++){
                    // vertical linear interpolation && yuv2rgb in a single step:
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);

                    ((uint16_t*)dest)[i] =
                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
                }
            }
    }//FULL_UV_IPOL
    else
    {
#endif // if 0
#ifdef HAVE_MMX
    switch(c->dstFormat)
    {
    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    case PIX_FMT_RGB32:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    case PIX_FMT_BGR24:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    case PIX_FMT_BGR555:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    case PIX_FMT_BGR565:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

            WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    case PIX_FMT_YUYV422:
        asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
               "a" (&c->redDither)
        );
        return;
    default: break;
    }
#endif //HAVE_MMX
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
}

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                       uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT)
    {
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
        return;
    }

#ifdef HAVE_MMX
    if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
    {
        switch(dstFormat)
        {
        case PIX_FMT_RGB32:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR24:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR555:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif
                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR565:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED1(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        }
    }
    else
    {
        switch(dstFormat)
        {
        case PIX_FMT_RGB32:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR24:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR555:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif
                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR565:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED1b(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        }
    }
#endif /* HAVE_MMX */
    if (uvalpha < 2048)
    {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
    }else{
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
    }
}
1703 //FIXME yuy2* can read upto 7 samples to much
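/* The packed 4:2:2 layouts handled below: YUY2 stores Y0 U0 Y1 V0 and UYVY
 * stores U0 Y0 V0 Y1, which is where the src[2*i] / src[4*i+1] / ... indexing
 * in the C fallbacks comes from. Illustration only (not compiled), roughly: */
#if 0
static void yuy2_to_planar_example(uint8_t *y, uint8_t *u, uint8_t *v,
                                   const uint8_t *src, long width)
{
    long i;
    for (i=0; i<width/2; i++)
    {
        y[2*i  ]= src[4*i + 0]; // luma of the even pixel
        u[i]    = src[4*i + 1]; // chroma shared by the 2-pixel group
        y[2*i+1]= src[4*i + 2]; // luma of the odd pixel
        v[i]    = src[4*i + 3];
    }
}
#endif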
1705 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1706 {
1707 #ifdef HAVE_MMX
1708 asm volatile(
1709 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1710 "mov %0, %%"REG_a" \n\t"
1711 "1: \n\t"
1712 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1713 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1714 "pand %%mm2, %%mm0 \n\t"
1715 "pand %%mm2, %%mm1 \n\t"
1716 "packuswb %%mm1, %%mm0 \n\t"
1717 "movq %%mm0, (%2, %%"REG_a") \n\t"
1718 "add $8, %%"REG_a" \n\t"
1719 " js 1b \n\t"
1720 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1721 : "%"REG_a
1722 );
1723 #else
1724 int i;
1725 for (i=0; i<width; i++)
1726 dst[i]= src[2*i];
1727 #endif
1728 }
1730 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1731 {
1732 #ifdef HAVE_MMX
1733 asm volatile(
1734 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1735 "mov %0, %%"REG_a" \n\t"
1736 "1: \n\t"
1737 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1738 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1739 "psrlw $8, %%mm0 \n\t"
1740 "psrlw $8, %%mm1 \n\t"
1741 "packuswb %%mm1, %%mm0 \n\t"
1742 "movq %%mm0, %%mm1 \n\t"
1743 "psrlw $8, %%mm0 \n\t"
1744 "pand %%mm4, %%mm1 \n\t"
1745 "packuswb %%mm0, %%mm0 \n\t"
1746 "packuswb %%mm1, %%mm1 \n\t"
1747 "movd %%mm0, (%3, %%"REG_a") \n\t"
1748 "movd %%mm1, (%2, %%"REG_a") \n\t"
1749 "add $4, %%"REG_a" \n\t"
1750 " js 1b \n\t"
1751 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1752 : "%"REG_a
1753 );
1754 #else
1755 int i;
1756 for (i=0; i<width; i++)
1757 {
1758 dstU[i]= src1[4*i + 1];
1759 dstV[i]= src1[4*i + 3];
1760 }
1761 #endif
1762 assert(src1 == src2);
1763 }
1765 /* This is almost identical to the previous, and exists only because
1766 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1767 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1768 {
1769 #ifdef HAVE_MMX
1770 asm volatile(
1771 "mov %0, %%"REG_a" \n\t"
1772 "1: \n\t"
1773 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1774 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1775 "psrlw $8, %%mm0 \n\t"
1776 "psrlw $8, %%mm1 \n\t"
1777 "packuswb %%mm1, %%mm0 \n\t"
1778 "movq %%mm0, (%2, %%"REG_a") \n\t"
1779 "add $8, %%"REG_a" \n\t"
1780 " js 1b \n\t"
1781 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1782 : "%"REG_a
1783 );
1784 #else
1785 int i;
1786 for (i=0; i<width; i++)
1787 dst[i]= src[2*i+1];
1788 #endif
1789 }
1791 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1792 {
1793 #ifdef HAVE_MMX
1794 asm volatile(
1795 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1796 "mov %0, %%"REG_a" \n\t"
1797 "1: \n\t"
1798 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1799 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1800 "pand %%mm4, %%mm0 \n\t"
1801 "pand %%mm4, %%mm1 \n\t"
1802 "packuswb %%mm1, %%mm0 \n\t"
1803 "movq %%mm0, %%mm1 \n\t"
1804 "psrlw $8, %%mm0 \n\t"
1805 "pand %%mm4, %%mm1 \n\t"
1806 "packuswb %%mm0, %%mm0 \n\t"
1807 "packuswb %%mm1, %%mm1 \n\t"
1808 "movd %%mm0, (%3, %%"REG_a") \n\t"
1809 "movd %%mm1, (%2, %%"REG_a") \n\t"
1810 "add $4, %%"REG_a" \n\t"
1811 " js 1b \n\t"
1812 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1813 : "%"REG_a
1814 );
1815 #else
1816 int i;
1817 for (i=0; i<width; i++)
1818 {
1819 dstU[i]= src1[4*i + 0];
1820 dstV[i]= src1[4*i + 2];
1821 }
1822 #endif
1823 assert(src1 == src2);
1824 }
1826 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1827 {
1828 int i;
1829 for (i=0; i<width; i++)
1830 {
1831 int b= ((uint32_t*)src)[i]&0xFF;
1832 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1833 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1835 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1836 }
1837 }
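/* The constant 33<<(RGB2YUV_SHIFT-1) is 16.5<<RGB2YUV_SHIFT, that is, the
 * +16 luma offset and a +0.5 rounding term folded into one addition before
 * the final right shift. */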
1839 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1840 {
1841 int i;
1842 assert(src1 == src2);
1843 for (i=0; i<width; i++)
1844 {
1845 const int a= ((uint32_t*)src1)[2*i+0];
1846 const int e= ((uint32_t*)src1)[2*i+1];
1847 const int l= (a&0xFF00FF) + (e&0xFF00FF);
1848 const int h= (a&0x00FF00) + (e&0x00FF00);
1849 const int b= l&0x3FF;
1850 const int g= h>>8;
1851 const int r= l>>16;
1853 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1854 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1855 }
1856 }
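/* a and e are two horizontally adjacent pixels; l sums their B and R bytes
 * in parallel (each sum is at most 510, so it fits in the gap between the
 * fields) and h sums the G bytes. One 32-bit add thus averages a 2-pixel
 * pair for the 2:1 horizontal chroma subsampling; the >>(RGB2YUV_SHIFT+1)
 * divides the pair sum back down. */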
1858 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1859 {
1860 #ifdef HAVE_MMX
1861 asm volatile(
1862 "mov %2, %%"REG_a" \n\t"
1863 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1864 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1865 "pxor %%mm7, %%mm7 \n\t"
1866 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1867 ASMALIGN(4)
1868 "1: \n\t"
1869 PREFETCH" 64(%0, %%"REG_d") \n\t"
1870 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1871 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1872 "punpcklbw %%mm7, %%mm0 \n\t"
1873 "punpcklbw %%mm7, %%mm1 \n\t"
1874 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1875 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1876 "punpcklbw %%mm7, %%mm2 \n\t"
1877 "punpcklbw %%mm7, %%mm3 \n\t"
1878 "pmaddwd %%mm6, %%mm0 \n\t"
1879 "pmaddwd %%mm6, %%mm1 \n\t"
1880 "pmaddwd %%mm6, %%mm2 \n\t"
1881 "pmaddwd %%mm6, %%mm3 \n\t"
1882 #ifndef FAST_BGR2YV12
1883 "psrad $8, %%mm0 \n\t"
1884 "psrad $8, %%mm1 \n\t"
1885 "psrad $8, %%mm2 \n\t"
1886 "psrad $8, %%mm3 \n\t"
1887 #endif
1888 "packssdw %%mm1, %%mm0 \n\t"
1889 "packssdw %%mm3, %%mm2 \n\t"
1890 "pmaddwd %%mm5, %%mm0 \n\t"
1891 "pmaddwd %%mm5, %%mm2 \n\t"
1892 "packssdw %%mm2, %%mm0 \n\t"
1893 "psraw $7, %%mm0 \n\t"
1895 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1896 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1897 "punpcklbw %%mm7, %%mm4 \n\t"
1898 "punpcklbw %%mm7, %%mm1 \n\t"
1899 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1900 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1901 "punpcklbw %%mm7, %%mm2 \n\t"
1902 "punpcklbw %%mm7, %%mm3 \n\t"
1903 "pmaddwd %%mm6, %%mm4 \n\t"
1904 "pmaddwd %%mm6, %%mm1 \n\t"
1905 "pmaddwd %%mm6, %%mm2 \n\t"
1906 "pmaddwd %%mm6, %%mm3 \n\t"
1907 #ifndef FAST_BGR2YV12
1908 "psrad $8, %%mm4 \n\t"
1909 "psrad $8, %%mm1 \n\t"
1910 "psrad $8, %%mm2 \n\t"
1911 "psrad $8, %%mm3 \n\t"
1912 #endif
1913 "packssdw %%mm1, %%mm4 \n\t"
1914 "packssdw %%mm3, %%mm2 \n\t"
1915 "pmaddwd %%mm5, %%mm4 \n\t"
1916 "pmaddwd %%mm5, %%mm2 \n\t"
1917 "add $24, %%"REG_d" \n\t"
1918 "packssdw %%mm2, %%mm4 \n\t"
1919 "psraw $7, %%mm4 \n\t"
1921 "packuswb %%mm4, %%mm0 \n\t"
1922 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1924 "movq %%mm0, (%1, %%"REG_a") \n\t"
1925 "add $8, %%"REG_a" \n\t"
1926 " js 1b \n\t"
1927 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1928 : "%"REG_a, "%"REG_d
1929 );
1930 #else
1931 int i;
1932 for (i=0; i<width; i++)
1933 {
1934 int b= src[i*3+0];
1935 int g= src[i*3+1];
1936 int r= src[i*3+2];
1938 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1939 }
1940 #endif /* HAVE_MMX */
1941 }
1943 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1944 {
1945 #ifdef HAVE_MMX
1946 asm volatile(
1947 "mov %3, %%"REG_a" \n\t"
1948 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1949 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1950 "pxor %%mm7, %%mm7 \n\t"
1951 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1952 "add %%"REG_d", %%"REG_d" \n\t"
1953 ASMALIGN(4)
1954 "1: \n\t"
1955 PREFETCH" 64(%0, %%"REG_d") \n\t"
1956 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1957 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1958 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1959 "movq %%mm0, %%mm1 \n\t"
1960 "movq %%mm2, %%mm3 \n\t"
1961 "psrlq $24, %%mm0 \n\t"
1962 "psrlq $24, %%mm2 \n\t"
1963 PAVGB(%%mm1, %%mm0)
1964 PAVGB(%%mm3, %%mm2)
1965 "punpcklbw %%mm7, %%mm0 \n\t"
1966 "punpcklbw %%mm7, %%mm2 \n\t"
1967 #else
1968 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1969 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1970 "punpcklbw %%mm7, %%mm0 \n\t"
1971 "punpcklbw %%mm7, %%mm2 \n\t"
1972 "paddw %%mm2, %%mm0 \n\t"
1973 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1974 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1975 "punpcklbw %%mm7, %%mm4 \n\t"
1976 "punpcklbw %%mm7, %%mm2 \n\t"
1977 "paddw %%mm4, %%mm2 \n\t"
1978 "psrlw $1, %%mm0 \n\t"
1979 "psrlw $1, %%mm2 \n\t"
1980 #endif
1981 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1982 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1984 "pmaddwd %%mm0, %%mm1 \n\t"
1985 "pmaddwd %%mm2, %%mm3 \n\t"
1986 "pmaddwd %%mm6, %%mm0 \n\t"
1987 "pmaddwd %%mm6, %%mm2 \n\t"
1988 #ifndef FAST_BGR2YV12
1989 "psrad $8, %%mm0 \n\t"
1990 "psrad $8, %%mm1 \n\t"
1991 "psrad $8, %%mm2 \n\t"
1992 "psrad $8, %%mm3 \n\t"
1993 #endif
1994 "packssdw %%mm2, %%mm0 \n\t"
1995 "packssdw %%mm3, %%mm1 \n\t"
1996 "pmaddwd %%mm5, %%mm0 \n\t"
1997 "pmaddwd %%mm5, %%mm1 \n\t"
1998 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1999 "psraw $7, %%mm0 \n\t"
2001 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2002 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2003 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2004 "movq %%mm4, %%mm1 \n\t"
2005 "movq %%mm2, %%mm3 \n\t"
2006 "psrlq $24, %%mm4 \n\t"
2007 "psrlq $24, %%mm2 \n\t"
2008 PAVGB(%%mm1, %%mm4)
2009 PAVGB(%%mm3, %%mm2)
2010 "punpcklbw %%mm7, %%mm4 \n\t"
2011 "punpcklbw %%mm7, %%mm2 \n\t"
2012 #else
2013 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2014 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2015 "punpcklbw %%mm7, %%mm4 \n\t"
2016 "punpcklbw %%mm7, %%mm2 \n\t"
2017 "paddw %%mm2, %%mm4 \n\t"
2018 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2019 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2020 "punpcklbw %%mm7, %%mm5 \n\t"
2021 "punpcklbw %%mm7, %%mm2 \n\t"
2022 "paddw %%mm5, %%mm2 \n\t"
2023 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2024 "psrlw $2, %%mm4 \n\t"
2025 "psrlw $2, %%mm2 \n\t"
2026 #endif
2027 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2028 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2030 "pmaddwd %%mm4, %%mm1 \n\t"
2031 "pmaddwd %%mm2, %%mm3 \n\t"
2032 "pmaddwd %%mm6, %%mm4 \n\t"
2033 "pmaddwd %%mm6, %%mm2 \n\t"
2034 #ifndef FAST_BGR2YV12
2035 "psrad $8, %%mm4 \n\t"
2036 "psrad $8, %%mm1 \n\t"
2037 "psrad $8, %%mm2 \n\t"
2038 "psrad $8, %%mm3 \n\t"
2039 #endif
2040 "packssdw %%mm2, %%mm4 \n\t"
2041 "packssdw %%mm3, %%mm1 \n\t"
2042 "pmaddwd %%mm5, %%mm4 \n\t"
2043 "pmaddwd %%mm5, %%mm1 \n\t"
2044 "add $24, %%"REG_d" \n\t"
2045 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2046 "psraw $7, %%mm4 \n\t"
2048 "movq %%mm0, %%mm1 \n\t"
2049 "punpckldq %%mm4, %%mm0 \n\t"
2050 "punpckhdq %%mm4, %%mm1 \n\t"
2051 "packsswb %%mm1, %%mm0 \n\t"
2052 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2054 "movd %%mm0, (%1, %%"REG_a") \n\t"
2055 "punpckhdq %%mm0, %%mm0 \n\t"
2056 "movd %%mm0, (%2, %%"REG_a") \n\t"
2057 "add $4, %%"REG_a" \n\t"
2058 " js 1b \n\t"
2059 : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
2060 : "%"REG_a, "%"REG_d
2061 );
2062 #else
2063 int i;
2064 for (i=0; i<width; i++)
2065 {
2066 int b= src1[6*i + 0] + src1[6*i + 3];
2067 int g= src1[6*i + 1] + src1[6*i + 4];
2068 int r= src1[6*i + 2] + src1[6*i + 5];
2070 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2071 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2072 }
2073 #endif /* HAVE_MMX */
2074 assert(src1 == src2);
2075 }
2077 static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2078 {
2079 int i;
2080 for (i=0; i<width; i++)
2081 {
2082 int d= ((uint16_t*)src)[i];
2083 int b= d&0x1F;
2084 int g= (d>>5)&0x3F;
2085 int r= (d>>11)&0x1F;
2087 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2088 }
2089 }
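/* The channels are still at source depth here, so the shifts rescale them:
 * g is 6 bits wide (x4 to reach 8 bits), r and b are 5 bits wide (x8), and
 * 2*RY*r >> (RGB2YUV_SHIFT-2) == RY*(8*r) >> RGB2YUV_SHIFT, which lets the
 * 8-bit coefficients and the +16 offset be reused unchanged. The 15 bpp
 * versions below shift by 3 instead, since all their channels are 5 bits. */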
2091 static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2092 {
2093 int i;
2094 assert(src1==src2);
2095 for (i=0; i<width; i++)
2096 {
2097 int d0= ((uint32_t*)src1)[i];
2099 int dl= (d0&0x07E0F81F);
2100 int dh= ((d0>>5)&0x07C0F83F);
2102 int dh2= (dh>>11) + (dh<<21);
2103 int d= dh2 + dl;
2105 int b= d&0x7F;
2106 int r= (d>>11)&0x7F;
2107 int g= d>>21;
2108 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2109 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2110 }
2111 }
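/* d0 holds two adjacent RGB565 pixels. Working through the masks: dl keeps
 * the first pixel's R and B plus the second pixel's G, dh collects the
 * remaining fields, and the dh>>11 / dh<<21 shuffle realigns everything so
 * that d ends up with the pairwise channel sums in disjoint 7-bit fields
 * (B in bits 0..6, R in bits 11..17, G in bits 21 and up). rgb15ToUV below
 * plays the same trick with 555 masks. */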
2113 static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2114 {
2115 int i;
2116 for (i=0; i<width; i++)
2117 {
2118 int d= ((uint16_t*)src)[i];
2119 int b= d&0x1F;
2120 int g= (d>>5)&0x1F;
2121 int r= (d>>10)&0x1F;
2123 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2124 }
2125 }
2127 static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2128 {
2129 int i;
2130 assert(src1==src2);
2131 for (i=0; i<width; i++)
2132 {
2133 int d0= ((uint32_t*)src1)[i];
2135 int dl= (d0&0x03E07C1F);
2136 int dh= ((d0>>5)&0x03E0F81F);
2138 int dh2= (dh>>11) + (dh<<21);
2139 int d= dh2 + dl;
2141 int b= d&0x7F;
2142 int r= (d>>10)&0x7F;
2143 int g= d>>21;
2144 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2145 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2146 }
2147 }
2150 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2151 {
2152 int i;
2153 for (i=0; i<width; i++)
2154 {
2155 int r= ((uint32_t*)src)[i]&0xFF;
2156 int g= (((uint32_t*)src)[i]>>8)&0xFF;
2157 int b= (((uint32_t*)src)[i]>>16)&0xFF;
2159 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2160 }
2161 }
2163 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2164 {
2165 int i;
2166 assert(src1==src2);
2167 for (i=0; i<width; i++)
2168 {
2169 const int a= ((uint32_t*)src1)[2*i+0];
2170 const int e= ((uint32_t*)src1)[2*i+1];
2171 const int l= (a&0xFF00FF) + (e&0xFF00FF);
2172 const int h= (a&0x00FF00) + (e&0x00FF00);
2173 const int r= l&0x3FF;
2174 const int g= h>>8;
2175 const int b= l>>16;
2177 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2178 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2179 }
2180 }
2182 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2183 {
2184 int i;
2185 for (i=0; i<width; i++)
2186 {
2187 int r= src[i*3+0];
2188 int g= src[i*3+1];
2189 int b= src[i*3+2];
2191 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2192 }
2193 }
2195 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2196 {
2197 int i;
2198 assert(src1==src2);
2199 for (i=0; i<width; i++)
2200 {
2201 int r= src1[6*i + 0] + src1[6*i + 3];
2202 int g= src1[6*i + 1] + src1[6*i + 4];
2203 int b= src1[6*i + 2] + src1[6*i + 5];
2205 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2206 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2207 }
2208 }
2210 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2211 {
2212 int i;
2213 for (i=0; i<width; i++)
2214 {
2215 int d= ((uint16_t*)src)[i];
2216 int r= d&0x1F;
2217 int g= (d>>5)&0x3F;
2218 int b= (d>>11)&0x1F;
2220 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2221 }
2222 }
2224 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2225 {
2226 int i;
2227 assert(src1 == src2);
2228 for (i=0; i<width; i++)
2229 {
2230 int d0= ((uint32_t*)src1)[i];
2232 int dl= (d0&0x07E0F81F);
2233 int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F);
2235 int r= d&0x3F;
2236 int b= (d>>11)&0x3F;
2237 int g= d>>21;
2238 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2239 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2240 }
2241 }
2243 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2244 {
2245 int i;
2246 for (i=0; i<width; i++)
2247 {
2248 int d= ((uint16_t*)src)[i];
2249 int r= d&0x1F;
2250 int g= (d>>5)&0x1F;
2251 int b= (d>>10)&0x1F;
2253 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2254 }
2255 }
2257 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2258 {
2259 int i;
2260 assert(src1 == src2);
2261 for (i=0; i<width; i++)
2262 {
2263 int d0= ((uint32_t*)src1)[i];
2265 int dl= (d0&0x03E07C1F);
2266 int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F);
2268 int r= d&0x3F;
2269 int b= (d>>10)&0x3F;
2270 int g= d>>21;
2271 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2272 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2273 }
2274 }
2276 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2277 {
2278 int i;
2279 for (i=0; i<width; i++)
2280 {
2281 int d= src[i];
2283 dst[i]= pal[d] & 0xFF;
2284 }
2285 }
2287 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2288 {
2289 int i;
2290 assert(src1 == src2);
2291 for (i=0; i<width; i++)
2292 {
2293 int p= pal[src1[i]];
2295 dstU[i]= p>>8;
2296 dstV[i]= p>>16;
2297 }
2298 }
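/* Together with palToY above taking the low byte of pal[d], the >>8 and >>16
 * here imply the palette entries are packed as (V<<16) | (U<<8) | Y. */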
2300 // Bilinear / Bicubic scaling
2301 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2302 int16_t *filter, int16_t *filterPos, long filterSize)
2303 {
2304 #ifdef HAVE_MMX
2305 assert(filterSize % 4 == 0 && filterSize>0);
2306 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2307 {
2308 long counter= -2*dstW;
2309 filter-= counter*2;
2310 filterPos-= counter/2;
2311 dst-= counter/2;
2312 asm volatile(
2313 #if defined(PIC)
2314 "push %%"REG_b" \n\t"
2315 #endif
2316 "pxor %%mm7, %%mm7 \n\t"
2317 "movq "MANGLE(w02)", %%mm6 \n\t"
2318 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2319 "mov %%"REG_a", %%"REG_BP" \n\t"
2320 ASMALIGN(4)
2321 "1: \n\t"
2322 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2323 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2324 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2325 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2326 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2327 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2328 "punpcklbw %%mm7, %%mm0 \n\t"
2329 "punpcklbw %%mm7, %%mm2 \n\t"
2330 "pmaddwd %%mm1, %%mm0 \n\t"
2331 "pmaddwd %%mm2, %%mm3 \n\t"
2332 "psrad $8, %%mm0 \n\t"
2333 "psrad $8, %%mm3 \n\t"
2334 "packssdw %%mm3, %%mm0 \n\t"
2335 "pmaddwd %%mm6, %%mm0 \n\t"
2336 "packssdw %%mm0, %%mm0 \n\t"
2337 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2338 "add $4, %%"REG_BP" \n\t"
2339 " jnc 1b \n\t"
2341 "pop %%"REG_BP" \n\t"
2342 #if defined(PIC)
2343 "pop %%"REG_b" \n\t"
2344 #endif
2345 : "+a" (counter)
2346 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2347 #if !defined(PIC)
2348 : "%"REG_b
2349 #endif
2350 );
2351 }
2352 else if (filterSize==8)
2353 {
2354 long counter= -2*dstW;
2355 filter-= counter*4;
2356 filterPos-= counter/2;
2357 dst-= counter/2;
2358 asm volatile(
2359 #if defined(PIC)
2360 "push %%"REG_b" \n\t"
2361 #endif
2362 "pxor %%mm7, %%mm7 \n\t"
2363 "movq "MANGLE(w02)", %%mm6 \n\t"
2364 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2365 "mov %%"REG_a", %%"REG_BP" \n\t"
2366 ASMALIGN(4)
2367 "1: \n\t"
2368 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2369 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2370 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2371 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2372 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2373 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2374 "punpcklbw %%mm7, %%mm0 \n\t"
2375 "punpcklbw %%mm7, %%mm2 \n\t"
2376 "pmaddwd %%mm1, %%mm0 \n\t"
2377 "pmaddwd %%mm2, %%mm3 \n\t"
2379 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2380 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2381 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2382 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2383 "punpcklbw %%mm7, %%mm4 \n\t"
2384 "punpcklbw %%mm7, %%mm2 \n\t"
2385 "pmaddwd %%mm1, %%mm4 \n\t"
2386 "pmaddwd %%mm2, %%mm5 \n\t"
2387 "paddd %%mm4, %%mm0 \n\t"
2388 "paddd %%mm5, %%mm3 \n\t"
2390 "psrad $8, %%mm0 \n\t"
2391 "psrad $8, %%mm3 \n\t"
2392 "packssdw %%mm3, %%mm0 \n\t"
2393 "pmaddwd %%mm6, %%mm0 \n\t"
2394 "packssdw %%mm0, %%mm0 \n\t"
2395 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2396 "add $4, %%"REG_BP" \n\t"
2397 " jnc 1b \n\t"
2399 "pop %%"REG_BP" \n\t"
2400 #if defined(PIC)
2401 "pop %%"REG_b" \n\t"
2402 #endif
2403 : "+a" (counter)
2404 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2405 #if !defined(PIC)
2406 : "%"REG_b
2407 #endif
2408 );
2409 }
2410 else
2411 {
2412 uint8_t *offset = src+filterSize;
2413 long counter= -2*dstW;
2414 //filter-= counter*filterSize/2;
2415 filterPos-= counter/2;
2416 dst-= counter/2;
2417 asm volatile(
2418 "pxor %%mm7, %%mm7 \n\t"
2419 "movq "MANGLE(w02)", %%mm6 \n\t"
2420 ASMALIGN(4)
2421 "1: \n\t"
2422 "mov %2, %%"REG_c" \n\t"
2423 "movzwl (%%"REG_c", %0), %%eax \n\t"
2424 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2425 "mov %5, %%"REG_c" \n\t"
2426 "pxor %%mm4, %%mm4 \n\t"
2427 "pxor %%mm5, %%mm5 \n\t"
2428 "2: \n\t"
2429 "movq (%1), %%mm1 \n\t"
2430 "movq (%1, %6), %%mm3 \n\t"
2431 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2432 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2433 "punpcklbw %%mm7, %%mm0 \n\t"
2434 "punpcklbw %%mm7, %%mm2 \n\t"
2435 "pmaddwd %%mm1, %%mm0 \n\t"
2436 "pmaddwd %%mm2, %%mm3 \n\t"
2437 "paddd %%mm3, %%mm5 \n\t"
2438 "paddd %%mm0, %%mm4 \n\t"
2439 "add $8, %1 \n\t"
2440 "add $4, %%"REG_c" \n\t"
2441 "cmp %4, %%"REG_c" \n\t"
2442 " jb 2b \n\t"
2443 "add %6, %1 \n\t"
2444 "psrad $8, %%mm4 \n\t"
2445 "psrad $8, %%mm5 \n\t"
2446 "packssdw %%mm5, %%mm4 \n\t"
2447 "pmaddwd %%mm6, %%mm4 \n\t"
2448 "packssdw %%mm4, %%mm4 \n\t"
2449 "mov %3, %%"REG_a" \n\t"
2450 "movd %%mm4, (%%"REG_a", %0) \n\t"
2451 "add $4, %0 \n\t"
2452 " jnc 1b \n\t"
2454 : "+r" (counter), "+r" (filter)
2455 : "m" (filterPos), "m" (dst), "m"(offset),
2456 "m" (src), "r" (filterSize*2)
2457 : "%"REG_a, "%"REG_c, "%"REG_d
2458 );
2459 }
2460 #else
2461 #ifdef HAVE_ALTIVEC
2462 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2463 #else
2464 int i;
2465 for (i=0; i<dstW; i++)
2466 {
2467 int j;
2468 int srcPos= filterPos[i];
2469 int val=0;
2470 //printf("filterPos: %d\n", filterPos[i]);
2471 for (j=0; j<filterSize; j++)
2472 {
2473 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2474 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2475 }
2476 //filter += hFilterSize;
2477 dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2478 //dst[i] = val>>7;
2479 }
2480 #endif /* HAVE_ALTIVEC */
2481 #endif /* HAVE_MMX */
2482 }
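/* As the C reference above shows, the horizontal scaler outputs samples
 * scaled by 128 (15 bits: the final >>7 plus the clip to 0..(1<<15)-1);
 * the vertical scalers and the *128 edge replication below depend on this
 * scaling. */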
2483 // *** horizontal scale Y line to temp buffer
2484 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2485 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2486 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2487 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2488 int32_t *mmx2FilterPos, uint8_t *pal)
2489 {
2490 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2491 {
2492 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2493 src= formatConvBuffer;
2494 }
2495 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2496 {
2497 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2498 src= formatConvBuffer;
2499 }
2500 else if (srcFormat==PIX_FMT_RGB32)
2501 {
2502 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2503 src= formatConvBuffer;
2504 }
2505 else if (srcFormat==PIX_FMT_BGR24)
2506 {
2507 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2508 src= formatConvBuffer;
2509 }
2510 else if (srcFormat==PIX_FMT_BGR565)
2511 {
2512 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2513 src= formatConvBuffer;
2514 }
2515 else if (srcFormat==PIX_FMT_BGR555)
2516 {
2517 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2518 src= formatConvBuffer;
2519 }
2520 else if (srcFormat==PIX_FMT_BGR32)
2521 {
2522 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2523 src= formatConvBuffer;
2524 }
2525 else if (srcFormat==PIX_FMT_RGB24)
2526 {
2527 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2528 src= formatConvBuffer;
2529 }
2530 else if (srcFormat==PIX_FMT_RGB565)
2531 {
2532 RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2533 src= formatConvBuffer;
2534 }
2535 else if (srcFormat==PIX_FMT_RGB555)
2536 {
2537 RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2538 src= formatConvBuffer;
2539 }
2540 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2541 {
2542 RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2543 src= formatConvBuffer;
2544 }
2546 #ifdef HAVE_MMX
2547 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2548 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2549 #else
2550 if (!(flags&SWS_FAST_BILINEAR))
2551 #endif
2552 {
2553 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2554 }
2555 else // Fast Bilinear upscale / crap downscale
2556 {
2557 #if defined(ARCH_X86)
2558 #ifdef HAVE_MMX2
2559 int i;
2560 #if defined(PIC)
2561 uint64_t ebxsave __attribute__((aligned(8)));
2562 #endif
2563 if (canMMX2BeUsed)
2564 {
2565 asm volatile(
2566 #if defined(PIC)
2567 "mov %%"REG_b", %5 \n\t"
2568 #endif
2569 "pxor %%mm7, %%mm7 \n\t"
2570 "mov %0, %%"REG_c" \n\t"
2571 "mov %1, %%"REG_D" \n\t"
2572 "mov %2, %%"REG_d" \n\t"
2573 "mov %3, %%"REG_b" \n\t"
2574 "xor %%"REG_a", %%"REG_a" \n\t" // i
2575 PREFETCH" (%%"REG_c") \n\t"
2576 PREFETCH" 32(%%"REG_c") \n\t"
2577 PREFETCH" 64(%%"REG_c") \n\t"
2579 #ifdef ARCH_X86_64
2581 #define FUNNY_Y_CODE \
2582 "movl (%%"REG_b"), %%esi \n\t"\
2583 "call *%4 \n\t"\
2584 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2585 "add %%"REG_S", %%"REG_c" \n\t"\
2586 "add %%"REG_a", %%"REG_D" \n\t"\
2587 "xor %%"REG_a", %%"REG_a" \n\t"\
2589 #else
2591 #define FUNNY_Y_CODE \
2592 "movl (%%"REG_b"), %%esi \n\t"\
2593 "call *%4 \n\t"\
2594 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2595 "add %%"REG_a", %%"REG_D" \n\t"\
2596 "xor %%"REG_a", %%"REG_a" \n\t"\
2598 #endif /* ARCH_X86_64 */
2600 FUNNY_Y_CODE
2601 FUNNY_Y_CODE
2602 FUNNY_Y_CODE
2603 FUNNY_Y_CODE
2604 FUNNY_Y_CODE
2605 FUNNY_Y_CODE
2606 FUNNY_Y_CODE
2607 FUNNY_Y_CODE
2609 #if defined(PIC)
2610 "mov %5, %%"REG_b" \n\t"
2611 #endif
2612 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2613 "m" (funnyYCode)
2614 #if defined(PIC)
2615 ,"m" (ebxsave)
2616 #endif
2617 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2618 #if !defined(PIC)
2619 ,"%"REG_b
2620 #endif
2621 );
2622 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2623 }
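/* Outputs whose bilinear source pair ((i*xInc)>>16 and the pixel after it)
 * would reach past the last input pixel are redone above with the edge pixel
 * replicated, keeping the *128 (15-bit) sample scale. */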
2624 else
2625 {
2626 #endif /* HAVE_MMX2 */
2627 long xInc_shr16 = xInc >> 16;
2628 uint16_t xInc_mask = xInc & 0xffff;
2629 //NO MMX just normal asm ...
2630 asm volatile(
2631 "xor %%"REG_a", %%"REG_a" \n\t" // i
2632 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2633 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2634 ASMALIGN(4)
2635 "1: \n\t"
2636 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2637 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2638 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2639 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2640 "shll $16, %%edi \n\t"
2641 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2642 "mov %1, %%"REG_D" \n\t"
2643 "shrl $9, %%esi \n\t"
2644 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2645 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2646 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2648 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2649 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2650 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2651 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2652 "shll $16, %%edi \n\t"
2653 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2654 "mov %1, %%"REG_D" \n\t"
2655 "shrl $9, %%esi \n\t"
2656 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2657 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2658 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2661 "add $2, %%"REG_a" \n\t"
2662 "cmp %2, %%"REG_a" \n\t"
2663 " jb 1b \n\t"
2666 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2667 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2668 );
2669 #ifdef HAVE_MMX2
2670 } //if MMX2 can't be used
2671 #endif
2672 #else
2673 int i;
2674 unsigned int xpos=0;
2675 for (i=0;i<dstWidth;i++)
2676 {
2677 register unsigned int xx=xpos>>16;
2678 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2679 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2680 xpos+=xInc;
2681 }
2682 #endif /* defined(ARCH_X86) */
2683 }
2684 }
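/* In the C reference: xpos is 16.16 fixed point, xalpha keeps the top 7
 * fraction bits, and dst[i]= src[xx]*128 + (src[xx+1]-src[xx])*xalpha is
 * plain linear interpolation emitted in the scaler's 15-bit sample format;
 * the asm paths above compute the same thing. */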
2686 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2687 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2688 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2689 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2690 int32_t *mmx2FilterPos, uint8_t *pal)
2691 {
2692 if (srcFormat==PIX_FMT_YUYV422)
2693 {
2694 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2695 src1= formatConvBuffer;
2696 src2= formatConvBuffer+VOFW;
2697 }
2698 else if (srcFormat==PIX_FMT_UYVY422)
2699 {
2700 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2701 src1= formatConvBuffer;
2702 src2= formatConvBuffer+VOFW;
2703 }
2704 else if (srcFormat==PIX_FMT_RGB32)
2705 {
2706 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2707 src1= formatConvBuffer;
2708 src2= formatConvBuffer+VOFW;
2709 }
2710 else if (srcFormat==PIX_FMT_BGR24)
2711 {
2712 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2713 src1= formatConvBuffer;
2714 src2= formatConvBuffer+VOFW;
2715 }
2716 else if (srcFormat==PIX_FMT_BGR565)
2717 {
2718 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2719 src1= formatConvBuffer;
2720 src2= formatConvBuffer+VOFW;
2721 }
2722 else if (srcFormat==PIX_FMT_BGR555)
2723 {
2724 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2725 src1= formatConvBuffer;
2726 src2= formatConvBuffer+VOFW;
2727 }
2728 else if (srcFormat==PIX_FMT_BGR32)
2729 {
2730 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2731 src1= formatConvBuffer;
2732 src2= formatConvBuffer+VOFW;
2733 }
2734 else if (srcFormat==PIX_FMT_RGB24)
2735 {
2736 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2737 src1= formatConvBuffer;
2738 src2= formatConvBuffer+VOFW;
2739 }
2740 else if (srcFormat==PIX_FMT_RGB565)
2741 {
2742 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2743 src1= formatConvBuffer;
2744 src2= formatConvBuffer+VOFW;
2745 }
2746 else if (srcFormat==PIX_FMT_RGB555)
2747 {
2748 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2749 src1= formatConvBuffer;
2750 src2= formatConvBuffer+VOFW;
2751 }
2752 else if (isGray(srcFormat))
2753 {
2754 return;
2755 }
2756 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2757 {
2758 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2759 src1= formatConvBuffer;
2760 src2= formatConvBuffer+VOFW;
2761 }
2763 #ifdef HAVE_MMX
2764 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2765 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2766 #else
2767 if (!(flags&SWS_FAST_BILINEAR))
2768 #endif
2769 {
2770 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2771 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2772 }
2773 else // Fast Bilinear upscale / crap downscale
2774 {
2775 #if defined(ARCH_X86)
2776 #ifdef HAVE_MMX2
2777 int i;
2778 #if defined(PIC)
2779 uint64_t ebxsave __attribute__((aligned(8)));
2780 #endif
2781 if (canMMX2BeUsed)
2782 {
2783 asm volatile(
2784 #if defined(PIC)
2785 "mov %%"REG_b", %6 \n\t"
2786 #endif
2787 "pxor %%mm7, %%mm7 \n\t"
2788 "mov %0, %%"REG_c" \n\t"
2789 "mov %1, %%"REG_D" \n\t"
2790 "mov %2, %%"REG_d" \n\t"
2791 "mov %3, %%"REG_b" \n\t"
2792 "xor %%"REG_a", %%"REG_a" \n\t" // i
2793 PREFETCH" (%%"REG_c") \n\t"
2794 PREFETCH" 32(%%"REG_c") \n\t"
2795 PREFETCH" 64(%%"REG_c") \n\t"
2797 #ifdef ARCH_X86_64
2799 #define FUNNY_UV_CODE \
2800 "movl (%%"REG_b"), %%esi \n\t"\
2801 "call *%4 \n\t"\
2802 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2803 "add %%"REG_S", %%"REG_c" \n\t"\
2804 "add %%"REG_a", %%"REG_D" \n\t"\
2805 "xor %%"REG_a", %%"REG_a" \n\t"\
2807 #else
2809 #define FUNNY_UV_CODE \
2810 "movl (%%"REG_b"), %%esi \n\t"\
2811 "call *%4 \n\t"\
2812 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2813 "add %%"REG_a", %%"REG_D" \n\t"\
2814 "xor %%"REG_a", %%"REG_a" \n\t"\
2816 #endif /* ARCH_X86_64 */
2818 FUNNY_UV_CODE
2819 FUNNY_UV_CODE
2820 FUNNY_UV_CODE
2821 FUNNY_UV_CODE
2822 "xor %%"REG_a", %%"REG_a" \n\t" // i
2823 "mov %5, %%"REG_c" \n\t" // src
2824 "mov %1, %%"REG_D" \n\t" // buf1
2825 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2826 PREFETCH" (%%"REG_c") \n\t"
2827 PREFETCH" 32(%%"REG_c") \n\t"
2828 PREFETCH" 64(%%"REG_c") \n\t"
2830 FUNNY_UV_CODE
2831 FUNNY_UV_CODE
2832 FUNNY_UV_CODE
2833 FUNNY_UV_CODE
2835 #if defined(PIC)
2836 "mov %6, %%"REG_b" \n\t"
2837 #endif
2838 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2839 "m" (funnyUVCode), "m" (src2)
2840 #if defined(PIC)
2841 ,"m" (ebxsave)
2842 #endif
2843 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2844 #if !defined(PIC)
2845 ,"%"REG_b
2846 #endif
2847 );
2848 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2849 {
2850 //printf("%d %d %d\n", dstWidth, i, srcW);
2851 dst[i] = src1[srcW-1]*128;
2852 dst[i+VOFW] = src2[srcW-1]*128;
2853 }
2854 }
2855 else
2856 {
2857 #endif /* HAVE_MMX2 */
2858 long xInc_shr16 = (long) (xInc >> 16);
2859 uint16_t xInc_mask = xInc & 0xffff;
2860 asm volatile(
2861 "xor %%"REG_a", %%"REG_a" \n\t" // i
2862 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2863 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2864 ASMALIGN(4)
2865 "1: \n\t"
2866 "mov %0, %%"REG_S" \n\t"
2867 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2868 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2869 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2870 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2871 "shll $16, %%edi \n\t"
2872 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2873 "mov %1, %%"REG_D" \n\t"
2874 "shrl $9, %%esi \n\t"
2875 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2877 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2878 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2879 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2880 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2881 "shll $16, %%edi \n\t"
2882 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2883 "mov %1, %%"REG_D" \n\t"
2884 "shrl $9, %%esi \n\t"
2885 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2887 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2888 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2889 "add $1, %%"REG_a" \n\t"
2890 "cmp %2, %%"REG_a" \n\t"
2891 " jb 1b \n\t"
2893 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2894 which is needed to support GCC-4.0 */
2895 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2896 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2897 #else
2898 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2899 #endif
2900 "r" (src2)
2901 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2902 );
2903 #ifdef HAVE_MMX2
2904 } //if MMX2 can't be used
2905 #endif
2906 #else
2907 int i;
2908 unsigned int xpos=0;
2909 for (i=0;i<dstWidth;i++)
2910 {
2911 register unsigned int xx=xpos>>16;
2912 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2913 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2914 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2915 /* slower
2916 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2917 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2918 */
2919 xpos+=xInc;
2920 }
2921 #endif /* defined(ARCH_X86) */
2922 }
2923 }
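/* Since xalpha fits in 7 bits here, xalpha^127 == 127-xalpha, so this is the
 * same bilinear blend as in hyscale, with the (1-alpha) weight computed by
 * an XOR instead of a subtraction. */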
2925 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2926 int srcSliceH, uint8_t* dst[], int dstStride[]){
2928 /* load a few things into local vars to make the code more readable (?) and faster */
2929 const int srcW= c->srcW;
2930 const int dstW= c->dstW;
2931 const int dstH= c->dstH;
2932 const int chrDstW= c->chrDstW;
2933 const int chrSrcW= c->chrSrcW;
2934 const int lumXInc= c->lumXInc;
2935 const int chrXInc= c->chrXInc;
2936 const int dstFormat= c->dstFormat;
2937 const int srcFormat= c->srcFormat;
2938 const int flags= c->flags;
2939 const int canMMX2BeUsed= c->canMMX2BeUsed;
2940 int16_t *vLumFilterPos= c->vLumFilterPos;
2941 int16_t *vChrFilterPos= c->vChrFilterPos;
2942 int16_t *hLumFilterPos= c->hLumFilterPos;
2943 int16_t *hChrFilterPos= c->hChrFilterPos;
2944 int16_t *vLumFilter= c->vLumFilter;
2945 int16_t *vChrFilter= c->vChrFilter;
2946 int16_t *hLumFilter= c->hLumFilter;
2947 int16_t *hChrFilter= c->hChrFilter;
2948 int32_t *lumMmxFilter= c->lumMmxFilter;
2949 int32_t *chrMmxFilter= c->chrMmxFilter;
2950 const int vLumFilterSize= c->vLumFilterSize;
2951 const int vChrFilterSize= c->vChrFilterSize;
2952 const int hLumFilterSize= c->hLumFilterSize;
2953 const int hChrFilterSize= c->hChrFilterSize;
2954 int16_t **lumPixBuf= c->lumPixBuf;
2955 int16_t **chrPixBuf= c->chrPixBuf;
2956 const int vLumBufSize= c->vLumBufSize;
2957 const int vChrBufSize= c->vChrBufSize;
2958 uint8_t *funnyYCode= c->funnyYCode;
2959 uint8_t *funnyUVCode= c->funnyUVCode;
2960 uint8_t *formatConvBuffer= c->formatConvBuffer;
2961 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2962 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2963 int lastDstY;
2964 uint8_t *pal=NULL;
2966 /* vars which will change and which we need to store back in the context */
2967 int dstY= c->dstY;
2968 int lumBufIndex= c->lumBufIndex;
2969 int chrBufIndex= c->chrBufIndex;
2970 int lastInLumBuf= c->lastInLumBuf;
2971 int lastInChrBuf= c->lastInChrBuf;
2973 if (isPacked(c->srcFormat)){
2974 pal= src[1];
2975 src[0]=
2976 src[1]=
2977 src[2]= src[0];
2978 srcStride[0]=
2979 srcStride[1]=
2980 srcStride[2]= srcStride[0];
2981 }
2982 srcStride[1]<<= c->vChrDrop;
2983 srcStride[2]<<= c->vChrDrop;
2985 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2986 // (int)dst[0], (int)dst[1], (int)dst[2]);
2988 #if 0 //self test FIXME move to a vfilter or something
2989 {
2990 static volatile int i=0;
2991 i++;
2992 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2993 selfTest(src, srcStride, c->srcW, c->srcH);
2994 i--;
2995 }
2996 #endif
2998 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2999 //dstStride[0],dstStride[1],dstStride[2]);
3001 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3002 {
3003 static int firstTime=1; //FIXME move this into the context perhaps
3004 if (flags & SWS_PRINT_INFO && firstTime)
3005 {
3006 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
3007 " ->cannot do aligned memory accesses anymore\n");
3008 firstTime=0;
3009 }
3010 }
3012 /* Note: the user might start scaling the picture in the middle, so this will not get executed;
3013 this is not really intended but works currently, so people might do it. */
3014 if (srcSliceY ==0){
3015 lumBufIndex=0;
3016 chrBufIndex=0;
3017 dstY=0;
3018 lastInLumBuf= -1;
3019 lastInChrBuf= -1;
3020 }
3022 lastDstY= dstY;
3024 for (;dstY < dstH; dstY++){
3025 unsigned char *dest =dst[0]+dstStride[0]*dstY;
3026 const int chrDstY= dstY>>c->chrDstVSubSample;
3027 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3028 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
3030 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3031 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3032 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3033 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3035 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3036 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3037 //handle holes (FAST_BILINEAR & weird filters)
3038 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3039 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3040 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3041 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
3042 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
3044 // Do we have enough lines in this slice to output the dstY line?
3045 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3046 {
3047 //Do horizontal scaling
3048 while(lastInLumBuf < lastLumSrcY)
3049 {
3050 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3051 lumBufIndex++;
3052 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
3053 ASSERT(lumBufIndex < 2*vLumBufSize)
3054 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3055 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3056 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3057 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3058 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3059 funnyYCode, c->srcFormat, formatConvBuffer,
3060 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3061 lastInLumBuf++;
3062 }
3063 while(lastInChrBuf < lastChrSrcY)
3064 {
3065 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3066 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3067 chrBufIndex++;
3068 ASSERT(chrBufIndex < 2*vChrBufSize)
3069 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
3070 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3071 //FIXME replace parameters through context struct (some at least)
3073 if (!(isGray(srcFormat) || isGray(dstFormat)))
3074 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3075 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3076 funnyUVCode, c->srcFormat, formatConvBuffer,
3077 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3078 lastInChrBuf++;
3079 }
3080 //wrap buf index around to stay inside the ring buffer
3081 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3082 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3083 }
3084 else // not enough lines left in this slice -> load the rest in the buffer
3085 {
3086 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3087 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3088 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3089 vChrBufSize, vLumBufSize);*/
3091 //Do horizontal scaling
3092 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3093 {
3094 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3095 lumBufIndex++;
3096 ASSERT(lumBufIndex < 2*vLumBufSize)
3097 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3098 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3099 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3100 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3101 funnyYCode, c->srcFormat, formatConvBuffer,
3102 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3103 lastInLumBuf++;
3104 }
3105 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3106 {
3107 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3108 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3109 chrBufIndex++;
3110 ASSERT(chrBufIndex < 2*vChrBufSize)
3111 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
3112 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3114 if (!(isGray(srcFormat) || isGray(dstFormat)))
3115 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3116 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3117 funnyUVCode, c->srcFormat, formatConvBuffer,
3118 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3119 lastInChrBuf++;
3120 }
3121 //wrap buf index around to stay inside the ring buffer
3122 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3123 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3124 break; //we can't output a dstY line so let's try with the next slice
3125 }
3127 #ifdef HAVE_MMX
3128 b5Dither= ff_dither8[dstY&1];
3129 g6Dither= ff_dither4[dstY&1];
3130 g5Dither= ff_dither8[dstY&1];
3131 r5Dither= ff_dither8[(dstY+1)&1];
3132 #endif
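/* The dither tables are re-selected per output line: the dstY&1 and
 * (dstY+1)&1 indexing alternates the pattern between lines (with R in
 * opposite phase) so the 15/16 bpp dither does not form static vertical
 * stripes. */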
3133 if (dstY < dstH-2)
3134 {
3135 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3136 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3137 #ifdef HAVE_MMX
3138 int i;
3139 if (flags & SWS_ACCURATE_RND){
3140 for (i=0; i<vLumFilterSize; i+=2){
3141 lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i ];
3142 lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
3143 lumMmxFilter[2*i+2]=
3144 lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ]
3145 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3146 }
3147 for (i=0; i<vChrFilterSize; i+=2){
3148 chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i ];
3149 chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
3150 chrMmxFilter[2*i+2]=
3151 chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ]
3152 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3153 }
3154 }else{
3155 for (i=0; i<vLumFilterSize; i++)
3156 {
3157 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3158 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3159 lumMmxFilter[4*i+2]=
3160 lumMmxFilter[4*i+3]=
3161 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3162 }
3163 for (i=0; i<vChrFilterSize; i++)
3164 {
3165 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3166 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3167 chrMmxFilter[4*i+2]=
3168 chrMmxFilter[4*i+3]=
3169 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3170 }
3171 }
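/* Each 4-entry group written here holds one source line pointer split into
 * its low and high 32 bits plus the 16-bit vertical filter coefficient
 * duplicated into both halves of a dword (*0x10001); this is the layout the
 * MMX vertical scalers read their pointer/coefficient pairs from. */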
3172 #endif
3173 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3174 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3175 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3176 RENAME(yuv2nv12X)(c,
3177 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3178 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3179 dest, uDest, dstW, chrDstW, dstFormat);
3180 }
3181 else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3182 {
3183 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3184 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3185 if (vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
3186 {
3187 int16_t *lumBuf = lumPixBuf[0];
3188 int16_t *chrBuf= chrPixBuf[0];
3189 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3190 }
3191 else //General YV12
3192 {
3193 RENAME(yuv2yuvX)(c,
3194 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3195 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3196 dest, uDest, vDest, dstW, chrDstW);
3197 }
3198 }
3199 else
3200 {
3201 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3202 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3203 if (vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
3204 {
3205 int chrAlpha= vChrFilter[2*dstY+1];
3206 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3207 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3208 }
3209 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
3210 {
3211 int lumAlpha= vLumFilter[2*dstY+1];
3212 int chrAlpha= vChrFilter[2*dstY+1];
3213 lumMmxFilter[2]=
3214 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3215 chrMmxFilter[2]=
3216 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3217 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3218 dest, dstW, lumAlpha, chrAlpha, dstY);
3219 }
3220 else //General RGB
3221 {
3222 RENAME(yuv2packedX)(c,
3223 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3224 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3225 dest, dstW, dstY);
3226 }
3227 }
3229 else // hmm looks like we can't use MMX here without overwriting this array's tail
3230 {
3231 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3232 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3233 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3234 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3235 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3236 yuv2nv12XinC(
3237 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3238 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3239 dest, uDest, dstW, chrDstW, dstFormat);
3240 }
3241 else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
3242 {
3243 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3244 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3245 yuv2yuvXinC(
3246 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3247 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3248 dest, uDest, vDest, dstW, chrDstW);
3249 }
3250 else
3251 {
3252 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3253 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3254 yuv2packedXinC(c,
3255 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3256 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3257 dest, dstW, dstY);
3258 }
3259 }
3260 }
3262 #ifdef HAVE_MMX
3263 asm volatile(SFENCE:::"memory");
3264 asm volatile(EMMS:::"memory");
3265 #endif
3266 /* store changed local vars back in the context */
3267 c->dstY= dstY;
3268 c->lumBufIndex= lumBufIndex;
3269 c->chrBufIndex= chrBufIndex;
3270 c->lastInLumBuf= lastInLumBuf;
3271 c->lastInChrBuf= lastInChrBuf;
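/* the return value is the number of destination lines output for this slice */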
3273 return dstY - lastDstY;
3274 }