better ao/vo profile examples
[mplayer/greg.git] / libswscale / swscale_template.c
blobb15983096a924915e9d76ba5f0f01cacb1d49713
1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * the C code (not assembly, mmx, ...) of this file can be used
21 * under the LGPL license too
/* Select CPU-capability-specific instruction mnemonics for the inline asm
 * below.  The #undefs allow this template to be #included multiple times
 * with different HAVE_* feature macros defined, regenerating each variant. */
24 #undef REAL_MOVNTQ
25 #undef MOVNTQ
26 #undef PAVGB
27 #undef PREFETCH
28 #undef PREFETCHW
29 #undef EMMS
30 #undef SFENCE
32 #ifdef HAVE_3DNOW
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
34 #define EMMS "femms"
35 #else
36 #define EMMS "emms"
37 #endif
/* Prefetch hints: 3DNow! and MMX2 each have their own forms; otherwise a
 * harmless no-op comment is emitted into the asm stream. */
39 #ifdef HAVE_3DNOW
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
42 #elif defined ( HAVE_MMX2 )
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
45 #else
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
48 #endif
/* sfence is needed after the non-temporal movntq stores (MMX2 only). */
50 #ifdef HAVE_MMX2
51 #define SFENCE "sfence"
52 #else
53 #define SFENCE " # nop"
54 #endif
/* Packed byte average: pavgb (MMX2) or pavgusb (3DNow!); undefined otherwise. */
56 #ifdef HAVE_MMX2
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58 #elif defined (HAVE_3DNOW)
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60 #endif
/* MOVNTQ: non-temporal (cache-bypassing) store on MMX2, plain movq otherwise.
 * The REAL_/wrapper pair forces macro expansion of the arguments. */
62 #ifdef HAVE_MMX2
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
64 #else
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
66 #endif
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
69 #ifdef HAVE_ALTIVEC
70 #include "swscale_altivec_template.c"
71 #endif
/* Vertical scaler: multiply-accumulate several source lines with 16-bit
 * filter coefficients (pmulhw), round (VROUNDER_OFFSET bias), shift >>3,
 * pack to unsigned bytes and store 8 pixels per iteration with MOVNTQ.
 * %0 = &c->redDither (base of the SwsContext offsets), %1 = dest, %2 = width.
 * The coefficient/pointer list at `offset`(%0) is a {srcPtr, coeff} array
 * terminated by a NULL pointer (the "test REG_S,REG_S / jnz 1b" check).
 * NOTE(review): the closing ");" of this asm statement appears to have been
 * lost in extraction — confirm against the upstream file. */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
74 asm volatile(\
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
81 "1: \n\t"\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
92 " jnz 1b \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
103 "jb 1b \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
/* Higher-precision variant of YSCALEYUV2YV12X: instead of pmulhw (16-bit
 * products, high halves only) it interleaves two source lines with
 * punpcklwd/punpckhwd and accumulates 32-bit dot products via pmaddwd,
 * only narrowing back to 16 bits (psrad $16, packssdw) after the loop.
 * Same operand contract as YSCALEYUV2YV12X; processes the filter list two
 * source lines per iteration (entries at (REG_d) and 4(REG_d)).
 * NOTE(review): closing ");" likely lost in extraction — confirm upstream. */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
110 asm volatile(\
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
118 ASMALIGN(4) \
119 "1: \n\t"\
120 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $16, %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
143 " jnz 1b \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
165 "jb 1b \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
/* Unscaled (1:1) vertical pass: read 16-bit intermediate samples, shift
 * them down >>7 back to 8-bit range, pack and store 8 bytes per iteration.
 * %0 = src, %1 = dest, %2 = negative counter start; loop runs until REG_a
 * wraps past zero (the "jnc 1b" on the add).  Asm-body fragment only — the
 * surrounding asm volatile(...) statement is supplied by the caller. */
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
174 "1: \n\t"\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
182 "jnc 1b \n\t"
/* NOTE(review): orphaned asm operand/constraint list — not attached to any
 * asm statement here.  In the upstream file this fragment sits inside a
 * block comment (dead example code); the comment delimiters appear to have
 * been lost in extraction.  Confirm against upstream before touching. */
185 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
186 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
187 "r" (dest), "m" (dstW),
188 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
189 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* Opening half of the packed-output vertical scaler.  First inner loop
 * ("2:") accumulates the chroma filter taps: U data at (REG_S,REG_a) and
 * V data 4096 bytes further, results in mm3 (U) / mm4 (V).  It then resets
 * REG_d to the luma filter list and runs a second "2:" loop accumulating
 * luma into mm1 (Y1) / mm7 (Y2).  Each list is NULL-terminated (test/jnz).
 * Deliberately leaves the asm statement open; it is closed by
 * YSCALEYUV2PACKEDX_END after a conversion/write macro is pasted in. */
191 #define YSCALEYUV2PACKEDX \
192 asm volatile(\
193 "xor %%"REG_a", %%"REG_a" \n\t"\
194 ASMALIGN(4)\
195 "nop \n\t"\
196 "1: \n\t"\
197 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
198 "mov (%%"REG_d"), %%"REG_S" \n\t"\
199 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
200 "movq %%mm3, %%mm4 \n\t"\
201 ASMALIGN(4)\
202 "2: \n\t"\
203 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
204 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
205 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
206 "add $16, %%"REG_d" \n\t"\
207 "mov (%%"REG_d"), %%"REG_S" \n\t"\
208 "pmulhw %%mm0, %%mm2 \n\t"\
209 "pmulhw %%mm0, %%mm5 \n\t"\
210 "paddw %%mm2, %%mm3 \n\t"\
211 "paddw %%mm5, %%mm4 \n\t"\
212 "test %%"REG_S", %%"REG_S" \n\t"\
213 " jnz 2b \n\t"\
215 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
218 "movq %%mm1, %%mm7 \n\t"\
219 ASMALIGN(4)\
220 "2: \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
223 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm1 \n\t"\
229 "paddw %%mm5, %%mm7 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
231 " jnz 2b \n\t"\
/* Closes the asm statement opened by YSCALEYUV2PACKEDX(_ACCURATE):
 * operand list (%0 = &c->redDither, %4 = dest, %5 = dstW) and clobbers.
 * The three "dummy" operands keep positions %1-%3 aligned with older
 * operand numbering used inside the pasted asm bodies.
 * NOTE(review): closing ");" likely lost in extraction — confirm upstream. */
233 #define YSCALEYUV2PACKEDX_END \
234 :: "r" (&c->redDither), \
235 "m" (dummy), "m" (dummy), "m" (dummy),\
236 "r" (dest), "m" (dstW) \
237 : "%"REG_a, "%"REG_d, "%"REG_S \
/* High-precision counterpart of YSCALEYUV2PACKEDX: pairs of source lines
 * are interleaved (punpck*wd) and accumulated as 32-bit dot products with
 * pmaddwd, narrowed afterwards (psrad/packssdw) and biased by
 * VROUNDER_OFFSET.  Chroma results are parked in U_TEMP/V_TEMP inside the
 * context because all eight mm registers are needed for the luma loop;
 * they are reloaded into mm3/mm4 at the end so the register layout matches
 * what YSCALEYUV2RGBX expects (mm1=Y1, mm7=Y2, mm3=U, mm4=V).
 * Leaves the asm statement open for a write macro + YSCALEYUV2PACKEDX_END. */
240 #define YSCALEYUV2PACKEDX_ACCURATE \
241 asm volatile(\
242 "xor %%"REG_a", %%"REG_a" \n\t"\
243 ASMALIGN(4)\
244 "nop \n\t"\
245 "1: \n\t"\
246 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
247 "mov (%%"REG_d"), %%"REG_S" \n\t"\
248 "pxor %%mm4, %%mm4 \n\t"\
249 "pxor %%mm5, %%mm5 \n\t"\
250 "pxor %%mm6, %%mm6 \n\t"\
251 "pxor %%mm7, %%mm7 \n\t"\
252 ASMALIGN(4)\
253 "2: \n\t"\
254 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
255 "movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
256 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
257 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
258 "movq %%mm0, %%mm3 \n\t"\
259 "punpcklwd %%mm1, %%mm0 \n\t"\
260 "punpckhwd %%mm1, %%mm3 \n\t"\
261 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
262 "pmaddwd %%mm1, %%mm0 \n\t"\
263 "pmaddwd %%mm1, %%mm3 \n\t"\
264 "paddd %%mm0, %%mm4 \n\t"\
265 "paddd %%mm3, %%mm5 \n\t"\
266 "movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
267 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
268 "add $16, %%"REG_d" \n\t"\
269 "test %%"REG_S", %%"REG_S" \n\t"\
270 "movq %%mm2, %%mm0 \n\t"\
271 "punpcklwd %%mm3, %%mm2 \n\t"\
272 "punpckhwd %%mm3, %%mm0 \n\t"\
273 "pmaddwd %%mm1, %%mm2 \n\t"\
274 "pmaddwd %%mm1, %%mm0 \n\t"\
275 "paddd %%mm2, %%mm6 \n\t"\
276 "paddd %%mm0, %%mm7 \n\t"\
277 " jnz 2b \n\t"\
278 "psrad $16, %%mm4 \n\t"\
279 "psrad $16, %%mm5 \n\t"\
280 "psrad $16, %%mm6 \n\t"\
281 "psrad $16, %%mm7 \n\t"\
282 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
283 "packssdw %%mm5, %%mm4 \n\t"\
284 "packssdw %%mm7, %%mm6 \n\t"\
285 "paddw %%mm0, %%mm4 \n\t"\
286 "paddw %%mm0, %%mm6 \n\t"\
287 "movq %%mm4, "U_TEMP"(%0) \n\t"\
288 "movq %%mm6, "V_TEMP"(%0) \n\t"\
290 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
291 "mov (%%"REG_d"), %%"REG_S" \n\t"\
292 "pxor %%mm1, %%mm1 \n\t"\
293 "pxor %%mm5, %%mm5 \n\t"\
294 "pxor %%mm7, %%mm7 \n\t"\
295 "pxor %%mm6, %%mm6 \n\t"\
296 ASMALIGN(4)\
297 "2: \n\t"\
298 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
299 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
300 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
301 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
302 "movq %%mm0, %%mm3 \n\t"\
303 "punpcklwd %%mm4, %%mm0 \n\t"\
304 "punpckhwd %%mm4, %%mm3 \n\t"\
305 "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
306 "pmaddwd %%mm4, %%mm0 \n\t"\
307 "pmaddwd %%mm4, %%mm3 \n\t"\
308 "paddd %%mm0, %%mm1 \n\t"\
309 "paddd %%mm3, %%mm5 \n\t"\
310 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
311 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
312 "add $16, %%"REG_d" \n\t"\
313 "test %%"REG_S", %%"REG_S" \n\t"\
314 "movq %%mm2, %%mm0 \n\t"\
315 "punpcklwd %%mm3, %%mm2 \n\t"\
316 "punpckhwd %%mm3, %%mm0 \n\t"\
317 "pmaddwd %%mm4, %%mm2 \n\t"\
318 "pmaddwd %%mm4, %%mm0 \n\t"\
319 "paddd %%mm2, %%mm7 \n\t"\
320 "paddd %%mm0, %%mm6 \n\t"\
321 " jnz 2b \n\t"\
322 "psrad $16, %%mm1 \n\t"\
323 "psrad $16, %%mm5 \n\t"\
324 "psrad $16, %%mm7 \n\t"\
325 "psrad $16, %%mm6 \n\t"\
326 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
327 "packssdw %%mm5, %%mm1 \n\t"\
328 "packssdw %%mm6, %%mm7 \n\t"\
329 "paddw %%mm0, %%mm1 \n\t"\
330 "paddw %%mm0, %%mm7 \n\t"\
331 "movq "U_TEMP"(%0), %%mm3 \n\t"\
332 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* YUV -> RGB conversion stage, pasted after YSCALEYUV2PACKEDX(_ACCURATE).
 * Input register contract: mm1=Y1, mm7=Y2, mm3=U, mm4=V (16-bit words).
 * Subtracts the U/V/Y offsets and multiplies by the per-context conversion
 * coefficients stored at fixed offsets from %0 (the SwsContext), then
 * expands per-pixel (punpck*wd), adds luma, and packs to bytes leaving
 * mm2=B, mm4=G, mm5=R, mm7=0 — the layout the WRITE* macros consume. */
334 #define YSCALEYUV2RGBX \
335 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
336 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
337 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
338 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
339 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
340 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
341 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
342 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
343 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
344 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
345 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
346 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
347 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
348 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
349 "paddw %%mm3, %%mm4 \n\t"\
350 "movq %%mm2, %%mm0 \n\t"\
351 "movq %%mm5, %%mm6 \n\t"\
352 "movq %%mm4, %%mm3 \n\t"\
353 "punpcklwd %%mm2, %%mm2 \n\t"\
354 "punpcklwd %%mm5, %%mm5 \n\t"\
355 "punpcklwd %%mm4, %%mm4 \n\t"\
356 "paddw %%mm1, %%mm2 \n\t"\
357 "paddw %%mm1, %%mm5 \n\t"\
358 "paddw %%mm1, %%mm4 \n\t"\
359 "punpckhwd %%mm0, %%mm0 \n\t"\
360 "punpckhwd %%mm6, %%mm6 \n\t"\
361 "punpckhwd %%mm3, %%mm3 \n\t"\
362 "paddw %%mm7, %%mm0 \n\t"\
363 "paddw %%mm7, %%mm6 \n\t"\
364 "paddw %%mm7, %%mm3 \n\t"\
365 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
366 "packuswb %%mm0, %%mm2 \n\t"\
367 "packuswb %%mm6, %%mm5 \n\t"\
368 "packuswb %%mm3, %%mm4 \n\t"\
369 "pxor %%mm7, %%mm7 \n\t"
/* Dead code: disabled full-chroma-resolution YUV->RGB path (bilinear blend
 * of two luma and two chroma lines using the MANGLE()d constant tables).
 * Kept under #if 0 upstream; do not re-enable without review. */
370 #if 0
371 #define FULL_YSCALEYUV2RGB \
372 "pxor %%mm7, %%mm7 \n\t"\
373 "movd %6, %%mm6 \n\t" /*yalpha1*/\
374 "punpcklwd %%mm6, %%mm6 \n\t"\
375 "punpcklwd %%mm6, %%mm6 \n\t"\
376 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
377 "punpcklwd %%mm5, %%mm5 \n\t"\
378 "punpcklwd %%mm5, %%mm5 \n\t"\
379 "xor %%"REG_a", %%"REG_a" \n\t"\
380 ASMALIGN(4)\
381 "1: \n\t"\
382 "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
383 "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
384 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
385 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
386 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
387 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
388 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
389 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
390 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
392 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
393 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
394 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
395 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
396 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
397 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
398 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
399 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
402 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
403 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
404 "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
405 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
406 "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
407 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
408 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
411 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
412 "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
413 "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
414 "paddw %%mm1, %%mm3 \n\t" /* B*/\
415 "paddw %%mm1, %%mm0 \n\t" /* R*/\
416 "packuswb %%mm3, %%mm3 \n\t"\
418 "packuswb %%mm0, %%mm0 \n\t"\
419 "paddw %%mm4, %%mm2 \n\t"\
420 "paddw %%mm2, %%mm1 \n\t" /* G*/\
422 "packuswb %%mm1, %%mm1 \n\t"
423 #endif
/* Two-line vertical interpolation producing packed YUV (no RGB convert):
 * pre-shifts the chroma/luma filter coefficients >>3 in place, then per
 * iteration blends uvbuf0/uvbuf1 and buf0/buf1 as
 * a + (b-a)*alpha (pmulhw by the stored alpha, psraw $7 on the base).
 * Leaves mm1/mm7 = luma pair, mm3/mm4 = U/V for a following WRITE macro.
 * index = loop register (stringized), c = pointer to the SwsContext. */
425 #define REAL_YSCALEYUV2PACKED(index, c) \
426 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
427 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
428 "psraw $3, %%mm0 \n\t"\
429 "psraw $3, %%mm1 \n\t"\
430 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
431 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
432 "xor "#index", "#index" \n\t"\
433 ASMALIGN(4)\
434 "1: \n\t"\
435 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
436 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
437 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
438 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
439 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
440 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
441 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
442 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
443 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
444 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
445 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
446 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
447 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
448 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
449 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
450 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
451 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
452 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
456 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
457 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
458 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
461 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* Two-line vertical interpolation + YUV->RGB conversion.  Same blend
 * scheme as REAL_YSCALEYUV2PACKED (base >>4 here, delta via pmulhw with
 * the stored alpha), then the per-context offset/coefficient multiply and
 * pack sequence identical to YSCALEYUV2RGBX, but addressed via "#c"
 * instead of %0.  Exits with mm2=B, mm4=G, mm5=R, mm7=0 for WRITE*. */
463 #define REAL_YSCALEYUV2RGB(index, c) \
464 "xor "#index", "#index" \n\t"\
465 ASMALIGN(4)\
466 "1: \n\t"\
467 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
468 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
469 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
470 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
471 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
472 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
473 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
474 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
475 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
476 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
477 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
478 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
479 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
480 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
481 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
482 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
483 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
484 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
485 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
486 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
487 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
488 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
489 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
490 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
491 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
492 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
493 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
494 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
495 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
496 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
497 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
498 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
499 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
500 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
501 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
502 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
503 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
504 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
505 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
506 "paddw %%mm3, %%mm4 \n\t"\
507 "movq %%mm2, %%mm0 \n\t"\
508 "movq %%mm5, %%mm6 \n\t"\
509 "movq %%mm4, %%mm3 \n\t"\
510 "punpcklwd %%mm2, %%mm2 \n\t"\
511 "punpcklwd %%mm5, %%mm5 \n\t"\
512 "punpcklwd %%mm4, %%mm4 \n\t"\
513 "paddw %%mm1, %%mm2 \n\t"\
514 "paddw %%mm1, %%mm5 \n\t"\
515 "paddw %%mm1, %%mm4 \n\t"\
516 "punpckhwd %%mm0, %%mm0 \n\t"\
517 "punpckhwd %%mm6, %%mm6 \n\t"\
518 "punpckhwd %%mm3, %%mm3 \n\t"\
519 "paddw %%mm7, %%mm0 \n\t"\
520 "paddw %%mm7, %%mm6 \n\t"\
521 "paddw %%mm7, %%mm3 \n\t"\
522 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
523 "packuswb %%mm0, %%mm2 \n\t"\
524 "packuswb %%mm6, %%mm5 \n\t"\
525 "packuswb %%mm3, %%mm4 \n\t"\
526 "pxor %%mm7, %%mm7 \n\t"
527 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/* Single-source-line ("1") packed variant: no vertical blending, just
 * loads one luma and one chroma line and shifts >>7 back to 8-bit range.
 * Leaves mm1/mm7 = luma, mm3/mm4 = U/V for a following WRITE macro. */
529 #define REAL_YSCALEYUV2PACKED1(index, c) \
530 "xor "#index", "#index" \n\t"\
531 ASMALIGN(4)\
532 "1: \n\t"\
533 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
534 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
535 "psraw $7, %%mm3 \n\t" \
536 "psraw $7, %%mm4 \n\t" \
537 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
538 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
539 "psraw $7, %%mm1 \n\t" \
540 "psraw $7, %%mm7 \n\t" \
542 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* Single-source-line ("1") RGB variant: no vertical blending; one luma and
 * one chroma line are shifted >>4 and fed straight through the same
 * offset/coefficient convert-and-pack sequence as REAL_YSCALEYUV2RGB.
 * Exits with mm2=B, mm4=G, mm5=R, mm7=0 for the WRITE* macros. */
544 #define REAL_YSCALEYUV2RGB1(index, c) \
545 "xor "#index", "#index" \n\t"\
546 ASMALIGN(4)\
547 "1: \n\t"\
548 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
549 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
550 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
551 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
552 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
553 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
554 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
555 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
556 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
557 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
558 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
559 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
560 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
561 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
562 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
563 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
564 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
565 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
566 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
567 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
568 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
569 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
570 "paddw %%mm3, %%mm4 \n\t"\
571 "movq %%mm2, %%mm0 \n\t"\
572 "movq %%mm5, %%mm6 \n\t"\
573 "movq %%mm4, %%mm3 \n\t"\
574 "punpcklwd %%mm2, %%mm2 \n\t"\
575 "punpcklwd %%mm5, %%mm5 \n\t"\
576 "punpcklwd %%mm4, %%mm4 \n\t"\
577 "paddw %%mm1, %%mm2 \n\t"\
578 "paddw %%mm1, %%mm5 \n\t"\
579 "paddw %%mm1, %%mm4 \n\t"\
580 "punpckhwd %%mm0, %%mm0 \n\t"\
581 "punpckhwd %%mm6, %%mm6 \n\t"\
582 "punpckhwd %%mm3, %%mm3 \n\t"\
583 "paddw %%mm7, %%mm0 \n\t"\
584 "paddw %%mm7, %%mm6 \n\t"\
585 "paddw %%mm7, %%mm3 \n\t"\
586 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
587 "packuswb %%mm0, %%mm2 \n\t"\
588 "packuswb %%mm6, %%mm5 \n\t"\
589 "packuswb %%mm3, %%mm4 \n\t"\
590 "pxor %%mm7, %%mm7 \n\t"
591 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/* "1b" packed variant: chroma comes from the unweighted average of two
 * chroma lines (paddw then psrlw $8); luma from a single line, >>7. */
593 #define REAL_YSCALEYUV2PACKED1b(index, c) \
594 "xor "#index", "#index" \n\t"\
595 ASMALIGN(4)\
596 "1: \n\t"\
597 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
598 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
599 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
600 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
601 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
602 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
603 "psrlw $8, %%mm3 \n\t" \
604 "psrlw $8, %%mm4 \n\t" \
605 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
606 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
607 "psraw $7, %%mm1 \n\t" \
608 "psraw $7, %%mm7 \n\t"
609 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
611 // do vertical chrominance interpolation
/* "1b" RGB variant: chroma is the average of two chroma lines
 * (paddw, psrlw $5 — see the FIXME about possible overflow), luma from a
 * single line >>4; then the standard convert-and-pack sequence.
 * Exits with mm2=B, mm4=G, mm5=R, mm7=0 for the WRITE* macros. */
612 #define REAL_YSCALEYUV2RGB1b(index, c) \
613 "xor "#index", "#index" \n\t"\
614 ASMALIGN(4)\
615 "1: \n\t"\
616 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
617 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
618 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
619 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
620 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
621 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
622 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
623 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
624 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
625 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
626 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
627 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
628 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
629 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
630 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
631 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
632 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
633 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
634 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
635 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
636 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
637 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
638 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
639 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
640 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
641 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
642 "paddw %%mm3, %%mm4 \n\t"\
643 "movq %%mm2, %%mm0 \n\t"\
644 "movq %%mm5, %%mm6 \n\t"\
645 "movq %%mm4, %%mm3 \n\t"\
646 "punpcklwd %%mm2, %%mm2 \n\t"\
647 "punpcklwd %%mm5, %%mm5 \n\t"\
648 "punpcklwd %%mm4, %%mm4 \n\t"\
649 "paddw %%mm1, %%mm2 \n\t"\
650 "paddw %%mm1, %%mm5 \n\t"\
651 "paddw %%mm1, %%mm4 \n\t"\
652 "punpckhwd %%mm0, %%mm0 \n\t"\
653 "punpckhwd %%mm6, %%mm6 \n\t"\
654 "punpckhwd %%mm3, %%mm3 \n\t"\
655 "paddw %%mm7, %%mm0 \n\t"\
656 "paddw %%mm7, %%mm6 \n\t"\
657 "paddw %%mm7, %%mm3 \n\t"\
658 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
659 "packuswb %%mm0, %%mm2 \n\t"\
660 "packuswb %%mm6, %%mm5 \n\t"\
661 "packuswb %%mm3, %%mm4 \n\t"\
662 "pxor %%mm7, %%mm7 \n\t"
663 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* Store 8 pixels as 32-bit 0RGB: interleave B/G then R/0 bytes into four
 * 0RGB0RGB quadwords, write 32 bytes with MOVNTQ, advance index by 8 and
 * loop back to label "1" while index < dstw. */
665 #define REAL_WRITEBGR32(dst, dstw, index) \
666 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
667 "movq %%mm2, %%mm1 \n\t" /* B */\
668 "movq %%mm5, %%mm6 \n\t" /* R */\
669 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
670 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
671 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
672 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
673 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
674 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
675 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
676 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
677 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
678 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
680 MOVNTQ(%%mm0, (dst, index, 4))\
681 MOVNTQ(%%mm2, 8(dst, index, 4))\
682 MOVNTQ(%%mm1, 16(dst, index, 4))\
683 MOVNTQ(%%mm3, 24(dst, index, 4))\
685 "add $8, "#index" \n\t"\
686 "cmp "#dstw", "#index" \n\t"\
687 " jb 1b \n\t"
688 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/* Store 8 pixels as RGB565: mask each channel to its top bits
 * (bF8 = 5 bits for B/R, bFC = 6 bits for G), shift into field position,
 * OR the fields together and write 16 bytes.  G's shift into bits 5-10 is
 * completed by the widening punpck + psllq $3.
 * NOTE(review): R appears to keep all 8 bits here before the combine —
 * upstream has an extra "psrlq $3, %%mm5" companion; possibly a dropped
 * line in this extraction.  Verify against the upstream file. */
690 #define REAL_WRITEBGR16(dst, dstw, index) \
691 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
692 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
693 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
694 "psrlq $3, %%mm2 \n\t"\
696 "movq %%mm2, %%mm1 \n\t"\
697 "movq %%mm4, %%mm3 \n\t"\
699 "punpcklbw %%mm7, %%mm3 \n\t"\
700 "punpcklbw %%mm5, %%mm2 \n\t"\
701 "punpckhbw %%mm7, %%mm4 \n\t"\
702 "punpckhbw %%mm5, %%mm1 \n\t"\
704 "psllq $3, %%mm3 \n\t"\
705 "psllq $3, %%mm4 \n\t"\
707 "por %%mm3, %%mm2 \n\t"\
708 "por %%mm4, %%mm1 \n\t"\
710 MOVNTQ(%%mm2, (dst, index, 2))\
711 MOVNTQ(%%mm1, 8(dst, index, 2))\
713 "add $8, "#index" \n\t"\
714 "cmp "#dstw", "#index" \n\t"\
715 " jb 1b \n\t"
716 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
/* Store 8 pixels as RGB555: all three channels masked to 5 bits (bF8),
 * B >>3, R >>1, G completed by the widening punpck + psllq $2; fields are
 * ORed and 16 bytes written per iteration. */
718 #define REAL_WRITEBGR15(dst, dstw, index) \
719 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
720 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
721 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
722 "psrlq $3, %%mm2 \n\t"\
723 "psrlq $1, %%mm5 \n\t"\
725 "movq %%mm2, %%mm1 \n\t"\
726 "movq %%mm4, %%mm3 \n\t"\
728 "punpcklbw %%mm7, %%mm3 \n\t"\
729 "punpcklbw %%mm5, %%mm2 \n\t"\
730 "punpckhbw %%mm7, %%mm4 \n\t"\
731 "punpckhbw %%mm5, %%mm1 \n\t"\
733 "psllq $2, %%mm3 \n\t"\
734 "psllq $2, %%mm4 \n\t"\
736 "por %%mm3, %%mm2 \n\t"\
737 "por %%mm4, %%mm1 \n\t"\
739 MOVNTQ(%%mm2, (dst, index, 2))\
740 MOVNTQ(%%mm1, 8(dst, index, 2))\
742 "add $8, "#index" \n\t"\
743 "cmp "#dstw", "#index" \n\t"\
744 " jb 1b \n\t"
745 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
/* Legacy 24-bit packer: expands to 0RGB0RGB quadwords like WRITEBGR32,
 * then squeezes out the zero bytes with shift/mask (bm* constants) and OR
 * sequences to emit 24 contiguous bytes per 8 pixels.  Superseded by
 * WRITEBGR24MMX/MMX2 below; kept for reference.  Note dst is advanced by
 * 24 each iteration (no scaled addressing, unlike the 16/32-bit writers). */
747 #define WRITEBGR24OLD(dst, dstw, index) \
748 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
749 "movq %%mm2, %%mm1 \n\t" /* B */\
750 "movq %%mm5, %%mm6 \n\t" /* R */\
751 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
752 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
753 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
754 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
755 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
756 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
757 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
758 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
759 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
760 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
762 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
763 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
764 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
765 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
766 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
767 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
768 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
769 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
771 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
772 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
773 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
774 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
775 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
776 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
777 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
778 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
779 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
780 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
781 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
782 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
783 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
785 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
786 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
787 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
788 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
789 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
790 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
791 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
792 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
794 MOVNTQ(%%mm0, (dst))\
795 MOVNTQ(%%mm2, 8(dst))\
796 MOVNTQ(%%mm3, 16(dst))\
797 "add $24, "#dst" \n\t"\
799 "add $8, "#index" \n\t"\
800 "cmp "#dstw", "#index" \n\t"\
801 " jb 1b \n\t"
/* Plain-MMX 24-bit packer: builds four 0RGBRGB0 quadwords via psllq $40 +
 * punpckhdq, then stitches them into three 8-byte stores (24 bytes per 8
 * pixels) with shift/OR pairs.  dst advances by 24 per iteration. */
803 #define WRITEBGR24MMX(dst, dstw, index) \
804 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
805 "movq %%mm2, %%mm1 \n\t" /* B */\
806 "movq %%mm5, %%mm6 \n\t" /* R */\
807 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
808 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
809 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
810 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
811 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
812 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
813 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
814 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
815 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
816 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
818 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
819 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
820 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
821 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
823 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
824 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
825 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
826 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
828 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
829 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
830 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
831 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
833 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
834 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
835 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
836 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
837 MOVNTQ(%%mm0, (dst))\
839 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
840 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
841 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
842 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
843 MOVNTQ(%%mm6, 8(dst))\
845 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
846 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
847 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
848 MOVNTQ(%%mm5, 16(dst))\
850 "add $24, "#dst" \n\t"\
852 "add $8, "#index" \n\t"\
853 "cmp "#dstw", "#index" \n\t"\
854 " jb 1b \n\t"
/* MMX2 variant of the 24-bit BGR writer: uses pshufw to replicate/shuffle
 * the B/G/R bytes and the M24A/M24B/M24C byte masks to merge them into
 * three 8-byte output quadwords (24 bytes = 8 pixels) per iteration.
 * Same loop contract as WRITEBGR24MMX: dst += 24, index += 8, jb 1b. */
856 #define WRITEBGR24MMX2(dst, dstw, index) \
857 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
858 "movq "MANGLE(M24A)", %%mm0 \n\t"\
859 "movq "MANGLE(M24C)", %%mm7 \n\t"\
860 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
861 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
862 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
864 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
865 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
866 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
868 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
869 "por %%mm1, %%mm6 \n\t"\
870 "por %%mm3, %%mm6 \n\t"\
871 MOVNTQ(%%mm6, (dst))\
873 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
874 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
875 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
876 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
878 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
879 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
880 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
882 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
883 "por %%mm3, %%mm6 \n\t"\
884 MOVNTQ(%%mm6, 8(dst))\
886 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
887 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
888 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
890 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
891 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
892 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
894 "por %%mm1, %%mm3 \n\t"\
895 "por %%mm3, %%mm6 \n\t"\
896 MOVNTQ(%%mm6, 16(dst))\
898 "add $24, "#dst" \n\t"\
900 "add $8, "#index" \n\t"\
901 "cmp "#dstw", "#index" \n\t"\
902 " jb 1b \n\t"
/* Select the fastest available 24-bit BGR writer for this template
 * instantiation: pshufw-based on MMX2, punpck-based on plain MMX. */
904 #ifdef HAVE_MMX2
905 #undef WRITEBGR24
906 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
907 #else
908 #undef WRITEBGR24
909 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
910 #endif
/* Interleave Y (%%mm3/%%mm4... per the callers' register contract) with
 * packed U/V into YUYV order and store 16 bytes per iteration at
 * dst + 2*index. The REAL_/plain pair exists so macro arguments are
 * expanded before token pasting. */
912 #define REAL_WRITEYUY2(dst, dstw, index) \
913 "packuswb %%mm3, %%mm3 \n\t"\
914 "packuswb %%mm4, %%mm4 \n\t"\
915 "packuswb %%mm7, %%mm1 \n\t"\
916 "punpcklbw %%mm4, %%mm3 \n\t"\
917 "movq %%mm1, %%mm7 \n\t"\
918 "punpcklbw %%mm3, %%mm1 \n\t"\
919 "punpckhbw %%mm3, %%mm7 \n\t"\
921 MOVNTQ(%%mm1, (dst, index, 2))\
922 MOVNTQ(%%mm7, 8(dst, index, 2))\
924 "add $8, "#index" \n\t"\
925 "cmp "#dstw", "#index" \n\t"\
926 " jb 1b \n\t"
927 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* Vertically filter/scale multiple source lines into planar YV12 output
 * (dest = Y, uDest/vDest = chroma). On MMX it picks the bit-exact
 * YSCALEYUV2YV12X_ACCURATE path when SWS_ACCURATE_RND is set, otherwise
 * the faster approximation; without MMX it falls back to AltiVec or the
 * portable C implementation. uDest == NULL skips chroma output.
 * NOTE(review): the 4096 argument appears to select the V half of the
 * chroma filter/source buffers — confirm against the macro definitions. */
930 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
931 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
932 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
934 #ifdef HAVE_MMX
935 if (c->flags & SWS_ACCURATE_RND){
936 if (uDest){
937 YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
938 YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
941 YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
942 }else{
943 if (uDest){
944 YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
945 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
948 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
950 #else
951 #ifdef HAVE_ALTIVEC
952 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
953 chrFilter, chrSrc, chrFilterSize,
954 dest, uDest, vDest, dstW, chrDstW);
955 #else //HAVE_ALTIVEC
956 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
957 chrFilter, chrSrc, chrFilterSize,
958 dest, uDest, vDest, dstW, chrDstW);
959 #endif //!HAVE_ALTIVEC
960 #endif /* HAVE_MMX */
/* NV12/NV21 vertical scaler: no SIMD version exists, so this template
 * instantiation simply delegates to the portable C implementation. */
963 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
964 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
965 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
967 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
968 chrFilter, chrSrc, chrFilterSize,
969 dest, uDest, dstW, chrDstW, dstFormat);
/* Unfiltered 1:1 vertical pass: convert one line of 16-bit intermediate
 * samples (values scaled by 2^7) back to 8-bit planar YV12 with clipping.
 * The V samples live at chrSrc + 2048 (fixed half-plane offset).
 * MMX path uses the YSCALEYUV2YV121 packing loop; C path shifts by 7 and
 * clamps to [0,255]. uDest == NULL skips chroma. */
972 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
973 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
975 #ifdef HAVE_MMX
976 if (uDest != NULL)
978 asm volatile(
979 YSCALEYUV2YV121
980 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
981 "g" (-chrDstW)
982 : "%"REG_a
985 asm volatile(
986 YSCALEYUV2YV121
987 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
988 "g" (-chrDstW)
989 : "%"REG_a
993 asm volatile(
994 YSCALEYUV2YV121
995 :: "r" (lumSrc + dstW), "r" (dest + dstW),
996 "g" (-dstW)
997 : "%"REG_a
999 #else
1000 int i;
1001 for (i=0; i<dstW; i++)
1003 int val= lumSrc[i]>>7;
/* (val & 256) is a cheap "outside [0,255]" test after the shift */
1005 if (val&256){
1006 if (val<0) val=0;
1007 else val=255;
1010 dest[i]= val;
1013 if (uDest != NULL)
1014 for (i=0; i<chrDstW; i++)
1016 int u=chrSrc[i]>>7;
1017 int v=chrSrc[i + 2048]>>7;
1019 if ((u|v)&256){
1020 if (u<0) u=0;
1021 else if (u>255) u=255;
1022 if (v<0) v=0;
1023 else if (v>255) v=255;
1026 uDest[i]= u;
1027 vDest[i]= v;
1029 #endif
1034 * vertical scale YV12 to RGB
/* Vertically filter planar YUV into a packed output line. The MMX path
 * dispatches on c->dstFormat (RGB32 / BGR24 / BGR555 / BGR565 / YUYV),
 * optionally with the accurate-rounding variants when SWS_ACCURATE_RND
 * is set; the 15/16-bpp cases add ordered dither under DITHER1XBPP.
 * Falls back to AltiVec for the formats it supports, else the C path. */
1036 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1037 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1038 uint8_t *dest, long dstW, long dstY)
1040 #ifdef HAVE_MMX
1041 long dummy=0;
1042 if (c->flags & SWS_ACCURATE_RND){
1043 switch(c->dstFormat){
1044 case PIX_FMT_RGB32:
1045 YSCALEYUV2PACKEDX_ACCURATE
1046 YSCALEYUV2RGBX
1047 WRITEBGR32(%4, %5, %%REGa)
1049 YSCALEYUV2PACKEDX_END
1050 return;
1051 case PIX_FMT_BGR24:
1052 YSCALEYUV2PACKEDX_ACCURATE
1053 YSCALEYUV2RGBX
/* dst advances 3 bytes/pixel, so compute dest + 3*index in REG_c */
1054 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1055 "add %4, %%"REG_c" \n\t"
1056 WRITEBGR24(%%REGc, %5, %%REGa)
1059 :: "r" (&c->redDither),
1060 "m" (dummy), "m" (dummy), "m" (dummy),
1061 "r" (dest), "m" (dstW)
1062 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1064 return;
1065 case PIX_FMT_BGR555:
1066 YSCALEYUV2PACKEDX_ACCURATE
1067 YSCALEYUV2RGBX
1068 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1069 #ifdef DITHER1XBPP
1070 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1071 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1072 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1073 #endif
1075 WRITEBGR15(%4, %5, %%REGa)
1076 YSCALEYUV2PACKEDX_END
1077 return;
1078 case PIX_FMT_BGR565:
1079 YSCALEYUV2PACKEDX_ACCURATE
1080 YSCALEYUV2RGBX
1081 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1082 #ifdef DITHER1XBPP
1083 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1084 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1085 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1086 #endif
1088 WRITEBGR16(%4, %5, %%REGa)
1089 YSCALEYUV2PACKEDX_END
1090 return;
1091 case PIX_FMT_YUYV422:
1092 YSCALEYUV2PACKEDX_ACCURATE
1093 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* no RGB conversion needed: just rescale Y/U/V before YUY2 packing */
1095 "psraw $3, %%mm3 \n\t"
1096 "psraw $3, %%mm4 \n\t"
1097 "psraw $3, %%mm1 \n\t"
1098 "psraw $3, %%mm7 \n\t"
1099 WRITEYUY2(%4, %5, %%REGa)
1100 YSCALEYUV2PACKEDX_END
1101 return;
1103 }else{
1104 switch(c->dstFormat)
1106 case PIX_FMT_RGB32:
1107 YSCALEYUV2PACKEDX
1108 YSCALEYUV2RGBX
1109 WRITEBGR32(%4, %5, %%REGa)
1110 YSCALEYUV2PACKEDX_END
1111 return;
1112 case PIX_FMT_BGR24:
1113 YSCALEYUV2PACKEDX
1114 YSCALEYUV2RGBX
1115 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1116 "add %4, %%"REG_c" \n\t"
1117 WRITEBGR24(%%REGc, %5, %%REGa)
1119 :: "r" (&c->redDither),
1120 "m" (dummy), "m" (dummy), "m" (dummy),
1121 "r" (dest), "m" (dstW)
1122 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1124 return;
1125 case PIX_FMT_BGR555:
1126 YSCALEYUV2PACKEDX
1127 YSCALEYUV2RGBX
1128 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1129 #ifdef DITHER1XBPP
1130 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1131 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1132 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1133 #endif
1135 WRITEBGR15(%4, %5, %%REGa)
1136 YSCALEYUV2PACKEDX_END
1137 return;
1138 case PIX_FMT_BGR565:
1139 YSCALEYUV2PACKEDX
1140 YSCALEYUV2RGBX
1141 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1142 #ifdef DITHER1XBPP
1143 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1144 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1145 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1146 #endif
1148 WRITEBGR16(%4, %5, %%REGa)
1149 YSCALEYUV2PACKEDX_END
1150 return;
1151 case PIX_FMT_YUYV422:
1152 YSCALEYUV2PACKEDX
1153 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1155 "psraw $3, %%mm3 \n\t"
1156 "psraw $3, %%mm4 \n\t"
1157 "psraw $3, %%mm1 \n\t"
1158 "psraw $3, %%mm7 \n\t"
1159 WRITEYUY2(%4, %5, %%REGa)
1160 YSCALEYUV2PACKEDX_END
1161 return;
1164 #endif /* HAVE_MMX */
1165 #ifdef HAVE_ALTIVEC
1166 /* The following list of supported dstFormat values should
1167 match what's found in the body of altivec_yuv2packedX() */
1168 if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1169 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1170 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1171 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1172 chrFilter, chrSrc, chrFilterSize,
1173 dest, dstW, dstY);
1174 else
1175 #endif
1176 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1177 chrFilter, chrSrc, chrFilterSize,
1178 dest, dstW, dstY);
1182 * vertical bilinear scale YV12 to RGB
/* Blend two source lines (buf0/buf1 weighted by yalpha, uvbuf0/uvbuf1 by
 * uvalpha) and convert to a packed output line. The leading #if 0 region
 * is a long-dead SWS_FULL_CHR_H_INT path kept for reference; the live
 * code is the MMX dstFormat switch below it plus the C fallback macro.
 * yalpha^4095 / uvalpha^4095 compute the complementary 12-bit weights. */
1184 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1185 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1187 int yalpha1=yalpha^4095;
1188 int uvalpha1=uvalpha^4095;
1189 int i;
1191 #if 0 //isn't used
1192 if (flags&SWS_FULL_CHR_H_INT)
1194 switch(dstFormat)
1196 #ifdef HAVE_MMX
1197 case PIX_FMT_RGB32:
1198 asm volatile(
1201 FULL_YSCALEYUV2RGB
1202 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1203 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1205 "movq %%mm3, %%mm1 \n\t"
1206 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1207 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1209 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1210 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1212 "add $4, %%"REG_a" \n\t"
1213 "cmp %5, %%"REG_a" \n\t"
1214 " jb 1b \n\t"
1216 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1217 "m" (yalpha1), "m" (uvalpha1)
1218 : "%"REG_a
1220 break;
1221 case PIX_FMT_BGR24:
1222 asm volatile(
1224 FULL_YSCALEYUV2RGB
1226 // lsb ... msb
1227 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1228 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1230 "movq %%mm3, %%mm1 \n\t"
1231 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1232 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1234 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1235 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1236 "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
1237 "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
1238 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1239 "movq %%mm1, %%mm2 \n\t"
1240 "psllq $48, %%mm1 \n\t" // 000000BG
1241 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1243 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1244 "psrld $16, %%mm2 \n\t" // R000R000
1245 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1246 "por %%mm2, %%mm1 \n\t" // RBGRR000
1248 "mov %4, %%"REG_b" \n\t"
1249 "add %%"REG_a", %%"REG_b" \n\t"
1251 #ifdef HAVE_MMX2
1252 //FIXME Alignment
1253 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1254 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1255 #else
1256 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1257 "psrlq $32, %%mm3 \n\t"
1258 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1259 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1260 #endif
1261 "add $4, %%"REG_a" \n\t"
1262 "cmp %5, %%"REG_a" \n\t"
1263 " jb 1b \n\t"
1265 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1266 "m" (yalpha1), "m" (uvalpha1)
1267 : "%"REG_a, "%"REG_b
1269 break;
1270 case PIX_FMT_BGR555:
1271 asm volatile(
1273 FULL_YSCALEYUV2RGB
1274 #ifdef DITHER1XBPP
1275 "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
1276 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1277 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1278 #endif
1279 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1280 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1281 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1283 "psrlw $3, %%mm3 \n\t"
1284 "psllw $2, %%mm1 \n\t"
1285 "psllw $7, %%mm0 \n\t"
1286 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1287 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1289 "por %%mm3, %%mm1 \n\t"
1290 "por %%mm1, %%mm0 \n\t"
1292 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1294 "add $4, %%"REG_a" \n\t"
1295 "cmp %5, %%"REG_a" \n\t"
1296 " jb 1b \n\t"
1298 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1299 "m" (yalpha1), "m" (uvalpha1)
1300 : "%"REG_a
1302 break;
1303 case PIX_FMT_BGR565:
1304 asm volatile(
1306 FULL_YSCALEYUV2RGB
1307 #ifdef DITHER1XBPP
1308 "paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
1309 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1310 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1311 #endif
1312 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1313 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1314 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1316 "psrlw $3, %%mm3 \n\t"
1317 "psllw $3, %%mm1 \n\t"
1318 "psllw $8, %%mm0 \n\t"
1319 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1320 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1322 "por %%mm3, %%mm1 \n\t"
1323 "por %%mm1, %%mm0 \n\t"
1325 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1327 "add $4, %%"REG_a" \n\t"
1328 "cmp %5, %%"REG_a" \n\t"
1329 " jb 1b \n\t"
1331 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1332 "m" (yalpha1), "m" (uvalpha1)
1333 : "%"REG_a
1335 break;
1336 #endif /* HAVE_MMX */
1337 case PIX_FMT_BGR32:
1338 #ifndef HAVE_MMX
1339 case PIX_FMT_RGB32:
1340 #endif
1341 if (dstFormat==PIX_FMT_RGB32)
1343 int i;
1344 #ifdef WORDS_BIGENDIAN
1345 dest++;
1346 #endif
1347 for (i=0;i<dstW;i++){
1348 // vertical linear interpolation && yuv2rgb in a single step:
1349 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1350 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1351 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1352 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1353 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1354 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1355 dest+= 4;
1358 else if (dstFormat==PIX_FMT_BGR24)
1360 int i;
1361 for (i=0;i<dstW;i++){
1362 // vertical linear interpolation && yuv2rgb in a single step:
1363 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1364 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1365 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1366 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1367 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1368 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1369 dest+= 3;
1372 else if (dstFormat==PIX_FMT_BGR565)
1374 int i;
1375 for (i=0;i<dstW;i++){
1376 // vertical linear interpolation && yuv2rgb in a single step:
1377 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1378 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1379 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1381 ((uint16_t*)dest)[i] =
1382 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1383 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1384 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1387 else if (dstFormat==PIX_FMT_BGR555)
1389 int i;
1390 for (i=0;i<dstW;i++){
1391 // vertical linear interpolation && yuv2rgb in a single step:
1392 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1393 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1394 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1396 ((uint16_t*)dest)[i] =
1397 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1398 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1399 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1402 }//FULL_UV_IPOL
1403 else
1405 #endif // if 0
/* live code starts here: the MMX fast paths save/restore REG_b and
 * repurpose the frame pointer (REG_BP) as the loop index, hence the
 * explicit push/pop around each asm block */
1406 #ifdef HAVE_MMX
1407 switch(c->dstFormat)
1409 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1410 case PIX_FMT_RGB32:
1411 asm volatile(
1412 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1413 "mov %4, %%"REG_b" \n\t"
1414 "push %%"REG_BP" \n\t"
1415 YSCALEYUV2RGB(%%REGBP, %5)
1416 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1417 "pop %%"REG_BP" \n\t"
1418 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1420 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1421 "a" (&c->redDither)
1423 return;
1424 case PIX_FMT_BGR24:
1425 asm volatile(
1426 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1427 "mov %4, %%"REG_b" \n\t"
1428 "push %%"REG_BP" \n\t"
1429 YSCALEYUV2RGB(%%REGBP, %5)
1430 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1431 "pop %%"REG_BP" \n\t"
1432 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1433 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1434 "a" (&c->redDither)
1436 return;
1437 case PIX_FMT_BGR555:
1438 asm volatile(
1439 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1440 "mov %4, %%"REG_b" \n\t"
1441 "push %%"REG_BP" \n\t"
1442 YSCALEYUV2RGB(%%REGBP, %5)
1443 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1444 #ifdef DITHER1XBPP
1445 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1446 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1447 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1448 #endif
1450 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1451 "pop %%"REG_BP" \n\t"
1452 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1454 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1455 "a" (&c->redDither)
1457 return;
1458 case PIX_FMT_BGR565:
1459 asm volatile(
1460 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1461 "mov %4, %%"REG_b" \n\t"
1462 "push %%"REG_BP" \n\t"
1463 YSCALEYUV2RGB(%%REGBP, %5)
1464 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1465 #ifdef DITHER1XBPP
1466 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1467 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1468 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1469 #endif
1471 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1472 "pop %%"REG_BP" \n\t"
1473 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1474 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1475 "a" (&c->redDither)
1477 return;
1478 case PIX_FMT_YUYV422:
1479 asm volatile(
1480 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1481 "mov %4, %%"REG_b" \n\t"
1482 "push %%"REG_BP" \n\t"
1483 YSCALEYUV2PACKED(%%REGBP, %5)
1484 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1485 "pop %%"REG_BP" \n\t"
1486 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1487 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1488 "a" (&c->redDither)
1490 return;
1491 default: break;
1493 #endif //HAVE_MMX
1494 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1498 * YV12 to RGB without scaling or interpolating
/* Single-source-line packed output. uvalpha < 2048 means the chroma
 * weight is close enough to use one chroma line (YSCALEYUV2*1 variants,
 * at a known 0.5-pixel chroma shift); otherwise both chroma lines are
 * averaged (the *1b variants). SWS_FULL_CHR_H_INT falls back to
 * yuv2packed2 with buf0 used for both luma inputs. */
1500 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1501 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1503 const int yalpha1=0;
1504 int i;
1506 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1507 const int yalpha= 4096; //FIXME ...
1509 if (flags&SWS_FULL_CHR_H_INT)
1511 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1512 return;
1515 #ifdef HAVE_MMX
1516 if ( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1518 switch(dstFormat)
1520 case PIX_FMT_RGB32:
1521 asm volatile(
1522 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1523 "mov %4, %%"REG_b" \n\t"
1524 "push %%"REG_BP" \n\t"
1525 YSCALEYUV2RGB1(%%REGBP, %5)
1526 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1527 "pop %%"REG_BP" \n\t"
1528 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1530 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1531 "a" (&c->redDither)
1533 return;
1534 case PIX_FMT_BGR24:
1535 asm volatile(
1536 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1537 "mov %4, %%"REG_b" \n\t"
1538 "push %%"REG_BP" \n\t"
1539 YSCALEYUV2RGB1(%%REGBP, %5)
1540 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1541 "pop %%"REG_BP" \n\t"
1542 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1544 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1545 "a" (&c->redDither)
1547 return;
1548 case PIX_FMT_BGR555:
1549 asm volatile(
1550 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1551 "mov %4, %%"REG_b" \n\t"
1552 "push %%"REG_BP" \n\t"
1553 YSCALEYUV2RGB1(%%REGBP, %5)
1554 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1555 #ifdef DITHER1XBPP
1556 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1557 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1558 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1559 #endif
1560 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1561 "pop %%"REG_BP" \n\t"
1562 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1564 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1565 "a" (&c->redDither)
1567 return;
1568 case PIX_FMT_BGR565:
1569 asm volatile(
1570 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1571 "mov %4, %%"REG_b" \n\t"
1572 "push %%"REG_BP" \n\t"
1573 YSCALEYUV2RGB1(%%REGBP, %5)
1574 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1575 #ifdef DITHER1XBPP
1576 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1577 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1578 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1579 #endif
1581 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1582 "pop %%"REG_BP" \n\t"
1583 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1585 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1586 "a" (&c->redDither)
1588 return;
1589 case PIX_FMT_YUYV422:
1590 asm volatile(
1591 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1592 "mov %4, %%"REG_b" \n\t"
1593 "push %%"REG_BP" \n\t"
1594 YSCALEYUV2PACKED1(%%REGBP, %5)
1595 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1596 "pop %%"REG_BP" \n\t"
1597 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1599 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1600 "a" (&c->redDither)
1602 return;
1605 else
/* uvalpha >= 2048: average the two chroma lines (the *1b variants) */
1607 switch(dstFormat)
1609 case PIX_FMT_RGB32:
1610 asm volatile(
1611 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1612 "mov %4, %%"REG_b" \n\t"
1613 "push %%"REG_BP" \n\t"
1614 YSCALEYUV2RGB1b(%%REGBP, %5)
1615 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1616 "pop %%"REG_BP" \n\t"
1617 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1619 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1620 "a" (&c->redDither)
1622 return;
1623 case PIX_FMT_BGR24:
1624 asm volatile(
1625 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1626 "mov %4, %%"REG_b" \n\t"
1627 "push %%"REG_BP" \n\t"
1628 YSCALEYUV2RGB1b(%%REGBP, %5)
1629 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1630 "pop %%"REG_BP" \n\t"
1631 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1633 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1634 "a" (&c->redDither)
1636 return;
1637 case PIX_FMT_BGR555:
1638 asm volatile(
1639 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1640 "mov %4, %%"REG_b" \n\t"
1641 "push %%"REG_BP" \n\t"
1642 YSCALEYUV2RGB1b(%%REGBP, %5)
1643 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1644 #ifdef DITHER1XBPP
1645 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1646 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1647 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1648 #endif
1649 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1650 "pop %%"REG_BP" \n\t"
1651 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1653 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1654 "a" (&c->redDither)
1656 return;
1657 case PIX_FMT_BGR565:
1658 asm volatile(
1659 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1660 "mov %4, %%"REG_b" \n\t"
1661 "push %%"REG_BP" \n\t"
1662 YSCALEYUV2RGB1b(%%REGBP, %5)
1663 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1664 #ifdef DITHER1XBPP
1665 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1666 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1667 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1668 #endif
1670 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1671 "pop %%"REG_BP" \n\t"
1672 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1674 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1675 "a" (&c->redDither)
1677 return;
1678 case PIX_FMT_YUYV422:
1679 asm volatile(
1680 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1681 "mov %4, %%"REG_b" \n\t"
1682 "push %%"REG_BP" \n\t"
1683 YSCALEYUV2PACKED1b(%%REGBP, %5)
1684 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1685 "pop %%"REG_BP" \n\t"
1686 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1688 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1689 "a" (&c->redDither)
1691 return;
1694 #endif /* HAVE_MMX */
1695 if ( uvalpha < 2048 )
1697 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1698 }else{
1699 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1703 //FIXME the yuy2* input readers below may read up to 7 samples too many
/* Extract the Y (even) bytes from a YUYV line into dst. The MMX loop
 * masks with bm01010101 and packs 16 input bytes -> 8 Y bytes per
 * iteration, indexing from the end of the buffers with a negative
 * counter so a single "js 1b" terminates the loop. */
1705 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1707 #ifdef HAVE_MMX
1708 asm volatile(
1709 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1710 "mov %0, %%"REG_a" \n\t"
1711 "1: \n\t"
1712 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1713 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1714 "pand %%mm2, %%mm0 \n\t"
1715 "pand %%mm2, %%mm1 \n\t"
1716 "packuswb %%mm1, %%mm0 \n\t"
1717 "movq %%mm0, (%2, %%"REG_a") \n\t"
1718 "add $8, %%"REG_a" \n\t"
1719 " js 1b \n\t"
1720 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1721 : "%"REG_a
1723 #else
1724 int i;
1725 for (i=0; i<width; i++)
1726 dst[i]= src[2*i];
1727 #endif
/* Deinterleave the U (offset 1) and V (offset 3) bytes of a YUYV line
 * into separate dstU/dstV planes, 4 chroma pairs per MMX iteration.
 * src2 is unused; the assert documents that callers pass src1 == src2. */
1730 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1732 #ifdef HAVE_MMX
1733 asm volatile(
1734 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1735 "mov %0, %%"REG_a" \n\t"
1736 "1: \n\t"
1737 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1738 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1739 "psrlw $8, %%mm0 \n\t"
1740 "psrlw $8, %%mm1 \n\t"
1741 "packuswb %%mm1, %%mm0 \n\t"
1742 "movq %%mm0, %%mm1 \n\t"
1743 "psrlw $8, %%mm0 \n\t"
1744 "pand %%mm4, %%mm1 \n\t"
1745 "packuswb %%mm0, %%mm0 \n\t"
1746 "packuswb %%mm1, %%mm1 \n\t"
1747 "movd %%mm0, (%3, %%"REG_a") \n\t"
1748 "movd %%mm1, (%2, %%"REG_a") \n\t"
1749 "add $4, %%"REG_a" \n\t"
1750 " js 1b \n\t"
1751 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1752 : "%"REG_a
1754 #else
1755 int i;
1756 for (i=0; i<width; i++)
1758 dstU[i]= src1[4*i + 1];
1759 dstV[i]= src1[4*i + 3];
1761 #endif
1762 assert(src1 == src2);
1765 //this is almost identical to the previous function, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
/* Extract the Y (odd) bytes from a UYVY line into dst: same loop shape
 * as yuy2ToY but selects Y with a right shift instead of a mask. */
1766 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1768 #ifdef HAVE_MMX
1769 asm volatile(
1770 "mov %0, %%"REG_a" \n\t"
1771 "1: \n\t"
1772 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1773 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1774 "psrlw $8, %%mm0 \n\t"
1775 "psrlw $8, %%mm1 \n\t"
1776 "packuswb %%mm1, %%mm0 \n\t"
1777 "movq %%mm0, (%2, %%"REG_a") \n\t"
1778 "add $8, %%"REG_a" \n\t"
1779 " js 1b \n\t"
1780 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1781 : "%"REG_a
1783 #else
1784 int i;
1785 for (i=0; i<width; i++)
1786 dst[i]= src[2*i+1];
1787 #endif
/* Deinterleave the U (offset 0) and V (offset 2) bytes of a UYVY line
 * into dstU/dstV; mirror of yuy2ToUV with mask/shift roles swapped.
 * src2 is unused; the assert documents that callers pass src1 == src2. */
1790 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1792 #ifdef HAVE_MMX
1793 asm volatile(
1794 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1795 "mov %0, %%"REG_a" \n\t"
1796 "1: \n\t"
1797 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1798 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1799 "pand %%mm4, %%mm0 \n\t"
1800 "pand %%mm4, %%mm1 \n\t"
1801 "packuswb %%mm1, %%mm0 \n\t"
1802 "movq %%mm0, %%mm1 \n\t"
1803 "psrlw $8, %%mm0 \n\t"
1804 "pand %%mm4, %%mm1 \n\t"
1805 "packuswb %%mm0, %%mm0 \n\t"
1806 "packuswb %%mm1, %%mm1 \n\t"
1807 "movd %%mm0, (%3, %%"REG_a") \n\t"
1808 "movd %%mm1, (%2, %%"REG_a") \n\t"
1809 "add $4, %%"REG_a" \n\t"
1810 " js 1b \n\t"
1811 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1812 : "%"REG_a
1814 #else
1815 int i;
1816 for (i=0; i<width; i++)
1818 dstU[i]= src1[4*i + 0];
1819 dstV[i]= src1[4*i + 2];
1821 #endif
1822 assert(src1 == src2);
/* Convert a BGR32 line to 8-bit luma: Y = (RY*r + GY*g + BY*b + round)
 * >> RGB2YUV_SHIFT, with the channel weights and shift defined elsewhere
 * in this file's build. NOTE(review): the 33<<(RGB2YUV_SHIFT-1) term
 * looks like a combined rounding + 16-offset constant — confirm against
 * the RY/GY/BY definitions. */
1825 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1827 int i;
1828 for (i=0; i<width; i++)
1830 int b= ((uint32_t*)src)[i]&0xFF;
1831 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1832 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1834 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
/* Convert a BGR32 line to subsampled U/V: each output sample is computed
 * from the sum of two adjacent pixels (horizontal 2:1 chroma averaging,
 * hence the extra +1 in the final shift). The l/h masking keeps the
 * paired B+R byte sums separated in one int: b = low 10 bits (max 510),
 * r = bits 16+, g from the masked middle lane. */
1838 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1840 int i;
1841 assert(src1 == src2);
1842 for (i=0; i<width; i++)
1844 const int a= ((uint32_t*)src1)[2*i+0];
1845 const int e= ((uint32_t*)src1)[2*i+1];
1846 const int l= (a&0xFF00FF) + (e&0xFF00FF);
1847 const int h= (a&0x00FF00) + (e&0x00FF00);
1848 const int b= l&0x3FF;
1849 const int g= h>>8;
1850 const int r= l>>16;
1852 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1853 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
/* Convert a BGR24 line to 8-bit luma. The MMX path processes 8 pixels
 * per iteration: four pmaddwd dot products against bgr2YCoeff per half,
 * horizontal-summed via pmaddwd with w1111, then packed and offset by
 * bgr2YOffset. REG_d indexes the 3-byte-per-pixel source (3*REG_a).
 * C fallback matches bgr32ToY's formula on packed 24-bit input. */
1859 #ifdef HAVE_MMX
1860 asm volatile(
1861 "mov %2, %%"REG_a" \n\t"
1862 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1863 "movq "MANGLE(w1111)", %%mm5 \n\t"
1864 "pxor %%mm7, %%mm7 \n\t"
1865 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1866 ASMALIGN(4)
1867 "1: \n\t"
1868 PREFETCH" 64(%0, %%"REG_d") \n\t"
1869 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1870 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1871 "punpcklbw %%mm7, %%mm0 \n\t"
1872 "punpcklbw %%mm7, %%mm1 \n\t"
1873 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1874 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1875 "punpcklbw %%mm7, %%mm2 \n\t"
1876 "punpcklbw %%mm7, %%mm3 \n\t"
1877 "pmaddwd %%mm6, %%mm0 \n\t"
1878 "pmaddwd %%mm6, %%mm1 \n\t"
1879 "pmaddwd %%mm6, %%mm2 \n\t"
1880 "pmaddwd %%mm6, %%mm3 \n\t"
1881 #ifndef FAST_BGR2YV12
1882 "psrad $8, %%mm0 \n\t"
1883 "psrad $8, %%mm1 \n\t"
1884 "psrad $8, %%mm2 \n\t"
1885 "psrad $8, %%mm3 \n\t"
1886 #endif
1887 "packssdw %%mm1, %%mm0 \n\t"
1888 "packssdw %%mm3, %%mm2 \n\t"
1889 "pmaddwd %%mm5, %%mm0 \n\t"
1890 "pmaddwd %%mm5, %%mm2 \n\t"
1891 "packssdw %%mm2, %%mm0 \n\t"
1892 "psraw $7, %%mm0 \n\t"
1894 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1895 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1896 "punpcklbw %%mm7, %%mm4 \n\t"
1897 "punpcklbw %%mm7, %%mm1 \n\t"
1898 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1899 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1900 "punpcklbw %%mm7, %%mm2 \n\t"
1901 "punpcklbw %%mm7, %%mm3 \n\t"
1902 "pmaddwd %%mm6, %%mm4 \n\t"
1903 "pmaddwd %%mm6, %%mm1 \n\t"
1904 "pmaddwd %%mm6, %%mm2 \n\t"
1905 "pmaddwd %%mm6, %%mm3 \n\t"
1906 #ifndef FAST_BGR2YV12
1907 "psrad $8, %%mm4 \n\t"
1908 "psrad $8, %%mm1 \n\t"
1909 "psrad $8, %%mm2 \n\t"
1910 "psrad $8, %%mm3 \n\t"
1911 #endif
1912 "packssdw %%mm1, %%mm4 \n\t"
1913 "packssdw %%mm3, %%mm2 \n\t"
1914 "pmaddwd %%mm5, %%mm4 \n\t"
1915 "pmaddwd %%mm5, %%mm2 \n\t"
1916 "add $24, %%"REG_d" \n\t"
1917 "packssdw %%mm2, %%mm4 \n\t"
1918 "psraw $7, %%mm4 \n\t"
1920 "packuswb %%mm4, %%mm0 \n\t"
1921 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1923 "movq %%mm0, (%1, %%"REG_a") \n\t"
1924 "add $8, %%"REG_a" \n\t"
1925 " js 1b \n\t"
1926 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1927 : "%"REG_a, "%"REG_d
1929 #else
1930 int i;
1931 for (i=0; i<width; i++)
1933 int b= src[i*3+0];
1934 int g= src[i*3+1];
1935 int r= src[i*3+2];
1937 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1939 #endif /* HAVE_MMX */
/*
 * Convert one line of packed 24bit BGR to horizontally 2:1-subsampled 8bit
 * U and V planes: each output chroma sample is computed from two adjacent
 * input pixels (averaged in the MMX2/3DNow path, summed then shifted in the
 * plain-MMX path).
 * NOTE(review): code left byte-identical — the inline asm depends on exact
 * instruction ordering and register allocation.  The leading decimal on each
 * line is blob-scrape residue and brace-only lines were dropped by the
 * scraper; tokens are preserved verbatim.
 * NOTE(review): the trailing assert suggests src1 == src2 is a precondition
 * of the whole function — TODO confirm it belongs before the conversion.
 */
1942 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1944 #ifdef HAVE_MMX
1945     asm volatile(
1946     "mov %3, %%"REG_a" \n\t"
1947     "movq "MANGLE(w1111)", %%mm5 \n\t"
1948     "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1949     "pxor %%mm7, %%mm7 \n\t"
/* REG_d = 6*width = byte offset of the line end (3 bytes/pixel, 2 pixels per output sample). */
1950     "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1951     "add %%"REG_d", %%"REG_d" \n\t"
1952     ASMALIGN(4)
1953     "1: \n\t"
1954     PREFETCH" 64(%0, %%"REG_d") \n\t"
1955 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* pavgb-capable path: average two neighbouring pixels in one instruction. */
1956     "movq (%0, %%"REG_d"), %%mm0 \n\t"
1957     "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1958     "movq %%mm0, %%mm1 \n\t"
1959     "movq %%mm2, %%mm3 \n\t"
1960     "psrlq $24, %%mm0 \n\t"
1961     "psrlq $24, %%mm2 \n\t"
1962     PAVGB(%%mm1, %%mm0)
1963     PAVGB(%%mm3, %%mm2)
1964     "punpcklbw %%mm7, %%mm0 \n\t"
1965     "punpcklbw %%mm7, %%mm2 \n\t"
1966 #else
/* plain-MMX path: widen to 16bit, add the two pixels, then halve. */
1967     "movd (%0, %%"REG_d"), %%mm0 \n\t"
1968     "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1969     "punpcklbw %%mm7, %%mm0 \n\t"
1970     "punpcklbw %%mm7, %%mm2 \n\t"
1971     "paddw %%mm2, %%mm0 \n\t"
1972     "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1973     "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1974     "punpcklbw %%mm7, %%mm4 \n\t"
1975     "punpcklbw %%mm7, %%mm2 \n\t"
1976     "paddw %%mm4, %%mm2 \n\t"
1977     "psrlw $1, %%mm0 \n\t"
1978     "psrlw $1, %%mm2 \n\t"
1979 #endif
1980     "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1981     "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1983     "pmaddwd %%mm0, %%mm1 \n\t"
1984     "pmaddwd %%mm2, %%mm3 \n\t"
1985     "pmaddwd %%mm6, %%mm0 \n\t"
1986     "pmaddwd %%mm6, %%mm2 \n\t"
1987 #ifndef FAST_BGR2YV12
1988     "psrad $8, %%mm0 \n\t"
1989     "psrad $8, %%mm1 \n\t"
1990     "psrad $8, %%mm2 \n\t"
1991     "psrad $8, %%mm3 \n\t"
1992 #endif
1993     "packssdw %%mm2, %%mm0 \n\t"
1994     "packssdw %%mm3, %%mm1 \n\t"
1995     "pmaddwd %%mm5, %%mm0 \n\t"
1996     "pmaddwd %%mm5, %%mm1 \n\t"
1997     "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1998     "psraw $7, %%mm0 \n\t"
2000 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2001     "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2002     "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2003     "movq %%mm4, %%mm1 \n\t"
2004     "movq %%mm2, %%mm3 \n\t"
2005     "psrlq $24, %%mm4 \n\t"
2006     "psrlq $24, %%mm2 \n\t"
2007     PAVGB(%%mm1, %%mm4)
2008     PAVGB(%%mm3, %%mm2)
2009     "punpcklbw %%mm7, %%mm4 \n\t"
2010     "punpcklbw %%mm7, %%mm2 \n\t"
2011 #else
2012     "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2013     "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2014     "punpcklbw %%mm7, %%mm4 \n\t"
2015     "punpcklbw %%mm7, %%mm2 \n\t"
2016     "paddw %%mm2, %%mm4 \n\t"
2017     "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2018     "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2019     "punpcklbw %%mm7, %%mm5 \n\t"
2020     "punpcklbw %%mm7, %%mm2 \n\t"
2021     "paddw %%mm5, %%mm2 \n\t"
/* mm5 was clobbered above, reload the 1,1,1,1 constant. */
2022     "movq "MANGLE(w1111)", %%mm5 \n\t"
2023     "psrlw $2, %%mm4 \n\t"
2024     "psrlw $2, %%mm2 \n\t"
2025 #endif
2026     "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2027     "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2029     "pmaddwd %%mm4, %%mm1 \n\t"
2030     "pmaddwd %%mm2, %%mm3 \n\t"
2031     "pmaddwd %%mm6, %%mm4 \n\t"
2032     "pmaddwd %%mm6, %%mm2 \n\t"
2033 #ifndef FAST_BGR2YV12
2034     "psrad $8, %%mm4 \n\t"
2035     "psrad $8, %%mm1 \n\t"
2036     "psrad $8, %%mm2 \n\t"
2037     "psrad $8, %%mm3 \n\t"
2038 #endif
2039     "packssdw %%mm2, %%mm4 \n\t"
2040     "packssdw %%mm3, %%mm1 \n\t"
2041     "pmaddwd %%mm5, %%mm4 \n\t"
2042     "pmaddwd %%mm5, %%mm1 \n\t"
2043     "add $24, %%"REG_d" \n\t"
2044     "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2045     "psraw $7, %%mm4 \n\t"
2047     "movq %%mm0, %%mm1 \n\t"
2048     "punpckldq %%mm4, %%mm0 \n\t"
2049     "punpckhdq %%mm4, %%mm1 \n\t"
2050     "packsswb %%mm1, %%mm0 \n\t"
2051     "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2053     "movd %%mm0, (%1, %%"REG_a") \n\t"
2054     "punpckhdq %%mm0, %%mm0 \n\t"
2055     "movd %%mm0, (%2, %%"REG_a") \n\t"
2056     "add $4, %%"REG_a" \n\t"
2057     " js 1b \n\t"
2058     : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
2059     : "%"REG_a, "%"REG_d
2061 #else
/* Portable C fallback: sum two adjacent pixels, >>(SHIFT+1) averages them. */
2062     int i;
2063     for (i=0; i<width; i++)
2065     int b= src1[6*i + 0] + src1[6*i + 3];
2066     int g= src1[6*i + 1] + src1[6*i + 4];
2067     int r= src1[6*i + 2] + src1[6*i + 5];
2069     dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2070     dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2072 #endif /* HAVE_MMX */
2073     assert(src1 == src2);
2076 static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2078 int i;
2079 for (i=0; i<width; i++)
2081 int d= ((uint16_t*)src)[i];
2082 int b= d&0x1F;
2083 int g= (d>>5)&0x3F;
2084 int r= (d>>11)&0x1F;
2086 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2090 static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2092 int i;
2093 assert(src1==src2);
2094 for (i=0; i<width; i++)
2096 int d0= ((uint32_t*)src1)[i];
2098 int dl= (d0&0x07E0F81F);
2099 int dh= ((d0>>5)&0x07C0F83F);
2101 int dh2= (dh>>11) + (dh<<21);
2102 int d= dh2 + dl;
2104 int b= d&0x7F;
2105 int r= (d>>11)&0x7F;
2106 int g= d>>21;
2107 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2108 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2112 static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2114 int i;
2115 for (i=0; i<width; i++)
2117 int d= ((uint16_t*)src)[i];
2118 int b= d&0x1F;
2119 int g= (d>>5)&0x1F;
2120 int r= (d>>10)&0x1F;
2122 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2126 static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2128 int i;
2129 assert(src1==src2);
2130 for (i=0; i<width; i++)
2132 int d0= ((uint32_t*)src1)[i];
2134 int dl= (d0&0x03E07C1F);
2135 int dh= ((d0>>5)&0x03E0F81F);
2137 int dh2= (dh>>11) + (dh<<21);
2138 int d= dh2 + dl;
2140 int b= d&0x7F;
2141 int r= (d>>10)&0x7F;
2142 int g= d>>21;
2143 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2144 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2149 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2151 int i;
2152 for (i=0; i<width; i++)
2154 int r= ((uint32_t*)src)[i]&0xFF;
2155 int g= (((uint32_t*)src)[i]>>8)&0xFF;
2156 int b= (((uint32_t*)src)[i]>>16)&0xFF;
2158 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2162 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2164 int i;
2165 assert(src1==src2);
2166 for (i=0; i<width; i++)
2168 const int a= ((uint32_t*)src1)[2*i+0];
2169 const int e= ((uint32_t*)src1)[2*i+1];
2170 const int l= (a&0xFF00FF) + (e&0xFF00FF);
2171 const int h= (a&0x00FF00) + (e&0x00FF00);
2172 const int r= l&0x3FF;
2173 const int g= h>>8;
2174 const int b= l>>16;
2176 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2177 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2181 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2183 int i;
2184 for (i=0; i<width; i++)
2186 int r= src[i*3+0];
2187 int g= src[i*3+1];
2188 int b= src[i*3+2];
2190 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2194 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2196 int i;
2197 assert(src1==src2);
2198 for (i=0; i<width; i++)
2200 int r= src1[6*i + 0] + src1[6*i + 3];
2201 int g= src1[6*i + 1] + src1[6*i + 4];
2202 int b= src1[6*i + 2] + src1[6*i + 5];
2204 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2205 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2209 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2211 int i;
2212 for (i=0; i<width; i++)
2214 int d= ((uint16_t*)src)[i];
2215 int r= d&0x1F;
2216 int g= (d>>5)&0x3F;
2217 int b= (d>>11)&0x1F;
2219 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2223 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2225 int i;
2226 assert(src1 == src2);
2227 for (i=0; i<width; i++)
2229 int d0= ((uint32_t*)src1)[i];
2231 int dl= (d0&0x07E0F81F);
2232 int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F);
2234 int r= d&0x3F;
2235 int b= (d>>11)&0x3F;
2236 int g= d>>21;
2237 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2238 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2242 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2244 int i;
2245 for (i=0; i<width; i++)
2247 int d= ((uint16_t*)src)[i];
2248 int r= d&0x1F;
2249 int g= (d>>5)&0x1F;
2250 int b= (d>>10)&0x1F;
2252 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2256 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2258 int i;
2259 assert(src1 == src2);
2260 for (i=0; i<width; i++)
2262 int d0= ((uint32_t*)src1)[i];
2264 int dl= (d0&0x03E07C1F);
2265 int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F);
2267 int r= d&0x3F;
2268 int b= (d>>10)&0x3F;
2269 int g= d>>21;
2270 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2271 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2275 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2277 int i;
2278 for (i=0; i<width; i++)
2280 int d= src[i];
2282 dst[i]= pal[d] & 0xFF;
2286 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2288 int i;
2289 assert(src1 == src2);
2290 for (i=0; i<width; i++)
2292 int p= pal[src1[i]];
2294 dstU[i]= p>>8;
2295 dstV[i]= p>>16;
2299 // Bilinear / Bicubic scaling
/*
 * Generic horizontal FIR scaler: for each output sample i, convolve
 * filterSize input bytes starting at filterPos[i] with filter[i*filterSize..]
 * and store the clipped 15bit result in dst.  Three MMX variants (filterSize
 * 4, 8, generic) plus AltiVec and plain-C fallbacks.
 * NOTE(review): code left byte-identical — register pressure (REG_BP is
 * pushed/popped by hand) and the negative-counter loop idiom make a restyle
 * unsafe.  Leading decimals are blob-scrape residue; brace-only lines were
 * dropped by the scraper.
 */
2300 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2301 int16_t *filter, int16_t *filterPos, long filterSize)
2303 #ifdef HAVE_MMX
2304     assert(filterSize % 4 == 0 && filterSize>0);
2305     if (filterSize==4) // Always true for upscaling, sometimes for down, too.
/* counter runs from -2*dstW up to 0; pointers are pre-biased so the loop can
   index with the (negative) counter and exit on the carry flag. */
2307     long counter= -2*dstW;
2308     filter-= counter*2;
2309     filterPos-= counter/2;
2310     dst-= counter/2;
2311     asm volatile(
2312 #if defined(PIC)
2313     "push %%"REG_b" \n\t"
2314 #endif
2315     "pxor %%mm7, %%mm7 \n\t"
2316     "movq "MANGLE(w02)", %%mm6 \n\t"
2317     "push %%"REG_BP" \n\t" // we use 7 regs here ...
2318     "mov %%"REG_a", %%"REG_BP" \n\t"
2319     ASMALIGN(4)
2320     "1: \n\t"
2321     "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2322     "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2323     "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2324     "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2325     "movd (%3, %%"REG_a"), %%mm0 \n\t"
2326     "movd (%3, %%"REG_b"), %%mm2 \n\t"
2327     "punpcklbw %%mm7, %%mm0 \n\t"
2328     "punpcklbw %%mm7, %%mm2 \n\t"
2329     "pmaddwd %%mm1, %%mm0 \n\t"
2330     "pmaddwd %%mm2, %%mm3 \n\t"
2331     "psrad $8, %%mm0 \n\t"
2332     "psrad $8, %%mm3 \n\t"
2333     "packssdw %%mm3, %%mm0 \n\t"
2334     "pmaddwd %%mm6, %%mm0 \n\t"
2335     "packssdw %%mm0, %%mm0 \n\t"
2336     "movd %%mm0, (%4, %%"REG_BP") \n\t"
2337     "add $4, %%"REG_BP" \n\t"
2338     " jnc 1b \n\t"
2340     "pop %%"REG_BP" \n\t"
2341 #if defined(PIC)
2342     "pop %%"REG_b" \n\t"
2343 #endif
2344     : "+a" (counter)
2345     : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2346 #if !defined(PIC)
2347     : "%"REG_b
2348 #endif
2351     else if (filterSize==8)
2353     long counter= -2*dstW;
2354     filter-= counter*4;
2355     filterPos-= counter/2;
2356     dst-= counter/2;
2357     asm volatile(
2358 #if defined(PIC)
2359     "push %%"REG_b" \n\t"
2360 #endif
2361     "pxor %%mm7, %%mm7 \n\t"
2362     "movq "MANGLE(w02)", %%mm6 \n\t"
2363     "push %%"REG_BP" \n\t" // we use 7 regs here ...
2364     "mov %%"REG_a", %%"REG_BP" \n\t"
2365     ASMALIGN(4)
2366     "1: \n\t"
2367     "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2368     "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2369     "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2370     "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2371     "movd (%3, %%"REG_a"), %%mm0 \n\t"
2372     "movd (%3, %%"REG_b"), %%mm2 \n\t"
2373     "punpcklbw %%mm7, %%mm0 \n\t"
2374     "punpcklbw %%mm7, %%mm2 \n\t"
2375     "pmaddwd %%mm1, %%mm0 \n\t"
2376     "pmaddwd %%mm2, %%mm3 \n\t"
2378     "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2379     "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2380     "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2381     "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2382     "punpcklbw %%mm7, %%mm4 \n\t"
2383     "punpcklbw %%mm7, %%mm2 \n\t"
2384     "pmaddwd %%mm1, %%mm4 \n\t"
2385     "pmaddwd %%mm2, %%mm5 \n\t"
2386     "paddd %%mm4, %%mm0 \n\t"
2387     "paddd %%mm5, %%mm3 \n\t"
2389     "psrad $8, %%mm0 \n\t"
2390     "psrad $8, %%mm3 \n\t"
2391     "packssdw %%mm3, %%mm0 \n\t"
2392     "pmaddwd %%mm6, %%mm0 \n\t"
2393     "packssdw %%mm0, %%mm0 \n\t"
2394     "movd %%mm0, (%4, %%"REG_BP") \n\t"
2395     "add $4, %%"REG_BP" \n\t"
2396     " jnc 1b \n\t"
2398     "pop %%"REG_BP" \n\t"
2399 #if defined(PIC)
2400     "pop %%"REG_b" \n\t"
2401 #endif
2402     : "+a" (counter)
2403     : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2404 #if !defined(PIC)
2405     : "%"REG_b
2406 #endif
2409     else
/* generic filterSize: inner loop 2 walks the filter taps 4 at a time. */
2411     uint8_t *offset = src+filterSize;
2412     long counter= -2*dstW;
2413     //filter-= counter*filterSize/2;
2414     filterPos-= counter/2;
2415     dst-= counter/2;
2416     asm volatile(
2417     "pxor %%mm7, %%mm7 \n\t"
2418     "movq "MANGLE(w02)", %%mm6 \n\t"
2419     ASMALIGN(4)
2420     "1: \n\t"
2421     "mov %2, %%"REG_c" \n\t"
2422     "movzwl (%%"REG_c", %0), %%eax \n\t"
2423     "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2424     "mov %5, %%"REG_c" \n\t"
2425     "pxor %%mm4, %%mm4 \n\t"
2426     "pxor %%mm5, %%mm5 \n\t"
2427     "2: \n\t"
2428     "movq (%1), %%mm1 \n\t"
2429     "movq (%1, %6), %%mm3 \n\t"
2430     "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2431     "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2432     "punpcklbw %%mm7, %%mm0 \n\t"
2433     "punpcklbw %%mm7, %%mm2 \n\t"
2434     "pmaddwd %%mm1, %%mm0 \n\t"
2435     "pmaddwd %%mm2, %%mm3 \n\t"
2436     "paddd %%mm3, %%mm5 \n\t"
2437     "paddd %%mm0, %%mm4 \n\t"
2438     "add $8, %1 \n\t"
2439     "add $4, %%"REG_c" \n\t"
2440     "cmp %4, %%"REG_c" \n\t"
2441     " jb 2b \n\t"
2442     "add %6, %1 \n\t"
2443     "psrad $8, %%mm4 \n\t"
2444     "psrad $8, %%mm5 \n\t"
2445     "packssdw %%mm5, %%mm4 \n\t"
2446     "pmaddwd %%mm6, %%mm4 \n\t"
2447     "packssdw %%mm4, %%mm4 \n\t"
2448     "mov %3, %%"REG_a" \n\t"
2449     "movd %%mm4, (%%"REG_a", %0) \n\t"
2450     "add $4, %0 \n\t"
2451     " jnc 1b \n\t"
2453     : "+r" (counter), "+r" (filter)
2454     : "m" (filterPos), "m" (dst), "m"(offset),
2455     "m" (src), "r" (filterSize*2)
2456     : "%"REG_a, "%"REG_c, "%"REG_d
2459 #else
2460 #ifdef HAVE_ALTIVEC
2461     hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2462 #else
/* Reference C implementation of the same convolution. */
2463     int i;
2464     for (i=0; i<dstW; i++)
2466     int j;
2467     int srcPos= filterPos[i];
2468     int val=0;
2469     //printf("filterPos: %d\n", filterPos[i]);
2470     for (j=0; j<filterSize; j++)
2472     //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2473     val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2475     //filter += hFilterSize;
2476     dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2477     //dst[i] = val>>7;
2479 #endif /* HAVE_ALTIVEC */
2480 #endif /* HAVE_MMX */
2482 // *** horizontal scale Y line to temp buffer
/*
 * Horizontally scale one luma line into the 16bit temp buffer dst.
 * First an optional packed->gray conversion into formatConvBuffer, then
 * either the generic hScale FIR path or the fast-bilinear path (MMX2
 * "funny code" generated at runtime, an x86 asm loop, or plain C).
 * NOTE(review): left byte-identical — the runtime-generated-code trampoline
 * and manual ebx save/restore are too order-sensitive to restyle.  Leading
 * decimals are blob-scrape residue; brace-only lines were dropped by the
 * scraper.
 */
2483 static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2484 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2485 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2486 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2487 int32_t *mmx2FilterPos, uint8_t *pal)
/* Packed/paletted inputs are first converted to an 8bit gray line. */
2489     if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2491     RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2492     src= formatConvBuffer;
2494     else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2496     RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2497     src= formatConvBuffer;
2499     else if (srcFormat==PIX_FMT_RGB32)
2501     RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2502     src= formatConvBuffer;
2504     else if (srcFormat==PIX_FMT_BGR24)
2506     RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2507     src= formatConvBuffer;
2509     else if (srcFormat==PIX_FMT_BGR565)
2511     RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2512     src= formatConvBuffer;
2514     else if (srcFormat==PIX_FMT_BGR555)
2516     RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2517     src= formatConvBuffer;
2519     else if (srcFormat==PIX_FMT_BGR32)
2521     RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2522     src= formatConvBuffer;
2524     else if (srcFormat==PIX_FMT_RGB24)
2526     RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2527     src= formatConvBuffer;
2529     else if (srcFormat==PIX_FMT_RGB565)
2531     RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2532     src= formatConvBuffer;
2534     else if (srcFormat==PIX_FMT_RGB555)
2536     RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2537     src= formatConvBuffer;
2539     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2541     RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2542     src= formatConvBuffer;
2545 #ifdef HAVE_MMX
2546     // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2547     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2548 #else
2549     if (!(flags&SWS_FAST_BILINEAR))
2550 #endif
2552     RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2554     else // Fast Bilinear upscale / crap downscale
2556 #if defined(ARCH_X86)
2557 #ifdef HAVE_MMX2
2558     int i;
2559 #if defined(PIC)
2560     uint64_t ebxsave __attribute__((aligned(8)));
2561 #endif
2562     if (canMMX2BeUsed)
2564     asm volatile(
2565 #if defined(PIC)
2566     "mov %%"REG_b", %5 \n\t"
2567 #endif
2568     "pxor %%mm7, %%mm7 \n\t"
2569     "mov %0, %%"REG_c" \n\t"
2570     "mov %1, %%"REG_D" \n\t"
2571     "mov %2, %%"REG_d" \n\t"
2572     "mov %3, %%"REG_b" \n\t"
2573     "xor %%"REG_a", %%"REG_a" \n\t" // i
2574     PREFETCH" (%%"REG_c") \n\t"
2575     PREFETCH" 32(%%"REG_c") \n\t"
2576     PREFETCH" 64(%%"REG_c") \n\t"
2578 #ifdef ARCH_X86_64
2580 #define FUNNY_Y_CODE \
2581     "movl (%%"REG_b"), %%esi \n\t"\
2582     "call *%4 \n\t"\
2583     "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2584     "add %%"REG_S", %%"REG_c" \n\t"\
2585     "add %%"REG_a", %%"REG_D" \n\t"\
2586     "xor %%"REG_a", %%"REG_a" \n\t"\
2588 #else
2590 #define FUNNY_Y_CODE \
2591     "movl (%%"REG_b"), %%esi \n\t"\
2592     "call *%4 \n\t"\
2593     "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2594     "add %%"REG_a", %%"REG_D" \n\t"\
2595     "xor %%"REG_a", %%"REG_a" \n\t"\
2597 #endif /* ARCH_X86_64 */
/* eight calls into the runtime-generated scaler (funnyYCode). */
2599 FUNNY_Y_CODE
2600 FUNNY_Y_CODE
2601 FUNNY_Y_CODE
2602 FUNNY_Y_CODE
2603 FUNNY_Y_CODE
2604 FUNNY_Y_CODE
2605 FUNNY_Y_CODE
2606 FUNNY_Y_CODE
2608 #if defined(PIC)
2609     "mov %5, %%"REG_b" \n\t"
2610 #endif
2611     :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2612     "m" (funnyYCode)
2613 #if defined(PIC)
2614     ,"m" (ebxsave)
2615 #endif
2616     : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2617 #if !defined(PIC)
2618     ,"%"REG_b
2619 #endif
/* fix the right edge: replicate the last source pixel. */
2621     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2623     else
2625 #endif /* HAVE_MMX2 */
2626     long xInc_shr16 = xInc >> 16;
2627     uint16_t xInc_mask = xInc & 0xffff;
2628     //NO MMX just normal asm ...
2629     asm volatile(
2630     "xor %%"REG_a", %%"REG_a" \n\t" // i
2631     "xor %%"REG_d", %%"REG_d" \n\t" // xx
2632     "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2633     ASMALIGN(4)
2634     "1: \n\t"
2635     "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2636     "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2637     "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2638     "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2639     "shll $16, %%edi \n\t"
2640     "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2641     "mov %1, %%"REG_D" \n\t"
2642     "shrl $9, %%esi \n\t"
2643     "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2644     "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2645     "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2647     "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2648     "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2649     "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2650     "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2651     "shll $16, %%edi \n\t"
2652     "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2653     "mov %1, %%"REG_D" \n\t"
2654     "shrl $9, %%esi \n\t"
2655     "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2656     "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2657     "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2660     "add $2, %%"REG_a" \n\t"
2661     "cmp %2, %%"REG_a" \n\t"
2662     " jb 1b \n\t"
2665     :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2666     : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2668 #ifdef HAVE_MMX2
2669     } //if MMX2 can't be used
2670 #endif
2671 #else
/* Portable C fast-bilinear: 16.16 fixed-point position stepping. */
2672     int i;
2673     unsigned int xpos=0;
2674     for (i=0;i<dstWidth;i++)
2676     register unsigned int xx=xpos>>16;
2677     register unsigned int xalpha=(xpos&0xFFFF)>>9;
2678     dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2679     xpos+=xInc;
2681 #endif /* defined(ARCH_X86) */
/*
 * Horizontally scale one chroma line pair (U at dst, V at dst+2048) into
 * the 16bit temp buffer.  Mirrors hyscale: optional packed->UV conversion
 * into formatConvBuffer / formatConvBuffer+2048, then either the generic
 * hScale FIR path (run once per plane) or the fast-bilinear path.
 * NOTE(review): left byte-identical — same order-sensitive runtime-generated
 * code trampoline as hyscale.  Leading decimals are blob-scrape residue;
 * brace-only lines were dropped by the scraper.
 */
2685 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2686 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2687 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2688 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2689 int32_t *mmx2FilterPos, uint8_t *pal)
2691     if (srcFormat==PIX_FMT_YUYV422)
2693     RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2694     src1= formatConvBuffer;
2695     src2= formatConvBuffer+2048;
2697     else if (srcFormat==PIX_FMT_UYVY422)
2699     RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2700     src1= formatConvBuffer;
2701     src2= formatConvBuffer+2048;
2703     else if (srcFormat==PIX_FMT_RGB32)
2705     RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2706     src1= formatConvBuffer;
2707     src2= formatConvBuffer+2048;
2709     else if (srcFormat==PIX_FMT_BGR24)
2711     RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2712     src1= formatConvBuffer;
2713     src2= formatConvBuffer+2048;
2715     else if (srcFormat==PIX_FMT_BGR565)
2717     RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2718     src1= formatConvBuffer;
2719     src2= formatConvBuffer+2048;
2721     else if (srcFormat==PIX_FMT_BGR555)
2723     RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2724     src1= formatConvBuffer;
2725     src2= formatConvBuffer+2048;
2727     else if (srcFormat==PIX_FMT_BGR32)
2729     RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2730     src1= formatConvBuffer;
2731     src2= formatConvBuffer+2048;
2733     else if (srcFormat==PIX_FMT_RGB24)
2735     RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2736     src1= formatConvBuffer;
2737     src2= formatConvBuffer+2048;
2739     else if (srcFormat==PIX_FMT_RGB565)
2741     RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2742     src1= formatConvBuffer;
2743     src2= formatConvBuffer+2048;
2745     else if (srcFormat==PIX_FMT_RGB555)
2747     RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2748     src1= formatConvBuffer;
2749     src2= formatConvBuffer+2048;
2751     else if (isGray(srcFormat))
/* gray formats have no chroma to scale. */
2753     return;
2755     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2757     RENAME(palToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW, pal);
2758     src1= formatConvBuffer;
2759     src2= formatConvBuffer+2048;
2762 #ifdef HAVE_MMX
2763     // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2764     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2765 #else
2766     if (!(flags&SWS_FAST_BILINEAR))
2767 #endif
2769     RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2770     RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2772     else // Fast Bilinear upscale / crap downscale
2774 #if defined(ARCH_X86)
2775 #ifdef HAVE_MMX2
2776     int i;
2777 #if defined(PIC)
2778     uint64_t ebxsave __attribute__((aligned(8)));
2779 #endif
2780     if (canMMX2BeUsed)
2782     asm volatile(
2783 #if defined(PIC)
2784     "mov %%"REG_b", %6 \n\t"
2785 #endif
2786     "pxor %%mm7, %%mm7 \n\t"
2787     "mov %0, %%"REG_c" \n\t"
2788     "mov %1, %%"REG_D" \n\t"
2789     "mov %2, %%"REG_d" \n\t"
2790     "mov %3, %%"REG_b" \n\t"
2791     "xor %%"REG_a", %%"REG_a" \n\t" // i
2792     PREFETCH" (%%"REG_c") \n\t"
2793     PREFETCH" 32(%%"REG_c") \n\t"
2794     PREFETCH" 64(%%"REG_c") \n\t"
2796 #ifdef ARCH_X86_64
2798 #define FUNNY_UV_CODE \
2799     "movl (%%"REG_b"), %%esi \n\t"\
2800     "call *%4 \n\t"\
2801     "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2802     "add %%"REG_S", %%"REG_c" \n\t"\
2803     "add %%"REG_a", %%"REG_D" \n\t"\
2804     "xor %%"REG_a", %%"REG_a" \n\t"\
2806 #else
2808 #define FUNNY_UV_CODE \
2809     "movl (%%"REG_b"), %%esi \n\t"\
2810     "call *%4 \n\t"\
2811     "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2812     "add %%"REG_a", %%"REG_D" \n\t"\
2813     "xor %%"REG_a", %%"REG_a" \n\t"\
2815 #endif /* ARCH_X86_64 */
/* first plane (U): four calls into the runtime-generated scaler. */
2817 FUNNY_UV_CODE
2818 FUNNY_UV_CODE
2819 FUNNY_UV_CODE
2820 FUNNY_UV_CODE
2821     "xor %%"REG_a", %%"REG_a" \n\t" // i
2822     "mov %5, %%"REG_c" \n\t" // src
2823     "mov %1, %%"REG_D" \n\t" // buf1
2824     "add $4096, %%"REG_D" \n\t"
2825     PREFETCH" (%%"REG_c") \n\t"
2826     PREFETCH" 32(%%"REG_c") \n\t"
2827     PREFETCH" 64(%%"REG_c") \n\t"
/* second plane (V), output 4096 bytes (2048 uint16) further on. */
2829 FUNNY_UV_CODE
2830 FUNNY_UV_CODE
2831 FUNNY_UV_CODE
2832 FUNNY_UV_CODE
2834 #if defined(PIC)
2835     "mov %6, %%"REG_b" \n\t"
2836 #endif
2837     :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2838     "m" (funnyUVCode), "m" (src2)
2839 #if defined(PIC)
2840     ,"m" (ebxsave)
2841 #endif
2842     : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2843 #if !defined(PIC)
2844     ,"%"REG_b
2845 #endif
2847     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2849     //printf("%d %d %d\n", dstWidth, i, srcW);
2850     dst[i] = src1[srcW-1]*128;
2851     dst[i+2048] = src2[srcW-1]*128;
2854     else
2856 #endif /* HAVE_MMX2 */
2857     long xInc_shr16 = (long) (xInc >> 16);
2858     uint16_t xInc_mask = xInc & 0xffff;
2859     asm volatile(
2860     "xor %%"REG_a", %%"REG_a" \n\t" // i
2861     "xor %%"REG_d", %%"REG_d" \n\t" // xx
2862     "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2863     ASMALIGN(4)
2864     "1: \n\t"
2865     "mov %0, %%"REG_S" \n\t"
2866     "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2867     "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2868     "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2869     "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2870     "shll $16, %%edi \n\t"
2871     "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2872     "mov %1, %%"REG_D" \n\t"
2873     "shrl $9, %%esi \n\t"
2874     "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2876     "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2877     "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2878     "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2879     "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2880     "shll $16, %%edi \n\t"
2881     "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2882     "mov %1, %%"REG_D" \n\t"
2883     "shrl $9, %%esi \n\t"
2884     "movw %%si, 4096(%%"REG_D", %%"REG_a", 2) \n\t"
2886     "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2887     "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2888     "add $1, %%"REG_a" \n\t"
2889     "cmp %2, %%"REG_a" \n\t"
2890     " jb 1b \n\t"
2892 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2893 which is needed to support GCC-4.0 */
2894 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2895     :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2896 #else
2897     :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2898 #endif
2899     "r" (src2)
2900     : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2902 #ifdef HAVE_MMX2
2903     } //if MMX2 can't be used
2904 #endif
2905 #else
/* Portable C fast-bilinear for both chroma planes. */
2906     int i;
2907     unsigned int xpos=0;
2908     for (i=0;i<dstWidth;i++)
2910     register unsigned int xx=xpos>>16;
2911     register unsigned int xalpha=(xpos&0xFFFF)>>9;
2912     dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2913     dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2914 /* slower
2915     dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2916     dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2918     xpos+=xInc;
2920 #endif /* defined(ARCH_X86) */
2924 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2925 int srcSliceH, uint8_t* dst[], int dstStride[]){
2927 /* load a few things into local vars to make the code more readable? and faster */
2928 const int srcW= c->srcW;
2929 const int dstW= c->dstW;
2930 const int dstH= c->dstH;
2931 const int chrDstW= c->chrDstW;
2932 const int chrSrcW= c->chrSrcW;
2933 const int lumXInc= c->lumXInc;
2934 const int chrXInc= c->chrXInc;
2935 const int dstFormat= c->dstFormat;
2936 const int srcFormat= c->srcFormat;
2937 const int flags= c->flags;
2938 const int canMMX2BeUsed= c->canMMX2BeUsed;
2939 int16_t *vLumFilterPos= c->vLumFilterPos;
2940 int16_t *vChrFilterPos= c->vChrFilterPos;
2941 int16_t *hLumFilterPos= c->hLumFilterPos;
2942 int16_t *hChrFilterPos= c->hChrFilterPos;
2943 int16_t *vLumFilter= c->vLumFilter;
2944 int16_t *vChrFilter= c->vChrFilter;
2945 int16_t *hLumFilter= c->hLumFilter;
2946 int16_t *hChrFilter= c->hChrFilter;
2947 int32_t *lumMmxFilter= c->lumMmxFilter;
2948 int32_t *chrMmxFilter= c->chrMmxFilter;
2949 const int vLumFilterSize= c->vLumFilterSize;
2950 const int vChrFilterSize= c->vChrFilterSize;
2951 const int hLumFilterSize= c->hLumFilterSize;
2952 const int hChrFilterSize= c->hChrFilterSize;
2953 int16_t **lumPixBuf= c->lumPixBuf;
2954 int16_t **chrPixBuf= c->chrPixBuf;
2955 const int vLumBufSize= c->vLumBufSize;
2956 const int vChrBufSize= c->vChrBufSize;
2957 uint8_t *funnyYCode= c->funnyYCode;
2958 uint8_t *funnyUVCode= c->funnyUVCode;
2959 uint8_t *formatConvBuffer= c->formatConvBuffer;
2960 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2961 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2962 int lastDstY;
2963 uint8_t *pal=NULL;
2965 /* vars whch will change and which we need to storw back in the context */
2966 int dstY= c->dstY;
2967 int lumBufIndex= c->lumBufIndex;
2968 int chrBufIndex= c->chrBufIndex;
2969 int lastInLumBuf= c->lastInLumBuf;
2970 int lastInChrBuf= c->lastInChrBuf;
2972 if (isPacked(c->srcFormat)){
2973 pal= src[1];
2974 src[0]=
2975 src[1]=
2976 src[2]= src[0];
2977 srcStride[0]=
2978 srcStride[1]=
2979 srcStride[2]= srcStride[0];
2981 srcStride[1]<<= c->vChrDrop;
2982 srcStride[2]<<= c->vChrDrop;
2984 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2985 // (int)dst[0], (int)dst[1], (int)dst[2]);
2987 #if 0 //self test FIXME move to a vfilter or something
2989 static volatile int i=0;
2990 i++;
2991 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2992 selfTest(src, srcStride, c->srcW, c->srcH);
2993 i--;
2995 #endif
2997 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2998 //dstStride[0],dstStride[1],dstStride[2]);
3000 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3002 static int firstTime=1; //FIXME move this into the context perhaps
3003 if (flags & SWS_PRINT_INFO && firstTime)
3005 av_log(c, AV_LOG_WARNING, "SwScaler: Warning: dstStride is not aligned!\n"
3006 "SwScaler: ->cannot do aligned memory acesses anymore\n");
3007 firstTime=0;
3011 /* Note the user might start scaling the picture in the middle so this will not get executed
3012 this is not really intended but works currently, so ppl might do it */
3013 if (srcSliceY ==0){
3014 lumBufIndex=0;
3015 chrBufIndex=0;
3016 dstY=0;
3017 lastInLumBuf= -1;
3018 lastInChrBuf= -1;
3021 lastDstY= dstY;
3023 for (;dstY < dstH; dstY++){
3024 unsigned char *dest =dst[0]+dstStride[0]*dstY;
3025 const int chrDstY= dstY>>c->chrDstVSubSample;
3026 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3027 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
3029 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3030 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3031 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3032 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3034 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3035 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3036 //handle holes (FAST_BILINEAR & weird filters)
3037 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3038 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3039 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3040 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
3041 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
3043 // Do we have enough lines in this slice to output the dstY line
3044 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3046 //Do horizontal scaling
3047 while(lastInLumBuf < lastLumSrcY)
3049 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3050 lumBufIndex++;
3051 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
3052 ASSERT(lumBufIndex < 2*vLumBufSize)
3053 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3054 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3055 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3056 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3057 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3058 funnyYCode, c->srcFormat, formatConvBuffer,
3059 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3060 lastInLumBuf++;
3062 while(lastInChrBuf < lastChrSrcY)
3064 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3065 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3066 chrBufIndex++;
3067 ASSERT(chrBufIndex < 2*vChrBufSize)
3068 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
3069 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3070 //FIXME replace parameters through context struct (some at least)
3072 if (!(isGray(srcFormat) || isGray(dstFormat)))
3073 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3074 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3075 funnyUVCode, c->srcFormat, formatConvBuffer,
3076 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3077 lastInChrBuf++;
3079 //wrap buf index around to stay inside the ring buffer
3080 if (lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
3081 if (chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
3083 else // not enough lines left in this slice -> load the rest in the buffer
3085 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3086 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3087 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3088 vChrBufSize, vLumBufSize);*/
3090 //Do horizontal scaling
3091 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3093 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3094 lumBufIndex++;
3095 ASSERT(lumBufIndex < 2*vLumBufSize)
3096 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3097 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3098 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3099 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3100 funnyYCode, c->srcFormat, formatConvBuffer,
3101 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3102 lastInLumBuf++;
3104 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3106 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3107 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3108 chrBufIndex++;
3109 ASSERT(chrBufIndex < 2*vChrBufSize)
3110 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
3111 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3113 if (!(isGray(srcFormat) || isGray(dstFormat)))
3114 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3115 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3116 funnyUVCode, c->srcFormat, formatConvBuffer,
3117 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3118 lastInChrBuf++;
3120 //wrap buf index around to stay inside the ring buffer
3121 if (lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
3122 if (chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
3123 break; //we can't output a dstY line so let's try with the next slice
3126 #ifdef HAVE_MMX
3127 b5Dither= dither8[dstY&1];
3128 g6Dither= dither4[dstY&1];
3129 g5Dither= dither8[dstY&1];
3130 r5Dither= dither8[(dstY+1)&1];
3131 #endif
3132 if (dstY < dstH-2)
3134 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3135 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3136 #ifdef HAVE_MMX
3137 int i;
3138 if (flags & SWS_ACCURATE_RND){
3139 for (i=0; i<vLumFilterSize; i+=2){
3140 lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i ];
3141 lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
3142 lumMmxFilter[2*i+2]=
3143 lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ]
3144 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3146 for (i=0; i<vChrFilterSize; i+=2){
3147 chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i ];
3148 chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
3149 chrMmxFilter[2*i+2]=
3150 chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ]
3151 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3153 }else{
3154 for (i=0; i<vLumFilterSize; i++)
3156 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3157 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3158 lumMmxFilter[4*i+2]=
3159 lumMmxFilter[4*i+3]=
3160 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3162 for (i=0; i<vChrFilterSize; i++)
3164 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3165 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3166 chrMmxFilter[4*i+2]=
3167 chrMmxFilter[4*i+3]=
3168 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3171 #endif
3172 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3173 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3174 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3175 RENAME(yuv2nv12X)(c,
3176 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3177 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3178 dest, uDest, dstW, chrDstW, dstFormat);
3180 else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3182 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3183 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3184 if (vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
3186 int16_t *lumBuf = lumPixBuf[0];
3187 int16_t *chrBuf= chrPixBuf[0];
3188 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3190 else //General YV12
3192 RENAME(yuv2yuvX)(c,
3193 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3194 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3195 dest, uDest, vDest, dstW, chrDstW);
3198 else
3200 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3201 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3202 if (vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
3204 int chrAlpha= vChrFilter[2*dstY+1];
3205 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3206 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3208 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
3210 int lumAlpha= vLumFilter[2*dstY+1];
3211 int chrAlpha= vChrFilter[2*dstY+1];
3212 lumMmxFilter[2]=
3213 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3214 chrMmxFilter[2]=
3215 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3216 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3217 dest, dstW, lumAlpha, chrAlpha, dstY);
3219 else //General RGB
3221 RENAME(yuv2packedX)(c,
3222 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3223 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3224 dest, dstW, dstY);
3228 else // hmm looks like we can't use MMX here without overwriting this array's tail
3230 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3231 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3232 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3233 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3234 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3235 yuv2nv12XinC(
3236 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3237 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3238 dest, uDest, dstW, chrDstW, dstFormat);
3240 else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
3242 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3243 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3244 yuv2yuvXinC(
3245 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3246 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3247 dest, uDest, vDest, dstW, chrDstW);
3249 else
3251 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3252 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3253 yuv2packedXinC(c,
3254 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3255 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3256 dest, dstW, dstY);
3261 #ifdef HAVE_MMX
3262 __asm __volatile(SFENCE:::"memory");
3263 __asm __volatile(EMMS:::"memory");
3264 #endif
3265 /* store changed local vars back in the context */
3266 c->dstY= dstY;
3267 c->lumBufIndex= lumBufIndex;
3268 c->chrBufIndex= chrBufIndex;
3269 c->lastInLumBuf= lastInLumBuf;
3270 c->lastInChrBuf= lastInChrBuf;
3272 return dstY - lastDstY;